From b0ad64831b8fdafee3d466fa4193b720c742185c Mon Sep 17 00:00:00 2001 From: Matjaz Domen Pecan Date: Tue, 7 Apr 2026 08:24:22 +0100 Subject: [PATCH] feat: decompose parameter expansions into Word.parts Parameter expansions ($var, ${var}, ${var:-default}, ${#var}, ${!var}) were tracked as spans during lexing but not decomposed into AST nodes in Word.parts. Downstream consumers walking Word.parts could detect CommandSubstitution and AnsiCQuote but completely missed parameter expansions, forcing fragile string-matching fallbacks. Add ParamExpansion(String) and SimpleVar(String) variants to WordSegment. Refactor segment building into build_segments() with a configurable filter so sexp output (segments_from_spans) is unchanged while word.parts decomposition (segments_with_params) includes parameter expansions. The param expansion parser handles: - Simple variables: $var, $?, $1 - Braced expansions: ${var}, ${var:-default}, ${var##pattern} - Length: ${#var} - Indirect: ${!var}, ${!var:-default} - Array subscripts: ${arr[@]} - All standard bash operators (22 operators, longest-match-first) All 1604+ Parable tests and oracle tests pass unchanged. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/format/mod.rs | 3 + src/parser/word_parts.rs | 307 ++++++++++++++++++++++++++++++++++++++- src/sexp/mod.rs | 3 + src/sexp/word.rs | 58 +++++++- 4 files changed, 359 insertions(+), 12 deletions(-) diff --git a/src/format/mod.rs b/src/format/mod.rs index 360701e..06be22a 100644 --- a/src/format/mod.rs +++ b/src/format/mod.rs @@ -605,6 +605,9 @@ fn process_word_value(value: &str, spans: &[crate::lexer::word_builder::WordSpan } result.push(')'); } + WordSegment::ParamExpansion(text) | WordSegment::SimpleVar(text) => { + result.push_str(text); + } } } result diff --git a/src/parser/word_parts.rs b/src/parser/word_parts.rs index ab7b494..ba438ce 100644 --- a/src/parser/word_parts.rs +++ b/src/parser/word_parts.rs @@ -7,7 +7,7 @@ use std::cell::Cell; use crate::ast::{ListItem, Node, NodeKind, Span}; use crate::lexer::word_builder::WordSpan; -use crate::sexp::word::{WordSegment, segments_from_spans}; +use crate::sexp::word::{WordSegment, segments_with_params}; thread_local! { static DECOMPOSE_DEPTH: Cell = const { Cell::new(0) }; @@ -44,7 +44,7 @@ impl Drop for DepthGuard { /// Decomposes a word using lexer spans — the primary path for all /// token-derived words. pub(super) fn decompose_word_with_spans(value: &str, spans: &[WordSpan]) -> Vec { - let segments = segments_from_spans(value, spans); + let segments = segments_with_params(value, spans); segments.into_iter().map(segment_to_node).collect() } @@ -65,6 +65,8 @@ fn segment_to_node(seg: WordSegment) -> Node { WordSegment::ProcessSubstitution(direction, content) => { procsub_to_node(direction, &content) } + WordSegment::SimpleVar(text) => parse_simple_var(&text), + WordSegment::ParamExpansion(text) => parse_braced_param(&text), } } @@ -128,6 +130,157 @@ fn wrap_nodes(mut nodes: Vec) -> Node { } } +/// Parses `$var` — strip leading `$`, remainder is the param name. +fn parse_simple_var(text: &str) -> Node { + let param = text.get(1..).unwrap_or(""); + Node::empty(NodeKind::ParamExpansion { + param: param.to_string(), + op: None, + arg: None, + }) +} + +/// Parses `${...}` — strip `${` and `}`, dispatch on inner content. +fn parse_braced_param(text: &str) -> Node { + let inner = match text.get(2..text.len().saturating_sub(1)) { + Some(s) if !s.is_empty() => s, + _ => return literal_fallback(text), + }; + parse_param_inner(inner) +} + +/// Dispatches based on the first character of the inner `${...}` content. +fn parse_param_inner(inner: &str) -> Node { + let first = inner.as_bytes().first().copied(); + match first { + // ${#...} — length prefix, unless inner is just "#" (special param) + Some(b'#') if inner.len() > 1 => parse_length_prefix(&inner[1..]), + // ${!...} — indirect prefix, unless inner is just "!" (special param) + Some(b'!') if inner.len() > 1 => parse_indirect_prefix(&inner[1..]), + _ => parse_plain_param(inner), + } +} + +/// Parses `${#var}` — the content after `#` is the param name. +fn parse_length_prefix(after_hash: &str) -> Node { + let (param, _) = split_param_and_rest(after_hash); + Node::empty(NodeKind::ParamLength { + param: param.to_string(), + }) +} + +/// Parses `${!var}` or `${!var:-default}`. +fn parse_indirect_prefix(after_bang: &str) -> Node { + let (param, op, arg) = extract_param_op_arg(after_bang); + Node::empty(NodeKind::ParamIndirect { param, op, arg }) +} + +/// Parses `${var}` or `${var:-default}`. +fn parse_plain_param(inner: &str) -> Node { + let (param, op, arg) = extract_param_op_arg(inner); + Node::empty(NodeKind::ParamExpansion { param, op, arg }) +} + +/// Shared extraction: splits param name, operator, and argument. +fn extract_param_op_arg(s: &str) -> (String, Option, Option) { + let (param, rest) = split_param_and_rest(s); + let (op, arg) = parse_op_and_arg(rest); + (param.to_string(), op, arg) +} + +/// Splits a parameter name from the remainder. Handles identifiers, +/// special single-char params, digits, and array subscripts. +fn split_param_and_rest(s: &str) -> (&str, &str) { + if s.is_empty() { + return ("", ""); + } + let bytes = s.as_bytes(); + // Special single-char params: @, *, ?, -, $, !, # + if matches!(bytes[0], b'@' | b'*' | b'?' | b'-' | b'$' | b'!' | b'#') { + return (&s[..1], &s[1..]); + } + // Digit-only positional params (e.g., "0", "10", "100") + if bytes[0].is_ascii_digit() { + let end = bytes + .iter() + .position(|b| !b.is_ascii_digit()) + .unwrap_or(s.len()); + return maybe_with_subscript(s, end); + } + // Identifier: [a-zA-Z_][a-zA-Z0-9_]* + if bytes[0].is_ascii_alphabetic() || bytes[0] == b'_' { + let end = bytes + .iter() + .position(|b| !b.is_ascii_alphanumeric() && *b != b'_') + .unwrap_or(s.len()); + return maybe_with_subscript(s, end); + } + // Unknown — treat entire string as param + (s, "") +} + +/// If the character at `name_end` is `[`, extend to include the subscript. +fn maybe_with_subscript(s: &str, name_end: usize) -> (&str, &str) { + if s.as_bytes().get(name_end) == Some(&b'[') + && let Some(bracket_len) = find_matching_bracket(&s[name_end..]) + { + let end = name_end + bracket_len + 1; + return (&s[..end], &s[end..]); + } + (&s[..name_end], &s[name_end..]) +} + +/// Finds the matching `]` for a `[` at position 0, returning the index +/// of `]` relative to the input. Handles nested brackets. +fn find_matching_bracket(s: &str) -> Option { + let mut depth = 0i32; + for (i, ch) in s.char_indices() { + match ch { + '[' => depth += 1, + ']' => { + depth -= 1; + if depth == 0 { + return Some(i); + } + } + _ => {} + } + } + None +} + +/// Known parameter expansion operators, longest-match-first. +const OPERATORS: &[&str] = &[ + ":-", ":=", ":+", ":?", // colon-prefixed defaults + "-", "=", "+", "?", // plain defaults + "##", "#", // prefix removal + "%%", "%", // suffix removal + "//", "/#", "/%", "/", // substitution + "^^", "^", // uppercase + ",,", ",", // lowercase + "@", // transformation + ":", // substring +]; + +/// Extracts operator and argument from the remainder after the param name. +fn parse_op_and_arg(s: &str) -> (Option, Option) { + if s.is_empty() { + return (None, None); + } + for op in OPERATORS { + if let Some(arg) = s.strip_prefix(op) { + let arg_opt = if arg.is_empty() { + None + } else { + Some(arg.to_string()) + }; + return (Some((*op).to_string()), arg_opt); + } + } + // Unknown operator — treat entire remainder as op + (Some(s.to_string()), None) +} + #[cfg(test)] mod tests { use super::*; @@ -151,12 +304,158 @@ mod tests { } #[test] - fn simple_variable_stays_literal() { + fn simple_variable_expansion() { let parts = decompose("$foo"); assert_eq!(parts.len(), 1); assert!(matches!( &parts[0].kind, - NodeKind::WordLiteral { value } if value == "$foo" + NodeKind::ParamExpansion { param, op, arg } + if param == "foo" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn braced_variable_expansion() { + let parts = decompose("${foo}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "foo" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn param_with_default() { + let parts = decompose("${foo:-default}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "foo" + && op.as_deref() == Some(":-") + && arg.as_deref() == Some("default") + )); + } + + #[test] + fn param_length() { + let parts = decompose("${#foo}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamLength { param } if param == "foo" + )); + } + + #[test] + fn param_indirect() { + let parts = decompose("${!foo}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamIndirect { param, op, arg } + if param == "foo" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn special_param_question_mark() { + let parts = decompose("$?"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "?" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn positional_param() { + let parts = decompose("$1"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "1" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn multi_digit_positional() { + let parts = decompose("${10}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "10" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn prefix_removal_operator() { + let parts = decompose("${foo##pattern}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "foo" + && op.as_deref() == Some("##") + && arg.as_deref() == Some("pattern") + )); + } + + #[test] + fn special_param_hash_braced() { + // ${#} is the special param "#", NOT ParamLength + let parts = decompose("${#}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "#" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn mixed_text_and_param() { + let parts = decompose("hello${world}end"); + assert_eq!(parts.len(), 3); + assert!(matches!( + &parts[0].kind, + NodeKind::WordLiteral { value } if value == "hello" + )); + assert!(matches!( + &parts[1].kind, + NodeKind::ParamExpansion { param, .. } if param == "world" + )); + assert!(matches!( + &parts[2].kind, + NodeKind::WordLiteral { value } if value == "end" + )); + } + + #[test] + fn array_subscript() { + let parts = decompose("${arr[@]}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamExpansion { param, op, arg } + if param == "arr[@]" && op.is_none() && arg.is_none() + )); + } + + #[test] + fn indirect_with_operator() { + let parts = decompose("${!foo:-bar}"); + assert_eq!(parts.len(), 1); + assert!(matches!( + &parts[0].kind, + NodeKind::ParamIndirect { param, op, arg } + if param == "foo" + && op.as_deref() == Some(":-") + && arg.as_deref() == Some("bar") )); } diff --git a/src/sexp/mod.rs b/src/sexp/mod.rs index 5e8f785..5ad6077 100644 --- a/src/sexp/mod.rs +++ b/src/sexp/mod.rs @@ -735,6 +735,9 @@ fn write_redirect_segments( } write!(f, ")")?; } + word::WordSegment::ParamExpansion(text) | word::WordSegment::SimpleVar(text) => { + write!(f, "{text}")?; + } } } Ok(()) diff --git a/src/sexp/word.rs b/src/sexp/word.rs index e1fc31b..33bacb4 100644 --- a/src/sexp/word.rs +++ b/src/sexp/word.rs @@ -26,13 +26,19 @@ pub enum WordSegment { CommandSubstitution(String), /// Process substitution `>(...)` or `<(...)` — direction + content. ProcessSubstitution(char, String), + /// Parameter expansion `${...}` — raw text includes `${` and `}`. + ParamExpansion(String), + /// Simple variable `$var` — raw text includes `$` prefix. + SimpleVar(String), } /// Formats word segments into S-expression output. pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) -> fmt::Result { for seg in segments { match seg { - WordSegment::Literal(text) => { + WordSegment::Literal(text) + | WordSegment::ParamExpansion(text) + | WordSegment::SimpleVar(text) => { for ch in text.chars() { write_escaped_char(f, ch)?; } @@ -97,9 +103,26 @@ pub fn write_word_segments(f: &mut fmt::Formatter<'_>, segments: &[WordSegment]) pub fn segments_from_spans( value: &str, spans: &[crate::lexer::word_builder::WordSpan], +) -> Vec { + build_segments(value, spans, is_sexp_relevant) +} + +/// Like `segments_from_spans` but also decomposes parameter expansions +/// and simple variables into their own segments (for `Word.parts`). +pub fn segments_with_params( + value: &str, + spans: &[crate::lexer::word_builder::WordSpan], +) -> Vec { + build_segments(value, spans, is_decomposable) +} + +fn build_segments( + value: &str, + spans: &[crate::lexer::word_builder::WordSpan], + filter: fn(&crate::lexer::word_builder::WordSpanKind) -> bool, ) -> Vec { use crate::lexer::word_builder::{QuotingContext, WordSpanKind}; - let top_level = collect_top_level_sexp_spans(spans); + let top_level = collect_filtered_spans(spans, filter); let mut segments = Vec::new(); let mut pos = 0; for span in &top_level { @@ -137,7 +160,17 @@ pub fn segments_from_spans( } } } - _ => {} // filtered out by is_sexp_relevant + WordSpanKind::ParamExpansion => { + if let Some(text) = value.get(span.start..span.end) { + segments.push(WordSegment::ParamExpansion(text.to_string())); + } + } + WordSpanKind::SimpleVar => { + if let Some(text) = value.get(span.start..span.end) { + segments.push(WordSegment::SimpleVar(text.to_string())); + } + } + _ => {} // filtered out by span filter } pos = span.end; } @@ -204,13 +237,22 @@ const fn is_sexp_relevant(kind: &crate::lexer::word_builder::WordSpanKind) -> bo ) } -/// Collects top-level sexp-relevant spans sorted by start offset. -/// A span is top-level if no other sexp-relevant span fully contains it. -fn collect_top_level_sexp_spans( +/// Sexp-relevant spans plus parameter expansions and simple variables. +const fn is_decomposable(kind: &crate::lexer::word_builder::WordSpanKind) -> bool { + use crate::lexer::word_builder::WordSpanKind; + if is_sexp_relevant(kind) { + return true; + } + matches!(kind, WordSpanKind::ParamExpansion | WordSpanKind::SimpleVar) +} + +/// Collects top-level spans matching `filter`, sorted by start offset. +/// A span is top-level if no other matching span fully contains it. +fn collect_filtered_spans( spans: &[crate::lexer::word_builder::WordSpan], + filter: fn(&crate::lexer::word_builder::WordSpanKind) -> bool, ) -> Vec<&crate::lexer::word_builder::WordSpan> { - // Collect sexp-relevant spans sorted by start offset - let mut relevant: Vec<_> = spans.iter().filter(|s| is_sexp_relevant(&s.kind)).collect(); + let mut relevant: Vec<_> = spans.iter().filter(|s| filter(&s.kind)).collect(); relevant.sort_by_key(|s| s.start); // Single pass: skip spans nested inside a previously collected one let mut result = Vec::new();