dawnmy · dawnmy · Dec 18, 2025 · Oct 25, 2025 · Oct 25, 2025 · Oct 25, 2025
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "tsvkit"
-version = "0.9.5"
+version = "0.9.6"
 edition = "2024"
 
 [dependencies]

diff --git a/README.md b/README.md
@@ -34,8 +34,8 @@
 
 ### Key features
 - Stream-friendly processing; every command reads from files or standard input and writes to standard output.
-- Column selectors that accept names, 1-based indices, ranges, and multi-file specifications.
-- Expression language with arithmetic, comparisons, logical operators, regex matching, and numeric helper functions.
+- Column selectors that accept names, 1-based indices, ranges, regexes, and multi-file specifications.
+- Expression language with arithmetic, comparisons, logical operators, list membership, regex matching, and numeric helper functions.
 - Aggregations for grouped summaries (`summarize`) and row-wise calculations (`mutate`).
 - Excel tooling to inspect, preview, export, and assemble `.xlsx` workbooks.
 
@@ -120,14 +120,17 @@ Selectors are reused in `cut`, `filter`, `join`, `mutate`, `summarize`, and othe
 | `index` | 1-based column index. | `1,4,9` |
 | `-index` | Column counted from the end (1 = last). | `-1,-2` |
 | `start:end` | Inclusive range by name or index. Supports open ends. | `IL6:IL10`, `2:5`, `:IL10`, `IL6:` |
+| `~"regex"` | Columns whose names match the regular expression. Requires headers. | `~"^sample_"` |
 | `:` | Select every column in order. | `-f ':'` |
-| `mixed` | Combine names, indices, and ranges. | `sample_id,3:5,tech` |
+| `mixed` | Combine names, indices, ranges, and regexes. | `sample_id,3:5,~"_pct$"` |
 | `multi-file` | Separate selectors for each input with semicolons (primarily `join`). | `sample_id;subject_id` |
 | `range in expressions` | Prefixed with `$` to access a slice of values. | `$IL6:$IL10` |
 
 > Wrap selectors in backticks or braces to treat punctuation literally. For example, ``-f '`IL6:IL10`,`total,reads`'`` or `-f '{IL6:IL10},{total,reads}'` selects columns named `IL6:IL10` and `total,reads` instead of expanding a range or splitting on the comma.
 
-Negative indices are also valid inside ranges: `:-2` selects every column except the final two, while `-3:` keeps the last three columns.
+Negative indices are also valid inside ranges: `:-2` selects every column except the last one (ranges are inclusive, and `-2` is the second-to-last column), while `-3:` keeps the last three columns. Regex selectors deduplicate by first match; add `--allow-dups` (or `-D`) on `cut`/`summarize` when you need repeated columns.
+
+> Regex selectors require a header row. When `-H/--no-header` is active, using `~"..."` results in an error with guidance to remove the regex or restore headers.
 
 Anywhere you access column *values* inside an expression, prefix the selector with `$` (`$purity`, `$1`, `$IL6:$IL10`).
 
@@ -152,9 +155,13 @@ The same expression language powers `filter -e`, `mutate -e name=EXPR`, and rege
 | `!` / `not` | Logical negation. | Booleans |
 | `~` | Regex match. Right-hand side can be literal text or a `$range`. | Strings |
 | `!~` | Regex does *not* match. | Strings |
+| `in` | Membership test against a list literal or numeric range. | `$group in ["case","control"]` |
+| `!in` | Negated membership test. | `$status !in ["fail","missing"]` |
 
 > Reference columns whose names contain operators or punctuation with `${column-name}` inside expressions (e.g. `${dna-} - $rna_ug`). This prevents the parser from treating the characters as arithmetic.
 
+List literals use square brackets: `[1,2,3]`, `["case","control"]`, `[IL6:IL10]`. Combine them with `in`/`!in` to test membership, or pass them to helper functions that accept lists.
+
 **Numeric helper functions**
 
 | Function | Description |
@@ -228,13 +235,25 @@ Ranges expand consecutive columns automatically:
 tsvkit cut -f 'sample_id,IL6:IL10' examples/cytokines.tsv
 ```
 
+Regex selectors pick up columns whose headers match a pattern. Combine them with names, indices, and ranges in any order:
+
+```bash
+tsvkit cut -f '1,group,~"^IL",~"_pct$"' examples/qc.tsv
+```
+
+Matches deduplicate by default; add `-D/--allow-dups` to keep every occurrence when multiple selectors target the same column.
+
 ### `filter`
-Filter rows with boolean logic, arithmetic, column ranges, and regexes.
+Filter rows with boolean logic, arithmetic, column ranges, regexes, and list membership tests.
 
 ```bash
 tsvkit filter -e '$group == "case" & $purity >= 0.94' examples/samples.tsv
 ```
 
+```bash
+tsvkit filter -e '$status !in ["fail","missing","error"] & $tech ~ "sRNA"' examples/samples.tsv
+```
+
 **Expression building blocks for `filter`**
 
 | Building block | Examples | Notes |
@@ -248,6 +267,7 @@ tsvkit filter -e '$group == "case" & $purity >= 0.94' examples/samples.tsv
 | Row-wise aggregators | `sum($dna_ug:$rna_ug)`, `mode($1,$3)`, `countunique($gene:)` | Same catalog as [`summarize`](#summarize): totals, quantiles (`q*` / `p*`), variance/SD, products, entropy, argmin/argmax, membership stats. Works with ranges, lists, and open selectors. |
 | Regex match | `$tech ~ "sRNA"`, `$notes !~ "(?i)fail"` | Patterns follow Rust `regex` syntax. `(?i)` enables case-insensitive matching. |
 | Regex across ranges | `$gene:$notes ~ "kinase"`, `~ "control"` | When the left-hand side is omitted, `~` scans all columns. |
+| Membership | `$group in ["case","control"]`, `$rank in [1:3]` | Right-hand side must be a list literal or numeric range. |
 
 **Regex usage at a glance**
 
@@ -329,6 +349,12 @@ tsvkit summarize \
   examples/samples.tsv
 ```
 
+Regex selectors work here as well, so you can summarize whole families of columns in one shot:
+
+```bash
+tsvkit summarize -g patient -s '~"^sample_"=mean,sd' -D cohort.tsv
+```
+
 **Aggregators supported by `summarize`**
 
 _Counts & membership_

diff --git a/src/common.rs b/src/common.rs
@@ -1,10 +1,12 @@
+use std::collections::HashSet;
 use std::fs::File;
 use std::io::{self, BufReader};
 use std::path::Path;
 
 use anyhow::{Context, Result, anyhow, bail};
 use csv::ReaderBuilder;
 use flate2::read::MultiGzDecoder;
+use regex::Regex;
 use xz2::read::XzDecoder;
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -27,6 +29,7 @@ pub enum ColumnSelector {
     Index(usize),
     FromEnd(usize),
     Name(String),
+    Regex(String),
     Range(Option<Box<ColumnSelector>>, Option<Box<ColumnSelector>>),
     Special(SpecialColumn),
 }
@@ -99,7 +102,24 @@ pub fn resolve_selectors(
     selectors: &[ColumnSelector],
     no_header: bool,
 ) -> Result<Vec<usize>> {
-    let mut indices = Vec::with_capacity(selectors.len());
+    resolve_selectors_with_options(headers, selectors, no_header, false)
+}
+
+pub fn resolve_selectors_allow_duplicates(
+    headers: &[String],
+    selectors: &[ColumnSelector],
+    no_header: bool,
+) -> Result<Vec<usize>> {
+    resolve_selectors_with_options(headers, selectors, no_header, true) // allow_duplicates = true
+}
+
+fn resolve_selectors_with_options(
+    headers: &[String],
+    selectors: &[ColumnSelector],
+    no_header: bool,
+    allow_duplicates: bool,
+) -> Result<Vec<usize>> {
+    let mut indices = Vec::new();
     for selector in selectors {
         match selector {
             ColumnSelector::Special(special) => {
@@ -108,9 +128,13 @@ pub fn resolve_selectors(
                     special.default_header()
                 );
             }
-            ColumnSelector::Index(_) | ColumnSelector::FromEnd(_) | ColumnSelector::Name(_) => {
-                let index = resolve_selector_index(headers, selector, no_header)?;
-                indices.push(index);
+            ColumnSelector::Index(_)
+            | ColumnSelector::FromEnd(_)
+            | ColumnSelector::Name(_)
+            | ColumnSelector::Regex(_) => {
+                let mut resolved =
+                    resolve_selector_indices(headers, selector, no_header, allow_duplicates)?;
+                indices.append(&mut resolved);
             }
             ColumnSelector::Range(start, end) => {
                 if headers.is_empty() {
@@ -282,6 +306,9 @@ fn parse_simple_selector(token: &str) -> Result<ColumnSelector> {
     if token.is_empty() {
         return Err(anyhow!("empty column selector"));
     }
+    if let Some(regex) = parse_regex_literal(token)? {
+        return Ok(ColumnSelector::Regex(regex));
+    }
     if let Some(literal) = parse_backtick_literal(token)? {
         return Ok(ColumnSelector::Name(literal));
     }
@@ -314,6 +341,46 @@ fn parse_simple_selector(token: &str) -> Result<ColumnSelector> {
     Ok(ColumnSelector::Name(token.to_string()))
 }
 
+/// Parses a `~"pattern"` regex column selector.
+///
+/// Returns `Ok(None)` when `token` does not start with `~`. Inside the quotes, `\"` and `\\`
+/// unescape to `"` and `\`; every other backslash sequence (`\d`, `\.`) is kept whole, because
+/// dropping the backslash would silently turn it into a different pattern.
+///
+/// Errors on a missing or non-`"` opening quote, an unterminated literal, or trailing text.
+fn parse_regex_literal(token: &str) -> Result<Option<String>> {
+    let Some(rest) = token.trim().strip_prefix('~') else {
+        return Ok(None);
+    };
+    let mut chars = rest.trim_start().chars();
+    match chars.next() {
+        Some('"') => {
+            let mut value = String::new();
+            while let Some(ch) = chars.next() {
+                match ch {
+                    // `\"` and `\\` are the literal's own escapes; the rest belong to the regex.
+                    '\\' => match chars.next() {
+                        Some(next @ ('"' | '\\')) => value.push(next),
+                        Some(next) => value.extend(['\\', next]),
+                        // A lone trailing backslash means the closing quote never arrived.
+                        None => break,
+                    },
+                    // The closing quote has to be the last character of the token.
+                    '"' if chars.as_str().is_empty() => return Ok(Some(value)),
+                    '"' => bail!("unexpected trailing characters after regex selector literal"),
+                    other => value.push(other),
+                }
+            }
+            bail!("unterminated regex selector literal");
+        }
+        Some(other) => bail!(
+            "regex column selector must use double quotes (e.g. ~\"pattern\"), got '{}'",
+            other
+        ),
+        None => bail!("regex column selector requires a quoted pattern"),
+    }
+}
+
 fn parse_backtick_literal(token: &str) -> Result<Option<String>> {
     let trimmed = token.trim();
     if !trimmed.starts_with('`') {
@@ -521,6 +588,89 @@ fn tokenize_selector_spec(spec: &str) -> Result<Vec<SelectorToken>> {
     Ok(tokens)
 }
 
+/// Resolves one non-range selector to the zero-based indices of the columns it selects.
+///
+/// `Index` and `FromEnd` give one bounds-checked index. `Name` gives the first header equal to
+/// the name, or every equal header when `allow_duplicates` is set. `Regex` gives each matching
+/// header, skipping repeats of an already-matched header text unless `allow_duplicates` is set.
+///
+/// Errors on an out-of-range index, on `Name`/`Regex` when `no_header` is set, on an invalid
+/// pattern, when nothing matches, and on `Special`. Panics on `Range`, handled by the caller.
+fn resolve_selector_indices(
+    headers: &[String],
+    selector: &ColumnSelector,
+    no_header: bool,
+    allow_duplicates: bool,
+) -> Result<Vec<usize>> {
+    match selector {
+        ColumnSelector::Index(index) => {
+            if *index >= headers.len() {
+                bail!(
+                    "column index {} out of range ({} columns)",
+                    index + 1,
+                    headers.len()
+                );
+            }
+            Ok(vec![*index])
+        }
+        ColumnSelector::FromEnd(offset) => {
+            if *offset == 0 {
+                bail!("column selector '-0' is not allowed");
+            }
+            if *offset > headers.len() {
+                bail!(
+                    "column selector '-{}' out of range ({} columns)",
+                    offset,
+                    headers.len()
+                );
+            }
+            Ok(vec![headers.len() - offset])
+        }
+        ColumnSelector::Name(name) => {
+            if no_header {
+                bail!("column names cannot be used when input lacks a header row");
+            }
+            // Without `allow_duplicates`, only the first header carrying this name is taken.
+            let limit = if allow_duplicates { usize::MAX } else { 1 };
+            let matches: Vec<usize> = headers
+                .iter()
+                .enumerate()
+                .filter(|(_, header)| *header == name)
+                .map(|(idx, _)| idx)
+                .take(limit)
+                .collect();
+            if matches.is_empty() {
+                bail!("column '{}' not found", name);
+            }
+            Ok(matches)
+        }
+        ColumnSelector::Regex(pattern) => {
+            if no_header {
+                bail!("regex column selectors require headers");
+            }
+            let regex = Regex::new(pattern)
+                .with_context(|| format!("invalid regex pattern '{}'", pattern))?;
+            // Borrowed keys suffice; `insert` returns false for a header text seen before.
+            let mut seen: HashSet<&str> = HashSet::new();
+            let mut matches = Vec::new();
+            for (idx, header) in headers.iter().enumerate() {
+                if regex.is_match(header) && (allow_duplicates || seen.insert(header.as_str())) {
+                    matches.push(idx);
+                }
+            }
+            if matches.is_empty() {
+                bail!("regex pattern '{}' did not match any columns", pattern);
+            }
+            Ok(matches)
+        }
+        ColumnSelector::Range(_, _) => unreachable!("range selectors handled separately"),
+        ColumnSelector::Special(special) => bail!(
+            "special column '{}' not supported without column injection",
+            special.default_header()
+        ),
+    }
+}
+
 fn resolve_selector_index(
     headers: &[String],
     selector: &ColumnSelector,
@@ -562,6 +712,9 @@ fn resolve_selector_index(
                 .with_context(|| format!("column '{}' not found", name))?;
             Ok(index)
         }
+        ColumnSelector::Regex(_) => {
+            bail!("regex column selectors cannot be used in range endpoints")
+        }
         ColumnSelector::Special(special) => bail!(
             "special column '{}' not supported without column injection",
             special.default_header()
@@ -576,7 +729,7 @@ fn resolve_selector_index(
 mod tests {
     use super::{
         ColumnSelector, SpecialColumn, parse_selector_list, parse_single_selector,
-        resolve_selectors,
+        resolve_selectors, resolve_selectors_allow_duplicates,
     };
 
     #[test]
@@ -702,4 +855,30 @@ mod tests {
         assert!(matches!(selectors[1], ColumnSelector::Name(ref name) if name == "__file__"));
         assert!(matches!(selectors[2], ColumnSelector::Name(ref name) if name == "__base__"));
     }
+
+    /// A `~"regex"` selector parsed from text resolves to every column whose header matches
+    /// the pattern.
+    #[test]
+    fn regex_selector_matches_columns() {
+        // Only the first and third headers start with `sample_`, so `other` must be skipped
+        // and the matches must come back in header order.
+        let headers = ["sample_a", "other", "sample_b"].map(String::from);
+        let parsed = parse_selector_list("~\"^sample_\"").unwrap();
+        let resolved = resolve_selectors(&headers, &parsed, false).unwrap();
+        assert_eq!(resolved, [0, 2]);
+    }
+
+    /// Repeated header names resolve to the first column by default and to every occurrence
+    /// through `resolve_selectors_allow_duplicates`.
+    #[test]
+    fn allow_duplicates_includes_repeated_headers() {
+        // `value` is the header of both column 0 and column 1.
+        let headers = ["value", "value", "other"].map(String::from);
+        let parsed = parse_selector_list("value").unwrap();
+        let first_only = resolve_selectors(&headers, &parsed, false).unwrap();
+        assert_eq!(first_only, [0]);
+
+        let every_match = resolve_selectors_allow_duplicates(&headers, &parsed, false).unwrap();
+        assert_eq!(every_match, [0, 1]);
+    }
 }