From 9f5da7483c32f901c1f642457fbe18c128f8e671 Mon Sep 17 00:00:00 2001 From: "Z.-L. Deng" Date: Mon, 6 Oct 2025 09:47:08 +0200 Subject: [PATCH 1/2] Add info command and polish CLI ergonomics --- Cargo.lock | 2 +- README.md | 24 +++++ src/info.rs | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++ src/main.rs | 25 +++++- src/mutate.rs | 88 ++++++++++++++++--- 5 files changed, 365 insertions(+), 12 deletions(-) create mode 100644 src/info.rs diff --git a/Cargo.lock b/Cargo.lock index 457efc6..ca20a1e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -603,7 +603,7 @@ dependencies = [ [[package]] name = "tsvkit" -version = "0.8.6" +version = "0.8.7" dependencies = [ "anyhow", "calamine", diff --git a/README.md b/README.md index d524696..e43a4e3 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ The same pipeline works if the cytokine table is compressed (`examples/cytokines The table below lists every `tsvkit` subcommand and a one-line purpose summary; each item links to the detailed section later in this guide. +- [`info`](#info) — inspect table shape, inferred column types, and sample values. - [`cut`](#cut) — select or reorder columns via names, indices, or ranges. - [`filter`](#filter) — keep rows matching an expression (math, logic, regex, functions). - [`join`](#join) — join multiple TSVs on key columns with parallel loading and fill values. @@ -145,6 +146,29 @@ Aggregators support descriptive statistics in `summarize` and row-wise calculati Each subsection highlights the core options, shows realistic invocations, and calls out relevant selectors or expressions. +### `info` + +Get a quick, structured summary of any TSV: the overall shape plus one row per column with inferred types and sample values. The preview column defaults to the first three rows, but you can raise or lower it with `-n` (e.g. `-n 5`). Combine with `-H` when the input has no header row so the summary omits the name column. 
+ +```bash +tsvkit info examples/samples.tsv +``` + +_Output_ +``` +#shape(6, 9) +index name type first3 +1 sample_id str [S01, S02, S03] +2 subject_id str [P001, P002, P001] +3 group str [case, control, case] +4 timepoint str [baseline, baseline, week4] +5 purity num [0.94, 0.90, 0.96] +6 dna_ug num [25.3, 22.8, 27.4] +7 rna_ug num [18.1, 17.5, 19.8] +8 contamination_pct num [0.02, 0.03, 0.01] +9 tech str [sRNA-seq, sRNA-seq, sRNA-seq] +``` + ### `cut` Select or reorder columns by name, index, or range. diff --git a/src/info.rs b/src/info.rs new file mode 100644 index 0000000..b1ec02f --- /dev/null +++ b/src/info.rs @@ -0,0 +1,238 @@ +use std::io::{self, BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use chrono::NaiveDate; +use clap::Args; + +use crate::common::{InputOptions, reader_for_path, should_skip_record}; + +#[derive(Args, Debug)] +#[command( + about = "Inspect TSV dimensions, column types, and value previews", + long_about = r#"Report the table shape and column details for a TSV file (or stdin). The output starts with #shape(rows, cols) followed by a TSV summary listing each column's index, optional name, inferred type (num/date/str), and the first N observed values (default 3). 
Respects shared options like -H/--no-header, -C/--comment-char, -E/--ignore-empty-row, and -I/--ignore-illegal-row."# +)] +pub struct InfoArgs { + /// Input TSV file (use '-' for stdin; gz/xz supported) + #[arg(value_name = "FILE", default_value = "-")] + pub file: PathBuf, + + /// Number of sample values to include in the preview column + #[arg(short = 'n', long = "preview", value_name = "N", default_value_t = 3)] + pub preview: usize, + + /// Treat input as headerless (suppress column names in the summary) + #[arg(short = 'H', long = "no-header")] + pub no_header: bool, + + /// Lines starting with this comment character are skipped + #[arg( + short = 'C', + long = "comment-char", + value_name = "CHAR", + default_value = "#" + )] + pub comment_char: String, + + /// Ignore rows where every field is empty/whitespace + #[arg(short = 'E', long = "ignore-empty-row")] + pub ignore_empty_row: bool, + + /// Ignore rows whose column count differs from the header/first row + #[arg(short = 'I', long = "ignore-illegal-row")] + pub ignore_illegal_row: bool, +} + +pub fn run(args: InfoArgs) -> Result<()> { + let input_opts = InputOptions::from_flags( + &args.comment_char, + args.ignore_empty_row, + args.ignore_illegal_row, + )?; + let mut reader = reader_for_path(&args.file, args.no_header, &input_opts)?; + let mut writer = BufWriter::new(io::stdout().lock()); + + let preview_limit = args.preview; + let mut column_names: Vec = Vec::new(); + let mut summaries: Vec = Vec::new(); + let mut row_count = 0usize; + let mut expected_width: Option = None; + + if !args.no_header { + let headers = reader + .headers() + .with_context(|| format!("failed reading header from {:?}", args.file))? 
+ .iter() + .map(|s| s.to_string()) + .collect::>(); + expected_width = if headers.is_empty() { + None + } else { + Some(headers.len()) + }; + column_names = headers; + summaries = (0..column_names.len()) + .map(|_| ColumnSummary::new(preview_limit)) + .collect(); + } + + for record in reader.records() { + let record = record.with_context(|| format!("failed reading from {:?}", args.file))?; + if should_skip_record(&record, &input_opts, expected_width) { + continue; + } + + if record.len() > column_names.len() { + extend_columns( + record.len(), + &mut column_names, + &mut summaries, + preview_limit, + ); + } + + if expected_width.is_none() { + expected_width = Some(record.len()); + } + + absorb_record(&record, &mut summaries); + row_count += 1; + } + + let column_count = summaries.len(); + writeln!(writer, "#shape({}, {})", row_count, column_count)?; + + let mut header_fields = vec!["index".to_string()]; + if !args.no_header { + header_fields.push("name".to_string()); + } + header_fields.push("type".to_string()); + header_fields.push(format!("first{}", preview_limit)); + writeln!(writer, "{}", header_fields.join("\t"))?; + + for (idx, summary) in summaries.iter().enumerate() { + let mut fields = vec![(idx + 1).to_string()]; + if !args.no_header { + let name = column_names.get(idx).map(|s| s.as_str()).unwrap_or(""); + fields.push(name.to_string()); + } + fields.push(summary.type_label().to_string()); + fields.push(summary.preview_string()); + writeln!(writer, "{}", fields.join("\t"))?; + } + + writer.flush()?; + Ok(()) +} + +fn extend_columns( + target_len: usize, + column_names: &mut Vec, + summaries: &mut Vec, + preview_limit: usize, +) { + while column_names.len() < target_len { + let idx = column_names.len(); + column_names.push(format!("col{}", idx + 1)); + summaries.push(ColumnSummary::new(preview_limit)); + } +} + +fn absorb_record(record: &csv::StringRecord, summaries: &mut [ColumnSummary]) { + for (idx, summary) in summaries.iter_mut().enumerate() { + let 
value = record.get(idx).unwrap_or(""); + summary.update(value); + } +} + +struct ColumnSummary { + preview_limit: usize, + previews: Vec, + kind: ColumnTypeState, +} + +impl ColumnSummary { + fn new(preview_limit: usize) -> Self { + ColumnSummary { + preview_limit, + previews: Vec::new(), + kind: ColumnTypeState::Unknown, + } + } + + fn update(&mut self, value: &str) { + if self.previews.len() < self.preview_limit { + self.previews.push(value.to_string()); + } + self.kind.observe(value); + } + + fn type_label(&self) -> &'static str { + self.kind.label() + } + + fn preview_string(&self) -> String { + if self.previews.is_empty() { + "[]".to_string() + } else { + format!("[{}]", self.previews.join(", ")) + } + } +} + +#[derive(Clone, Copy)] +enum ColumnTypeState { + Unknown, + Numeric, + Date, + Text, +} + +enum ValueKind { + Numeric, + Date, + Text, +} + +impl ColumnTypeState { + fn observe(&mut self, raw: &str) { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return; + } + let value_kind = if is_numeric(trimmed) { + ValueKind::Numeric + } else if is_date(trimmed) { + ValueKind::Date + } else { + ValueKind::Text + }; + + match (*self, value_kind) { + (ColumnTypeState::Unknown, ValueKind::Numeric) => *self = ColumnTypeState::Numeric, + (ColumnTypeState::Unknown, ValueKind::Date) => *self = ColumnTypeState::Date, + (ColumnTypeState::Unknown, ValueKind::Text) => *self = ColumnTypeState::Text, + (ColumnTypeState::Numeric, ValueKind::Numeric) => {} + (ColumnTypeState::Date, ValueKind::Date) => {} + (ColumnTypeState::Text, _) => {} + _ => *self = ColumnTypeState::Text, + } + } + + fn label(&self) -> &'static str { + match self { + ColumnTypeState::Unknown => "str", + ColumnTypeState::Numeric => "num", + ColumnTypeState::Date => "date", + ColumnTypeState::Text => "str", + } + } +} + +fn is_numeric(value: &str) -> bool { + value.parse::().is_ok() +} + +fn is_date(value: &str) -> bool { + NaiveDate::parse_from_str(value, "%Y-%m-%d").is_ok() +} diff --git 
a/src/main.rs b/src/main.rs index cfb9479..31870f7 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,6 +1,7 @@ use anyhow::Result; use clap::{Parser, Subcommand}; use std::env; +use std::io; mod common; mod csv; @@ -8,6 +9,7 @@ mod cut; mod excel; mod expression; mod filter; +mod info; mod join; mod melt; mod mutate; @@ -56,12 +58,14 @@ enum Commands { Excel(excel::ExcelArgs), /// CSV utilities (convert to TSV) Csv(csv::CsvArgs), + /// Inspect TSV schema, column types, and previews + Info(info::InfoArgs), } fn main() -> Result<()> { let raw_args: Vec<_> = env::args_os().collect(); let cli = Cli::parse_from(raw_args.clone()); - match cli.command { + let result = match cli.command { Commands::Join(args) => join::run(args), Commands::Summarize(args) => summarize::run(args), Commands::Cut(args) => cut::run(args), @@ -74,5 +78,24 @@ fn main() -> Result<()> { Commands::Slice(args) => slice::run(args), Commands::Excel(args) => excel::run(args, &raw_args), Commands::Csv(args) => csv::run(args), + Commands::Info(args) => info::run(args), + }; + + if let Err(err) = &result { + if is_broken_pipe(err) { + return Ok(()); + } } + + result +} + +fn is_broken_pipe(err: &anyhow::Error) -> bool { + err.chain().any(|cause| { + if let Some(io_err) = cause.downcast_ref::() { + io_err.kind() == io::ErrorKind::BrokenPipe + } else { + false + } + }) } diff --git a/src/mutate.rs b/src/mutate.rs index 6053a30..9320ab0 100644 --- a/src/mutate.rs +++ b/src/mutate.rs @@ -243,20 +243,88 @@ fn parse_assignment_expression( headers: &[String], no_header: bool, ) -> Result { - let (name_part, value_part) = expr - .split_once('=') - .with_context(|| "assignment must use name=expression syntax")?; - let name = name_part.trim(); - if name.is_empty() { - bail!("missing column name on left-hand side"); + if let Some((name_part, value_part)) = split_assignment(expr) { + let name = name_part.trim(); + if name.is_empty() { + bail!("missing column name on left-hand side"); + } + let function = 
/// Splits `expr` around its first top-level assignment `=`.
///
/// Returns `(left, right)` without the `=` itself and without trimming, or
/// `None` when `expr` contains no assignment (see [`find_assignment`]).
fn split_assignment(expr: &str) -> Option<(&str, &str)> {
    let idx = find_assignment(expr)?;
    // '=' is a single ASCII byte, so `idx + 1` is always a char boundary.
    Some((&expr[..idx], &expr[idx + 1..]))
}

/// Returns the byte offset of the first `=` in `expr` that acts as an
/// assignment.
///
/// An `=` is skipped when it sits inside a single- or double-quoted string, or
/// when it belongs to a comparison operator (`==`, `!=`, `<=`, `>=`). The
/// comparison check ignores whitespace between the operator characters.
///
/// A quote is treated as escaped only when it is preceded by an odd number of
/// consecutive backslashes, so a string ending in an escaped backslash
/// (`'a\\'`) closes correctly.
fn find_assignment(expr: &str) -> Option<usize> {
    let mut in_single = false;
    let mut in_double = false;
    // Length of the run of backslashes immediately before the current char.
    let mut backslash_run = 0usize;
    let mut prev_non_ws: Option<char> = None;

    for (idx, ch) in expr.char_indices() {
        let escaped = backslash_run % 2 == 1;
        match ch {
            '\'' if !in_double && !escaped => in_single = !in_single,
            '"' if !in_single && !escaped => in_double = !in_double,
            '=' if !in_single && !in_double => {
                let next_non_ws = expr[idx + ch.len_utf8()..]
                    .chars()
                    .find(|next| !next.is_whitespace());
                // Second half of `==`, `!=`, `<=` or `>=`.
                let ends_operator = matches!(prev_non_ws, Some('=' | '!' | '<' | '>'));
                // First half of `==`.
                let starts_equality = next_non_ws == Some('=');
                if !ends_operator && !starts_equality {
                    return Some(idx);
                }
            }
            _ => {}
        }

        if !ch.is_whitespace() {
            prev_non_ws = Some(ch);
        }
        backslash_run = if ch == '\\' { backslash_run + 1 } else { 0 };
    }
    None
}
"Z.-L. Deng" Date: Mon, 6 Oct 2025 15:48:09 +0800 Subject: [PATCH 2/2] Update Cargo.toml --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7e86798..f6a008b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "tsvkit" -version = "0.8.7" +version = "0.8.8" edition = "2024" [dependencies]