Skip to content
Merged

Dev #13

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "tsvkit"
version = "0.8.7"
version = "0.8.8"
edition = "2024"

[dependencies]
Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ The same pipeline works if the cytokine table is compressed (`examples/cytokines

The list below covers every `tsvkit` subcommand with a one-line purpose summary; each item links to the detailed section later in this guide.

- [`info`](#info) — inspect table shape, inferred column types, and sample values.
- [`cut`](#cut) — select or reorder columns via names, indices, or ranges.
- [`filter`](#filter) — keep rows matching an expression (math, logic, regex, functions).
- [`join`](#join) — join multiple TSVs on key columns with parallel loading and fill values.
Expand Down Expand Up @@ -145,6 +146,29 @@ Aggregators support descriptive statistics in `summarize` and row-wise calculati

Each subsection highlights the core options, shows realistic invocations, and calls out relevant selectors or expressions.

### `info`

Get a quick, structured summary of any TSV: the overall shape plus one row per column with inferred types and sample values. The preview column defaults to the first three rows, but you can raise or lower it with `-n` (e.g. `-n 5`). Combine with `-H` when the input has no header row so the summary omits the name column.

```bash
tsvkit info examples/samples.tsv
```

_Output_
```
#shape(6, 9)
index name type first3
1 sample_id str [S01, S02, S03]
2 subject_id str [P001, P002, P001]
3 group str [case, control, case]
4 timepoint str [baseline, baseline, week4]
5 purity num [0.94, 0.90, 0.96]
6 dna_ug num [25.3, 22.8, 27.4]
7 rna_ug num [18.1, 17.5, 19.8]
8 contamination_pct num [0.02, 0.03, 0.01]
9 tech str [sRNA-seq, sRNA-seq, sRNA-seq]
```

### `cut`

Select or reorder columns by name, index, or range.
Expand Down
238 changes: 238 additions & 0 deletions src/info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
use std::io::{self, BufWriter, Write};
use std::path::PathBuf;

use anyhow::{Context, Result};
use chrono::NaiveDate;
use clap::Args;

use crate::common::{InputOptions, reader_for_path, should_skip_record};

// Command-line arguments for the `info` subcommand, parsed through clap's derive API.
//
// NOTE: clap turns the `///` doc comments on the fields below into `--help` text, so they
// are user-facing strings rather than ordinary comments. Maintainer notes in this struct
// therefore use plain `//` comments, which do not end up in the generated help.
#[derive(Args, Debug)]
#[command(
about = "Inspect TSV dimensions, column types, and value previews",
long_about = r#"Report the table shape and column details for a TSV file (or stdin). The output starts with #shape(rows, cols) followed by a TSV summary listing each column's index, optional name, inferred type (num/date/str), and the first N observed values (default 3). Respects shared options like -H/--no-header, -C/--comment-char, -E/--ignore-empty-row, and -I/--ignore-illegal-row."#
)]
pub struct InfoArgs {
// Positional input path; falls back to "-" when the argument is omitted.
/// Input TSV file (use '-' for stdin; gz/xz supported)
#[arg(value_name = "FILE", default_value = "-")]
pub file: PathBuf,

// Caps how many leading values each column keeps for its preview; `run` also embeds the
// number in the `first{N}` header of the preview column.
/// Number of sample values to include in the preview column
#[arg(short = 'n', long = "preview", value_name = "N", default_value_t = 3)]
pub preview: usize,

// Forwarded to `reader_for_path`; `run` additionally drops the `name` column from the
// summary when this is set.
/// Treat input as headerless (suppress column names in the summary)
#[arg(short = 'H', long = "no-header")]
pub no_header: bool,

// Kept as a `String` here and handed to `InputOptions::from_flags`, which is fallible —
// presumably that is where the value is validated (TODO confirm).
/// Lines starting with this comment character are skipped
#[arg(
short = 'C',
long = "comment-char",
value_name = "CHAR",
default_value = "#"
)]
pub comment_char: String,

// Forwarded to `InputOptions::from_flags`; the skipping itself is decided by
// `should_skip_record` inside `run`.
/// Ignore rows where every field is empty/whitespace
#[arg(short = 'E', long = "ignore-empty-row")]
pub ignore_empty_row: bool,

// Forwarded to `InputOptions::from_flags`; `run` passes the expected width (header width,
// or the first data row's width when there is no header) to `should_skip_record`.
/// Ignore rows whose column count differs from the header/first row
#[arg(short = 'I', long = "ignore-illegal-row")]
pub ignore_illegal_row: bool,
}

/// Entry point for `tsvkit info`: summarise the shape and columns of a TSV input.
///
/// Writes to stdout, in order:
/// 1. a `#shape(rows, cols)` line, where `rows` counts the data records that were not
///    skipped and `cols` is the number of tracked columns;
/// 2. a tab-separated header (`index`, optionally `name`, `type`, `first{N}`);
/// 3. one tab-separated line per column with its 1-based index, its name (only when the
///    input has a header), the inferred type label and the bracketed preview values.
///
/// # Errors
///
/// Returns an error when the input options cannot be built, the input cannot be opened,
/// the header or a record cannot be read, or writing to stdout fails.
pub fn run(args: InfoArgs) -> Result<()> {
    let input_opts = InputOptions::from_flags(
        &args.comment_char,
        args.ignore_empty_row,
        args.ignore_illegal_row,
    )?;
    let mut reader = reader_for_path(&args.file, args.no_header, &input_opts)?;
    let mut writer = BufWriter::new(io::stdout().lock());

    let has_header = !args.no_header;
    let preview_limit = args.preview;

    // Seed the column list from the header row when there is one; otherwise columns are
    // discovered lazily from the data records below.
    let mut column_names: Vec<String> = Vec::new();
    if has_header {
        let header_record = reader
            .headers()
            .with_context(|| format!("failed reading header from {:?}", args.file))?;
        column_names.extend(header_record.iter().map(|field| field.to_string()));
    }
    let mut summaries: Vec<ColumnSummary> = column_names
        .iter()
        .map(|_| ColumnSummary::new(preview_limit))
        .collect();

    // An empty (or absent) header leaves the width open so the first kept record fixes it.
    let mut expected_width: Option<usize> =
        (!column_names.is_empty()).then_some(column_names.len());

    let mut row_count = 0usize;
    for record in reader.records() {
        let record = record.with_context(|| format!("failed reading from {:?}", args.file))?;
        if should_skip_record(&record, &input_opts, expected_width) {
            continue;
        }

        // `extend_columns` does nothing unless the record is wider than what is tracked,
        // so it can be called for every kept record.
        extend_columns(
            record.len(),
            &mut column_names,
            &mut summaries,
            preview_limit,
        );
        expected_width = expected_width.or(Some(record.len()));

        absorb_record(&record, &mut summaries);
        row_count += 1;
    }

    writeln!(writer, "#shape({}, {})", row_count, summaries.len())?;

    let preview_header = format!("first{}", preview_limit);
    let header_fields: Vec<&str> = if has_header {
        vec!["index", "name", "type", preview_header.as_str()]
    } else {
        vec!["index", "type", preview_header.as_str()]
    };
    writeln!(writer, "{}", header_fields.join("\t"))?;

    for (idx, summary) in summaries.iter().enumerate() {
        let index = idx + 1;
        let type_label = summary.type_label();
        let preview = summary.preview_string();
        if has_header {
            let name = column_names.get(idx).map_or("", String::as_str);
            writeln!(writer, "{}\t{}\t{}\t{}", index, name, type_label, preview)?;
        } else {
            writeln!(writer, "{}\t{}\t{}", index, type_label, preview)?;
        }
    }

    writer.flush()?;
    Ok(())
}

/// Grow `column_names` and `summaries` until `column_names` holds `target_len` entries.
///
/// Each added column is named `col{N}` (1-based position) and gets a fresh
/// [`ColumnSummary`] built with `preview_limit`. Nothing happens when `column_names`
/// already has at least `target_len` entries.
fn extend_columns(
    target_len: usize,
    column_names: &mut Vec<String>,
    summaries: &mut Vec<ColumnSummary>,
    preview_limit: usize,
) {
    // An empty range (target already reached) makes both `extend` calls no-ops.
    let missing = column_names.len()..target_len;
    summaries.extend(missing.clone().map(|_| ColumnSummary::new(preview_limit)));
    column_names.extend(missing.map(|position| format!("col{}", position + 1)));
}

/// Feed one record into the per-column summaries, position by position.
///
/// Every summary is updated exactly once: columns missing from a short record receive
/// an empty string, and fields beyond `summaries.len()` are ignored.
fn absorb_record(record: &csv::StringRecord, summaries: &mut [ColumnSummary]) {
    // Pad the record with empty strings so the zip is bounded by `summaries` alone.
    let padded_fields = record.iter().chain(std::iter::repeat(""));
    for (summary, field) in summaries.iter_mut().zip(padded_fields) {
        summary.update(field);
    }
}

/// Accumulated facts about a single column: a bounded preview of its leading values
/// and the running type inference.
struct ColumnSummary {
/// Maximum number of values `update` will store in `previews`.
preview_limit: usize,
/// The first values passed to `update`, stored as received (not trimmed; empty
/// strings are kept too).
previews: Vec<String>,
/// Type inferred so far from the values passed to `update`.
kind: ColumnTypeState,
}

impl ColumnSummary {
    /// Create an empty summary that keeps at most `preview_limit` preview values.
    fn new(preview_limit: usize) -> Self {
        Self {
            kind: ColumnTypeState::Unknown,
            previews: Vec::new(),
            preview_limit,
        }
    }

    /// Record one cell: refine the inferred type and, while there is still room,
    /// keep the raw value for the preview.
    fn update(&mut self, value: &str) {
        self.kind.observe(value);

        let has_room = self.previews.len() < self.preview_limit;
        if has_room {
            self.previews.push(value.to_owned());
        }
    }

    /// Short label for the inferred column type, as produced by the type state.
    fn type_label(&self) -> &'static str {
        self.kind.label()
    }

    /// Render the preview values as `[a, b, c]`; an empty preview renders as `[]`.
    fn preview_string(&self) -> String {
        // Joining an empty list yields "", so the empty case needs no special handling.
        let joined = self.previews.join(", ");
        format!("[{}]", joined)
    }
}

/// Running type inference for one column.
///
/// `Debug`, `PartialEq` and `Eq` are derived so the state can be compared and printed in
/// assertions and diagnostics.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ColumnTypeState {
    /// No non-blank value has been observed yet.
    Unknown,
    /// Every non-blank value so far was numeric.
    Numeric,
    /// Every non-blank value so far was a date.
    Date,
    /// At least one value was free text, or numeric and date values were mixed.
    Text,
}

/// Classification of a single non-blank cell value.
///
/// The enum is a plain tag with no payload, so the standard value-type traits are
/// derived to make it copyable, comparable and printable.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ValueKind {
    /// The value is numeric.
    Numeric,
    /// The value is a date.
    Date,
    /// Anything that is neither numeric nor a date.
    Text,
}

impl ColumnTypeState {
    /// Fold one raw cell into the inferred column type.
    ///
    /// The value is trimmed first and blank values are ignored, so they never change
    /// the state. A non-blank value is classified as numeric, else date, else text.
    /// The state keeps its type while every value agrees; any disagreement (including
    /// numeric mixed with date) moves it to `Text`, which it never leaves.
    fn observe(&mut self, raw: &str) {
        // The numeric check must come before the date check, matching the order in
        // which the classification is defined.
        let observed = match raw.trim() {
            "" => return,
            cell if is_numeric(cell) => ValueKind::Numeric,
            cell if is_date(cell) => ValueKind::Date,
            _ => ValueKind::Text,
        };

        // Exhaustive on purpose: adding a variant to either enum forces a decision here.
        *self = match (*self, observed) {
            (Self::Unknown | Self::Numeric, ValueKind::Numeric) => Self::Numeric,
            (Self::Unknown | Self::Date, ValueKind::Date) => Self::Date,
            (Self::Unknown, ValueKind::Text)
            | (Self::Text, _)
            | (Self::Numeric, ValueKind::Date | ValueKind::Text)
            | (Self::Date, ValueKind::Numeric | ValueKind::Text) => Self::Text,
        };
    }

    /// Label shown in the `type` column; a column with no non-blank values is
    /// reported as `str`.
    fn label(&self) -> &'static str {
        match *self {
            Self::Numeric => "num",
            Self::Date => "date",
            Self::Unknown | Self::Text => "str",
        }
    }
}

/// Whether `value` parses as an `f64`.
///
/// This follows Rust's `f64` string parsing exactly: surrounding whitespace is not
/// tolerated, and spellings such as `inf` and `NaN` count as numeric.
fn is_numeric(value: &str) -> bool {
    let parsed: Result<f64, _> = value.parse();
    parsed.is_ok()
}

/// Whether chrono can parse `value` as a calendar date in `%Y-%m-%d` form.
fn is_date(value: &str) -> bool {
    let parsed = NaiveDate::parse_from_str(value, "%Y-%m-%d");
    parsed.is_ok()
}
25 changes: 24 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
use anyhow::Result;
use clap::{Parser, Subcommand};
use std::env;
use std::io;

mod common;
mod csv;
mod cut;
mod excel;
mod expression;
mod filter;
mod info;
mod join;
mod melt;
mod mutate;
Expand Down Expand Up @@ -56,12 +58,14 @@ enum Commands {
Excel(excel::ExcelArgs),
/// CSV utilities (convert to TSV)
Csv(csv::CsvArgs),
/// Inspect TSV schema, column types, and previews
Info(info::InfoArgs),
}

fn main() -> Result<()> {
let raw_args: Vec<_> = env::args_os().collect();
let cli = Cli::parse_from(raw_args.clone());
match cli.command {
let result = match cli.command {
Commands::Join(args) => join::run(args),
Commands::Summarize(args) => summarize::run(args),
Commands::Cut(args) => cut::run(args),
Expand All @@ -74,5 +78,24 @@ fn main() -> Result<()> {
Commands::Slice(args) => slice::run(args),
Commands::Excel(args) => excel::run(args, &raw_args),
Commands::Csv(args) => csv::run(args),
Commands::Info(args) => info::run(args),
};

if let Err(err) = &result {
if is_broken_pipe(err) {
return Ok(());
}
}

result
}

/// Whether any error in `err`'s cause chain is an `io::Error` of kind `BrokenPipe`.
fn is_broken_pipe(err: &anyhow::Error) -> bool {
    err.chain()
        // Only `io::Error` links can carry the kind; every other cause is skipped.
        .filter_map(|cause| cause.downcast_ref::<io::Error>())
        .any(|io_err| io_err.kind() == io::ErrorKind::BrokenPipe)
}
Loading