diff --git a/docs/spec/todos/TODO-0116.md b/docs/spec/todos/TODO-0116.md new file mode 100644 index 0000000..9922e15 --- /dev/null +++ b/docs/spec/todos/TODO-0116.md @@ -0,0 +1,53 @@ +--- +id: 116 +title: "Trim DataFusion default features to reduce binary size" +status: todo +priority: medium +created: 2026-03-17 +depends_on: [] +blocks: [] +--- + +# TODO-0116: Trim DataFusion default features to reduce binary size + +## Summary + +DataFusion is pulled with all default features, many of which mdvs doesn't use. Disabling unused features should reduce compile time and binary size. + +## Details + +### Features needed + +- `parquet` — read/write parquet files +- `sql` — SQL query execution (CREATE VIEW, SELECT, JOIN) +- `nested_expressions` — array functions (`array_has`, `array_length`) used in `--where` +- `string_expressions` — string functions (LIKE, LOWER, UPPER) +- `recursive_protection` — stack overflow safety +- `unicode_expressions` — possibly needed for Unicode field names +- `regex_expressions` — possibly needed for LIKE patterns + +### Features to disable + +- `crypto_expressions` — md5, sha256 (not used) +- `datetime_expressions` — date/time functions (not used, dates are strings) +- `encoding_expressions` — base64, hex (not used) +- `compression` — bzip2, xz, flate2, zstd for reading compressed CSV/JSON (not used — Snappy for parquet is built into the parquet crate) + +### Change + +```toml +datafusion = { version = "52", default-features = false, features = [ + "parquet", "sql", "nested_expressions", "string_expressions", + "unicode_expressions", "regex_expressions", "recursive_protection", +] } +``` + +### Testing + +- Verify all `--where` queries work (scalar, array, nested object, LIKE, LOWER) +- Verify parquet read/write with Snappy compression still works +- Check binary size before/after + +## Files + +- `Cargo.toml` — datafusion dependency spec diff --git a/docs/spec/todos/TODO-0117.md b/docs/spec/todos/TODO-0117.md new file mode 100644 index 0000000..86f7fc8 --- /dev/null +++ b/docs/spec/todos/TODO-0117.md @@ -0,0 +1,46 @@ +--- +id: 117 +title: "Fix null values skipping Disallowed and NullNotAllowed checks" +status: done +completed: 2026-03-17 +files_updated: [src/cmd/check.rs] +priority: high +created: 2026-03-17 +depends_on: [] +blocks: [] +--- + +# TODO-0117: Fix null values skipping Disallowed and NullNotAllowed checks + +## Summary + +`check_field_values()` in `src/cmd/check.rs` line 401 does `if value.is_null() { continue; }`, which skips ALL subsequent checks for null values. This means null values never trigger `Disallowed` (field present at a wrong path) or `NullNotAllowed` (null on a non-nullable field) unless the field is required. + +## Problem + +The four violation checks are independent and orthogonal: + +1. **WrongType** — does the value match the declared type? Null always passes (null is accepted by any type). +2. **Disallowed** — is the field present at a path not in `allowed`? The key is there regardless of whether the value is null. +3. **NullNotAllowed** — is the value null and `nullable = false`? +4. **MissingRequired** — is the field absent in a required path? (Handled in `check_required_fields()`, works correctly.) + +All four should run independently on every field in every file. A field can trigger both `Disallowed` AND `NullNotAllowed` simultaneously. + +Currently, `continue` on null prevents checks 2 and 3 from running. Only `check_required_fields()` catches `NullNotAllowed`, and only for files matching a `required` glob. + +## Reproduction + +In the Refractions vault, `projects` has `nullable = true` inferred because 5 files have bare `projects:` (YAML null). If you change `nullable` to `false` in `mdvs.toml`, `mdvs check` still reports no violations — because those 5 files are in non-required paths (`technologies/rust/concepts/`, etc.) and the null `continue` skips them entirely. + +## Fix + +In `check_field_values()`, remove the unconditional `continue` on null. Instead: +- Run `Disallowed` check normally (null or not, the key is present) +- Run `NullNotAllowed` check (if field is in toml and `nullable = false`) +- Do NOT run `WrongType` check (null is accepted by any type — no value to compare) + +## Files + +- `src/cmd/check.rs` — `check_field_values()` around line 401 +- Tests: add test for null on non-nullable, non-required field diff --git a/docs/spec/todos/index.md b/docs/spec/todos/index.md index 61ae58b..f735be5 100644 --- a/docs/spec/todos/index.md +++ b/docs/spec/todos/index.md @@ -117,3 +117,5 @@ | [0113](TODO-0113.md) | Progress bar for model download and embedding | todo | medium | 2026-03-16 | | [0114](TODO-0114.md) | Auto-generate CLI output examples in mdBook with mdbook-cmdrun | todo | medium | 2026-03-17 | | [0115](TODO-0115.md) | Embed asciinema recordings in mdBook for interactive demos | todo | medium | 2026-03-17 | +| [0116](TODO-0116.md) | Trim DataFusion default features to reduce binary size | todo | medium | 2026-03-17 | +| [0117](TODO-0117.md) | Fix null values skipping Disallowed and NullNotAllowed checks | done | high | 2026-03-17 | diff --git a/src/cmd/check.rs b/src/cmd/check.rs index 4bbff3a..4e9bc7f 100644 --- a/src/cmd/check.rs +++ b/src/cmd/check.rs @@ -398,25 +398,8 @@ fn check_field_values( continue; } - if value.is_null() { - continue; - } - if let Some(toml_field) = field_map.get(field_name.as_str()) { - let expected = FieldType::try_from(&toml_field.field_type) - .map_err(|e| anyhow::anyhow!("invalid type for '{}': {}", field_name, e))?; - if !type_matches(&expected, value) { - let key = ViolationKey { - field: field_name.clone(), - kind: ViolationKind::WrongType, - rule: format!("type {}", toml_field.field_type), - }; - violations.entry(key).or_default().push(ViolatingFile { - path: file.path.clone(), - detail: Some(format!("got {}", actual_type_name(value))), - }); - } - + // Disallowed: field present at a path not in allowed if !matches_any_glob(&toml_field.allowed, &file_path_str) { let key = ViolationKey { field: field_name.clone(), @@ -428,6 +411,38 @@ fn check_field_values( detail: None, }); } + + if value.is_null() { + // NullNotAllowed: null value on a non-nullable field + if !toml_field.nullable { + let key = ViolationKey { + field: field_name.clone(), + kind: ViolationKind::NullNotAllowed, + rule: "not nullable".to_string(), + }; + violations.entry(key).or_default().push(ViolatingFile { + path: file.path.clone(), + detail: None, + }); + } + } else { + // WrongType: value doesn't match declared type + let expected = + FieldType::try_from(&toml_field.field_type).map_err(|e| { + anyhow::anyhow!("invalid type for '{}': {}", field_name, e) + })?; + if !type_matches(&expected, value) { + let key = ViolationKey { + field: field_name.clone(), + kind: ViolationKind::WrongType, + rule: format!("type {}", toml_field.field_type), + }; + violations.entry(key).or_default().push(ViolatingFile { + path: file.path.clone(), + detail: Some(format!("got {}", actual_type_name(value))), + }); + } + } } else { new_field_paths .entry(field_name.clone()) @@ -945,4 +960,153 @@ mod tests { // Should have: draft wrong type, tags missing required, draft disallowed assert!(result.field_violations.len() >= 3); } + + #[tokio::test] + async fn null_on_non_nullable_non_required_field() { + let tmp = tempfile::tempdir().unwrap(); + fs::create_dir_all(tmp.path().join("notes")).unwrap(); + + fs::write( + tmp.path().join("notes/note1.md"), + "---\ntitle: Hello\nstatus:\n---\n# Hello\nBody.", + ) + .unwrap(); + + // status: nullable=false, required=[], allowed=["**"] + write_toml( + tmp.path(), + vec![ + string_field("title"), + TomlField { + name: "status".into(), + field_type: FieldTypeSerde::Scalar("String".into()), + allowed: vec!["**".into()], + required: vec![], + nullable: false, + }, + ], + vec![], + ); + + let result = run(tmp.path(), true, false).await.result.unwrap(); + assert!(result.has_violations()); + let v = result + .field_violations + .iter() + .find(|v| v.field == "status") + .expect("expected NullNotAllowed for status"); + assert!(matches!(v.kind, ViolationKind::NullNotAllowed)); + } + + #[tokio::test] + async fn null_on_nullable_non_required_field() { + let tmp = tempfile::tempdir().unwrap(); + fs::create_dir_all(tmp.path().join("notes")).unwrap(); + + fs::write( + tmp.path().join("notes/note1.md"), + "---\ntitle: Hello\nstatus:\n---\n# Hello\nBody.", + ) + .unwrap(); + + // status: nullable=true, required=[], allowed=["**"] + write_toml( + tmp.path(), + vec![ + string_field("title"), + TomlField { + name: "status".into(), + field_type: FieldTypeSerde::Scalar("String".into()), + allowed: vec!["**".into()], + required: vec![], + nullable: true, + }, + ], + vec![], + ); + + let result = run(tmp.path(), true, false).await.result.unwrap(); + assert!(!result.has_violations()); + } + + #[tokio::test] + async fn null_on_disallowed_path() { + let tmp = tempfile::tempdir().unwrap(); + fs::create_dir_all(tmp.path().join("notes")).unwrap(); + + fs::write( + tmp.path().join("notes/note1.md"), + "---\ntitle: Hello\ndraft:\n---\n# Hello\nBody.", + ) + .unwrap(); + + // draft: allowed only in blog/**, but file is in notes/ + write_toml( + tmp.path(), + vec![ + string_field("title"), + TomlField { + name: "draft".into(), + field_type: FieldTypeSerde::Scalar("Boolean".into()), + allowed: vec!["blog/**".into()], + required: vec![], + nullable: true, + }, + ], + vec![], + ); + + let result = run(tmp.path(), true, false).await.result.unwrap(); + assert!(result.has_violations()); + let v = result + .field_violations + .iter() + .find(|v| v.field == "draft") + .expect("expected Disallowed for draft"); + assert!(matches!(v.kind, ViolationKind::Disallowed)); + } + + #[tokio::test] + async fn null_on_disallowed_path_and_not_nullable() { + let tmp = tempfile::tempdir().unwrap(); + fs::create_dir_all(tmp.path().join("notes")).unwrap(); + + fs::write( + tmp.path().join("notes/note1.md"), + "---\ntitle: Hello\ndraft:\n---\n# Hello\nBody.", + ) + .unwrap(); + + // draft: allowed only in blog/**, nullable=false + // Should trigger BOTH Disallowed AND NullNotAllowed + write_toml( + tmp.path(), + vec![ + string_field("title"), + TomlField { + name: "draft".into(), + field_type: FieldTypeSerde::Scalar("Boolean".into()), + allowed: vec!["blog/**".into()], + required: vec![], + nullable: false, + }, + ], + vec![], + ); + + let result = run(tmp.path(), true, false).await.result.unwrap(); + assert!(result.has_violations()); + + let has_disallowed = result + .field_violations + .iter() + .any(|v| v.field == "draft" && matches!(v.kind, ViolationKind::Disallowed)); + let has_null_not_allowed = result + .field_violations + .iter() + .any(|v| v.field == "draft" && matches!(v.kind, ViolationKind::NullNotAllowed)); + + assert!(has_disallowed, "expected Disallowed for draft"); + assert!(has_null_not_allowed, "expected NullNotAllowed for draft"); + } }