diff --git a/README.md b/README.md index f779de6..17220d3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ [![Go Report Card](https://goreportcard.com/badge/github.com/osamingo/go-csvpp)](https://goreportcard.com/report/github.com/osamingo/go-csvpp) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -A Go implementation of the [IETF CSV++ specification](https://datatracker.ietf.org/doc/draft-mscaldas-csvpp/) (draft-mscaldas-csvpp-01). +A Go implementation of the [IETF CSV++ specification](https://datatracker.ietf.org/doc/draft-mscaldas-csvpp/) (draft-mscaldas-csvpp-02). CSV++ extends traditional CSV to support **arrays** and **structured fields** within cells, enabling complex data representation while maintaining CSV's simplicity. @@ -184,9 +184,12 @@ CSV++ supports four field types in headers: ### Default Delimiters -- Array delimiter: `~` (tilde) +- Array delimiter: `~` (tilde) — the default applies only to top-level arrays - Component delimiter: `^` (caret) +Per draft-02, the default tilde for empty brackets (`[]`) applies only to top-level (first-level) arrays. +Nested arrays **must** specify an explicit delimiter (e.g., `values[;]`). + Custom delimiters can be specified in the header: - `phone[|]` - uses `|` as array delimiter - `geo;(lat;lon)` - uses `;` as component delimiter @@ -307,7 +310,13 @@ if csvpp.HasFormulaPrefix(value) { ## Specification This implementation follows the IETF CSV++ specification: -- [draft-mscaldas-csvpp-01](https://datatracker.ietf.org/doc/draft-mscaldas-csvpp/) +- [draft-mscaldas-csvpp-02](https://datatracker.ietf.org/doc/draft-mscaldas-csvpp/02/) + +### Conformance Notes + +The following draft-02 features are **not yet implemented**: + +- **Leaf-only quoting validation**: draft-02 requires that RFC 4180 double-quote quoting is applied only to leaf elements (atomic values). Quoting non-leaf values (entire arrays or structured fields) is invalid per the specification. 
This library delegates CSV-level quoting to `encoding/csv`, which processes quotes before CSV++ delimiter splitting. As a result, non-leaf quoting cannot be detected or rejected at the CSV++ layer. In practice, the behavior is correct for well-formed input. ## License diff --git a/doc.go b/doc.go index 21fb1f6..4aa0111 100644 --- a/doc.go +++ b/doc.go @@ -1,4 +1,4 @@ -// Package csvpp implements the IETF CSV++ specification (draft-mscaldas-csvpp-01). +// Package csvpp implements the IETF CSV++ specification (draft-mscaldas-csvpp-02). // // CSV++ extends traditional CSV to support arrays and structured fields within cells, // enabling complex data representation while maintaining CSV's simplicity. @@ -9,10 +9,13 @@ // CSV++ introduces four field types beyond simple text values: // // - Simple: "name" - plain text value -// - Array: "tags[]" - multiple values separated by a delimiter (default: ~) +// - Array: "tags[]" - multiple values separated by a delimiter (default: ~ at top level only) // - Structured: "geo(lat^lon)" - named components separated by a delimiter (default: ^) // - ArrayStructured: "addresses[](street^city)" - array of structured values // +// Per draft-02, the default tilde (~) delimiter for empty brackets applies only to +// top-level (first-level) arrays. Nested arrays MUST explicitly specify a delimiter. +// // These field types are represented by the [FieldKind] constants: // [SimpleField], [ArrayField], [StructuredField], and [ArrayStructuredField]. 
// diff --git a/export_test.go b/export_test.go index 4368764..cf5c375 100644 --- a/export_test.go +++ b/export_test.go @@ -6,7 +6,7 @@ var ( ParseColumnHeaderWithDepth = parseColumnHeaderWithDepth ParseHeaderRecordWithMaxDepth = parseHeaderRecordWithMaxDepth ParseName = parseName - ParseArrayDelimiter = parseArrayDelimiter + ParseArrayDelimiterWithDepth = parseArrayDelimiterWithDepth IsFieldChar = isFieldChar SplitByRune = splitByRune SplitByDelimiter = splitByDelimiter diff --git a/header.go b/header.go index c0682a1..737d02c 100644 --- a/header.go +++ b/header.go @@ -45,7 +45,7 @@ func parseColumnHeaderWithDepth(s string, depth, maxDepth int) (*ColumnHeader, e // 2. Extract array delimiter if "[" is present // ABNF: array-field = name "[" [delimiter] "]" if strings.HasPrefix(remaining, "[") { - delim, rest, err := parseArrayDelimiter(remaining) + delim, rest, err := parseArrayDelimiterWithDepth(remaining, depth) if err != nil { return nil, err } @@ -112,10 +112,12 @@ func isFieldChar(r rune) bool { r == '_' || r == '-' } -// parseArrayDelimiter extracts the "[" delimiter "]" part per IETF CSV++ Section 2.2.2. +// parseArrayDelimiterWithDepth extracts the "[" delimiter "]" part per IETF CSV++ Section 2.2.2. // ABNF: array-field = name "[" [delimiter] "]" -// If no delimiter is specified, DefaultArrayDelimiter (~) is used. -func parseArrayDelimiter(s string) (delim rune, rest string, err error) { +// If no delimiter is specified at the top level (depth == 0), DefaultArrayDelimiter (~) is used. +// Per draft-02, nested arrays (depth > 0) MUST explicitly specify a delimiter; +// empty brackets are rejected with ErrInvalidHeader because the default tilde applies only to top-level arrays. 
+func parseArrayDelimiterWithDepth(s string, depth int) (delim rune, rest string, err error) { if !strings.HasPrefix(s, "[") { return 0, s, nil } @@ -133,7 +135,10 @@ func parseArrayDelimiter(s string) (delim rune, rest string, err error) { rest = after if raw == "" { - // Use default delimiter + if depth > 0 { + return 0, "", fmt.Errorf("%w: nested arrays must specify an explicit delimiter (empty brackets are invalid at depth > 0)", ErrInvalidHeader) + } + // Use default delimiter only at top level delim = DefaultArrayDelimiter } else { r, size := utf8.DecodeRuneInString(raw) diff --git a/header_test.go b/header_test.go index a07bd86..46ded74 100644 --- a/header_test.go +++ b/header_test.go @@ -121,6 +121,24 @@ func TestParseColumnHeader(t *testing.T) { }, }, }, + { + name: "success: nested array with explicit delimiter", + input: "data(type^values[;])", + want: &csvpp.ColumnHeader{ + Name: "data", + Kind: csvpp.StructuredField, + ComponentDelimiter: csvpp.DefaultComponentDelimiter, + Components: []*csvpp.ColumnHeader{ + {Name: "type", Kind: csvpp.SimpleField}, + {Name: "values", Kind: csvpp.ArrayField, ArrayDelimiter: ';'}, + }, + }, + }, + { + name: "error: nested array with empty brackets", + input: "data(type^values[])", + wantErr: true, + }, { name: "error: empty string", input: "", @@ -242,48 +260,74 @@ func TestParseName(t *testing.T) { } } -func TestParseArrayDelimiter(t *testing.T) { +func TestParseArrayDelimiterWithDepth(t *testing.T) { t.Parallel() tests := []struct { name string input string + depth int wantDelim rune wantRest string wantErr bool }{ { - name: "success: default delimiter", + name: "success: default delimiter at top level", input: "[]", + depth: 0, wantDelim: csvpp.DefaultArrayDelimiter, wantRest: "", }, { - name: "success: custom delimiter", + name: "success: custom delimiter at top level", input: "[|]", + depth: 0, wantDelim: '|', wantRest: "", }, + { + name: "success: custom delimiter at nested level", + input: "[;]", + depth: 1, + 
wantDelim: ';', + wantRest: "", + }, { name: "success: with rest", input: "[](lat^lon)", + depth: 0, wantDelim: csvpp.DefaultArrayDelimiter, wantRest: "(lat^lon)", }, { name: "success: no bracket", input: "(lat^lon)", + depth: 0, wantDelim: 0, wantRest: "(lat^lon)", }, { name: "error: missing closing bracket", input: "[|", + depth: 0, wantErr: true, }, { name: "error: multiple characters as delimiter", input: "[||]", + depth: 0, + wantErr: true, + }, + { + name: "error: empty brackets at nested level", + input: "[]", + depth: 1, + wantErr: true, + }, + { + name: "error: empty brackets at deeply nested level", + input: "[]", + depth: 3, wantErr: true, }, } @@ -292,19 +336,19 @@ func TestParseArrayDelimiter(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - gotDelim, gotRest, err := csvpp.ParseArrayDelimiter(tt.input) + gotDelim, gotRest, err := csvpp.ParseArrayDelimiterWithDepth(tt.input, tt.depth) if (err != nil) != tt.wantErr { - t.Errorf("parseArrayDelimiter() error = %v, wantErr %v", err, tt.wantErr) + t.Errorf("parseArrayDelimiterWithDepth() error = %v, wantErr %v", err, tt.wantErr) return } if tt.wantErr { return } if gotDelim != tt.wantDelim { - t.Errorf("parseArrayDelimiter() delim = %v, want %v", gotDelim, tt.wantDelim) + t.Errorf("parseArrayDelimiterWithDepth() delim = %v, want %v", gotDelim, tt.wantDelim) } if diff := cmp.Diff(tt.wantRest, gotRest); diff != "" { - t.Errorf("parseArrayDelimiter() rest mismatch (-want +got):\n%s", diff) + t.Errorf("parseArrayDelimiterWithDepth() rest mismatch (-want +got):\n%s", diff) } }) } diff --git a/reader_test.go b/reader_test.go index b178561..3b4f6cb 100644 --- a/reader_test.go +++ b/reader_test.go @@ -377,27 +377,41 @@ func TestSplitByRune(t *testing.T) { func TestReader_NestedStructuredField(t *testing.T) { t.Parallel() - // Test nested array in components - input := "name,data[](type^values[])\nAlice,home^1~2~work^3~4\n" - r := csvpp.NewReader(strings.NewReader(input)) + t.Run("error: nested array 
with empty brackets", func(t *testing.T) { + t.Parallel() - got, err := r.Read() - if err != nil { - t.Fatalf("Reader.Read() error = %v", err) - } + // Per draft-02, nested arrays with empty brackets are invalid + input := "name,data[](type^values[])\nAlice,home^1~2~work^3~4\n" + r := csvpp.NewReader(strings.NewReader(input)) - want := []*csvpp.Field{ - {Value: "Alice"}, - {Components: []*csvpp.Field{ - {Components: []*csvpp.Field{{Value: "home"}, {Values: []string{"1"}}}}, - {Components: []*csvpp.Field{{Value: "2"}, {Values: []string{}}}}, - {Components: []*csvpp.Field{{Value: "work"}, {Values: []string{"3"}}}}, - {Components: []*csvpp.Field{{Value: "4"}, {Values: []string{}}}}, - }}, - } - // Note: This test documents current behavior (may need adjustment based on spec) - _ = want - _ = got + _, err := r.Read() + if err == nil { + t.Error("Reader.Read() expected error for nested array with empty brackets") + } + }) + + t.Run("success: nested array with explicit delimiter", func(t *testing.T) { + t.Parallel() + + input := "name,data[](type^values[;])\nAlice,home^1;2~work^3;4\n" + r := csvpp.NewReader(strings.NewReader(input)) + + got, err := r.Read() + if err != nil { + t.Fatalf("Reader.Read() error = %v", err) + } + + want := []*csvpp.Field{ + {Value: "Alice"}, + {Components: []*csvpp.Field{ + {Components: []*csvpp.Field{{Value: "home"}, {Values: []string{"1", "2"}}}}, + {Components: []*csvpp.Field{{Value: "work"}, {Values: []string{"3", "4"}}}}, + }}, + } + if diff := cmp.Diff(want, got); diff != "" { + t.Errorf("Reader.Read() mismatch (-want +got):\n%s", diff) + } + }) } func TestReader_MismatchedFields(t *testing.T) { @@ -450,8 +464,8 @@ func TestReader_MaxNestingDepth(t *testing.T) { func TestReader_NestedComponents(t *testing.T) { t.Parallel() - // Test nested structured field with array component - input := "name,data(type^values[])\nAlice,home^1~2~3\n" + // Test nested structured field with array component (explicit delimiter per draft-02) + input := 
"name,data(type^values[;])\nAlice,home^1;2;3\n" r := csvpp.NewReader(strings.NewReader(input)) got, err := r.Read()