From be13c81a0c629936002cf9dbed7a76bb3bd394a6 Mon Sep 17 00:00:00 2001 From: Ilya Tribusean Date: Wed, 3 Dec 2025 23:25:27 +0200 Subject: [PATCH 1/4] fix mutable bytes --- utfbom.go | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/utfbom.go b/utfbom.go index c5ab8d6..edde726 100644 --- a/utfbom.go +++ b/utfbom.go @@ -13,14 +13,7 @@ import ( "sync" ) -var ( - _ io.Reader = (*Reader)(nil) - utf8BOM = []byte{0xef, 0xbb, 0xbf} - utf16BEBOM = []byte{0xfe, 0xff} - utf16LEBOM = []byte{0xff, 0xfe} - utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff} - utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00} -) +var _ io.Reader = (*Reader)(nil) // ErrRead helps to trace error origin. var ErrRead = errors.New("utfbom: I/O error during BOM processing") @@ -71,25 +64,25 @@ func DetectEncoding[T string | []byte](input T) Encoding { return Unknown } - if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) { + if len(ibs) >= 3 && bytes.HasPrefix(ibs, []byte{0xef, 0xbb, 0xbf}) { return UTF8 } if len(ibs) >= 4 { - if bytes.HasPrefix(ibs, utf32BEBOM) { + if bytes.HasPrefix(ibs, []byte{0x00, 0x00, 0xfe, 0xff}) { return UTF32BigEndian } - if bytes.HasPrefix(ibs, utf32LEBOM) { + if bytes.HasPrefix(ibs, []byte{0xff, 0xfe, 0x00, 0x00}) { return UTF32LittleEndian } } - if bytes.HasPrefix(ibs, utf16BEBOM) { + if bytes.HasPrefix(ibs, []byte{0xfe, 0xff}) { return UTF16BigEndian } - if bytes.HasPrefix(ibs, utf16LEBOM) { + if bytes.HasPrefix(ibs, []byte{0xff, 0xfe}) { return UTF16LittleEndian } @@ -146,15 +139,15 @@ func (e Encoding) Bytes() []byte { default: return nil case UTF8: - return utf8BOM + return []byte{0xef, 0xbb, 0xbf} case UTF16BigEndian: - return utf16BEBOM + return []byte{0xfe, 0xff} case UTF16LittleEndian: - return utf16LEBOM + return []byte{0xff, 0xfe} case UTF32BigEndian: - return utf32BEBOM + return []byte{0x00, 0x00, 0xfe, 0xff} case UTF32LittleEndian: - return utf32LEBOM + return []byte{0xff, 0xfe, 0x00, 0x00} } } From 53b832268ab7342b871e52ae1d1fe9e3ad104f5b Mon Sep 17 00:00:00 2001 From: Ilya Tribusean Date: Wed, 3 Dec 2025 23:33:52 +0200 Subject: [PATCH 2/4] test --- utfbom_test.go | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/utfbom_test.go b/utfbom_test.go index 012a6e8..59e0f35 100644 --- a/utfbom_test.go +++ b/utfbom_test.go @@ -443,6 +443,34 @@ func TestEncoding_Bytes(t *testing.T) { } } +// TestBytes_NoAliasing checks that bytes returned by Bytes() are immutable. +func TestBytes_NoAliasing(t *testing.T) { + t.Parallel() + + encodings := []utfbom.Encoding{ + utfbom.UTF8, + utfbom.UTF16BigEndian, + utfbom.UTF16LittleEndian, + utfbom.UTF32BigEndian, + utfbom.UTF32LittleEndian, + } + + for _, enc := range encodings { + t.Run(enc.String(), func(t *testing.T) { + t.Parallel() + + original := enc.Bytes() + originalCopy := make([]byte, len(original)) + copy(originalCopy, original) + + original[0] = 0x00 + + fresh := enc.Bytes() + be.Equal(t, fresh, originalCopy) + }) + } +} + func TestPrepend(t *testing.T) { t.Parallel() From 92eb9d436eee04dc69f2ce1430cc2d394100ccf5 Mon Sep 17 00:00:00 2001 From: Ilya Tribusean Date: Wed, 3 Dec 2025 23:51:53 +0200 Subject: [PATCH 3/4] tests and polishing --- utfbom.go | 64 +++++++++++++++++++++++++---------------------- utfbom_test.go | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 30 deletions(-) diff --git a/utfbom.go b/utfbom.go index edde726..e8ff577 100644 --- a/utfbom.go +++ b/utfbom.go @@ -47,48 +47,51 @@ const ( UTF32LittleEndian ) -// DetectEncoding inspects the initial bytes of a string or byte slice (T) -// and returns the detected text encoding based on the presence of known BOMs (Byte Order Marks). -// If no known BOM is found, it returns Unknown. -// -// Supported encodings: -// - UTF-8 (BOM: 0xef 0xbb 0xbf) -// - UTF-16 Big Endian (BOM: 0xfe 0xff) -// - UTF-16 Little Endian (BOM: 0xff 0xfe) -// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) -// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) -func DetectEncoding[T string | []byte](input T) Encoding { - ibs := []byte(input) - - if len(ibs) < 2 { +// detectEncodingBytes is the internal implementation that works directly on []byte. +func detectEncodingBytes(b []byte) Encoding { + if len(b) < 2 { return Unknown } - if len(ibs) >= 3 && bytes.HasPrefix(ibs, []byte{0xef, 0xbb, 0xbf}) { + if len(b) >= 3 && bytes.HasPrefix(b, []byte{0xef, 0xbb, 0xbf}) { return UTF8 } - if len(ibs) >= 4 { - if bytes.HasPrefix(ibs, []byte{0x00, 0x00, 0xfe, 0xff}) { + if len(b) >= 4 { + if bytes.HasPrefix(b, []byte{0x00, 0x00, 0xfe, 0xff}) { return UTF32BigEndian } - if bytes.HasPrefix(ibs, []byte{0xff, 0xfe, 0x00, 0x00}) { + if bytes.HasPrefix(b, []byte{0xff, 0xfe, 0x00, 0x00}) { return UTF32LittleEndian } } - if bytes.HasPrefix(ibs, []byte{0xfe, 0xff}) { + if bytes.HasPrefix(b, []byte{0xfe, 0xff}) { return UTF16BigEndian } - if bytes.HasPrefix(ibs, []byte{0xff, 0xfe}) { + if bytes.HasPrefix(b, []byte{0xff, 0xfe}) { return UTF16LittleEndian } return Unknown } +// DetectEncoding inspects the initial bytes of a string or byte slice (T) +// and returns the detected text encoding based on the presence of known BOMs (Byte Order Marks). +// If no known BOM is found, it returns Unknown. +// +// Supported encodings: +// - UTF-8 (BOM: 0xef 0xbb 0xbf) +// - UTF-16 Big Endian (BOM: 0xfe 0xff) +// - UTF-16 Little Endian (BOM: 0xff 0xfe) +// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) +// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) +func DetectEncoding[T ~string | ~[]byte](input T) Encoding { + return detectEncodingBytes([]byte(input)) +} + // AnyOf reports whether the Encoding value equals any of the given Encoding values. // It returns true if a match is found, otherwise false. func (e Encoding) AnyOf(es ...Encoding) bool { @@ -101,7 +104,7 @@ func (e Encoding) AnyOf(es ...Encoding) bool { return false } -// Strings returns human-readable name of encoding. +// String returns the human-readable name of the encoding. func (e Encoding) String() string { switch e { case UTF8: @@ -151,11 +154,11 @@ func (e Encoding) Bytes() []byte { } } -// Trim removes the BOM prefix from the input `s` based on the encoding `enc`. +// Trim removes the BOM prefix from the input. // Supports string or []byte inputs and returns the same type without the BOM. -func Trim[T string | []byte](input T) (T, Encoding) { +func Trim[T ~string | ~[]byte](input T) (T, Encoding) { b := []byte(input) - enc := DetectEncoding(b) + enc := detectEncodingBytes(b) if enc == Unknown { return input, enc @@ -167,14 +170,14 @@ func Trim[T string | []byte](input T) (T, Encoding) { // Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding // to the beginning of a string or byte slice. // If the provided encoding is Unknown, the input is returned unmodified. -func Prepend[T string | []byte](input T, enc Encoding) T { +func Prepend[T ~string | ~[]byte](input T, enc Encoding) T { if enc == Unknown { return input } b := []byte(input) - if DetectEncoding(b) != Unknown { + if detectEncodingBytes(b) != Unknown { return input } @@ -183,6 +186,8 @@ func Prepend[T string | []byte](input T, enc Encoding) T { // Reader implements automatic BOM (Unicode Byte Order Mark) checking and // removing as necessary for an io.Reader object. +// +// Reader is not safe for concurrent use. type Reader struct { rd *bufio.Reader once sync.Once @@ -191,6 +196,7 @@ type Reader struct { } // NewReader wraps an incoming reader. +// Passing a nil reader will cause a panic on the first Read call. func NewReader(rd io.Reader) *Reader { return &Reader{ rd: bufio.NewReader(rd), @@ -200,10 +206,8 @@ func NewReader(rd io.Reader) *Reader { } // Read implements the io.Reader interface. -// On the first read call, it reads from the underlying Reader, detects and removes any Byte Order Mark (BOM). -// Subsequent calls delegate directly to the underlying Reader without BOM handling. -// Read is only safe for concurrent use during the first call due to sync.Once; after that, thread-safety -// depends on the underlying Reader. It is best to assume unsafe concurrent use. +// On the first call, it detects and removes any Byte Order Mark (BOM). +// Subsequent calls delegate directly to the underlying Reader. func (r *Reader) Read(buf []byte) (int, error) { const maxBOMLen = 4 diff --git a/utfbom_test.go b/utfbom_test.go index 59e0f35..1d0e669 100644 --- a/utfbom_test.go +++ b/utfbom_test.go @@ -515,3 +515,71 @@ func TestPrepend(t *testing.T) { } }) } + +type CustomString string + +type CustomBytes []byte + +func TestDetectEncoding_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("\ufeffhello") + enc := utfbom.DetectEncoding(input) + be.Equal(t, enc, utfbom.UTF8) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}) + enc := utfbom.DetectEncoding(input) + be.Equal(t, enc, utfbom.UTF16BigEndian) + }) +} + +func TestTrim_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("\ufeffhello") + out, enc := utfbom.Trim(input) + be.Equal(t, enc, utfbom.UTF8) + be.Equal(t, out, CustomString("hello")) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}) + out, enc := utfbom.Trim(input) + be.Equal(t, enc, utfbom.UTF16BigEndian) + be.Equal(t, out, CustomBytes([]byte{'h', 'i'})) + }) +} + +func TestPrepend_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("hello") + out := utfbom.Prepend(input, utfbom.UTF8) + be.Equal(t, out, CustomString("\ufeffhello")) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{'h', 'i'}) + out := utfbom.Prepend(input, utfbom.UTF16BigEndian) + be.Equal(t, out, CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})) + }) +} + +func TestNewReader_NilPanics(t *testing.T) { + t.Parallel() + + rd := utfbom.NewReader(nil) + + defer func() { + r := recover() + be.True(t, r != nil) + }() + + buf := make([]byte, 10) + _, _ = rd.Read(buf) +} From a0170f75d0b40918a6d6d2866406196831e1117b Mon Sep 17 00:00:00 2001 From: Ilya Tribusean Date: Wed, 3 Dec 2025 23:58:38 +0200 Subject: [PATCH 4/4] do not like private method for bytes detection, revert to public one --- utfbom.go | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/utfbom.go b/utfbom.go index e8ff577..9ad9b42 100644 --- a/utfbom.go +++ b/utfbom.go @@ -47,8 +47,19 @@ const ( UTF32LittleEndian ) -// detectEncodingBytes is the internal implementation that works directly on []byte. -func detectEncodingBytes(b []byte) Encoding { +// DetectEncoding inspects the initial bytes of a string or byte slice (T) +// and returns the detected text encoding based on the presence of known BOMs (Byte Order Marks). +// If no known BOM is found, it returns Unknown. +// +// Supported encodings: +// - UTF-8 (BOM: 0xef 0xbb 0xbf) +// - UTF-16 Big Endian (BOM: 0xfe 0xff) +// - UTF-16 Little Endian (BOM: 0xff 0xfe) +// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) +// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) +func DetectEncoding[T ~string | ~[]byte](input T) Encoding { + b := []byte(input) + if len(b) < 2 { return Unknown } @@ -78,20 +89,6 @@ func detectEncodingBytes(b []byte) Encoding { return Unknown } -// DetectEncoding inspects the initial bytes of a string or byte slice (T) -// and returns the detected text encoding based on the presence of known BOMs (Byte Order Marks). -// If no known BOM is found, it returns Unknown. -// -// Supported encodings: -// - UTF-8 (BOM: 0xef 0xbb 0xbf) -// - UTF-16 Big Endian (BOM: 0xfe 0xff) -// - UTF-16 Little Endian (BOM: 0xff 0xfe) -// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) -// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) -func DetectEncoding[T ~string | ~[]byte](input T) Encoding { - return detectEncodingBytes([]byte(input)) -} - // AnyOf reports whether the Encoding value equals any of the given Encoding values. // It returns true if a match is found, otherwise false. func (e Encoding) AnyOf(es ...Encoding) bool { @@ -158,7 +155,7 @@ func (e Encoding) Bytes() []byte { // Supports string or []byte inputs and returns the same type without the BOM. func Trim[T ~string | ~[]byte](input T) (T, Encoding) { b := []byte(input) - enc := detectEncodingBytes(b) + enc := DetectEncoding(b) if enc == Unknown { return input, enc @@ -177,7 +174,7 @@ func Prepend[T ~string | ~[]byte](input T, enc Encoding) T { b := []byte(input) - if detectEncodingBytes(b) != Unknown { + if DetectEncoding(b) != Unknown { return input }