diff --git a/utfbom.go b/utfbom.go index c5ab8d6..9ad9b42 100644 --- a/utfbom.go +++ b/utfbom.go @@ -13,14 +13,7 @@ import ( "sync" ) -var ( - _ io.Reader = (*Reader)(nil) - utf8BOM = []byte{0xef, 0xbb, 0xbf} - utf16BEBOM = []byte{0xfe, 0xff} - utf16LEBOM = []byte{0xff, 0xfe} - utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff} - utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00} -) +var _ io.Reader = (*Reader)(nil) // ErrRead helps to trace error origin. var ErrRead = errors.New("utfbom: I/O error during BOM processing") @@ -64,32 +57,32 @@ const ( // - UTF-16 Little Endian (BOM: 0xff 0xfe) // - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) // - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) -func DetectEncoding[T string | []byte](input T) Encoding { - ibs := []byte(input) +func DetectEncoding[T ~string | ~[]byte](input T) Encoding { + b := []byte(input) - if len(ibs) < 2 { + if len(b) < 2 { return Unknown } - if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) { + if len(b) >= 3 && bytes.HasPrefix(b, []byte{0xef, 0xbb, 0xbf}) { return UTF8 } - if len(ibs) >= 4 { - if bytes.HasPrefix(ibs, utf32BEBOM) { + if len(b) >= 4 { + if bytes.HasPrefix(b, []byte{0x00, 0x00, 0xfe, 0xff}) { return UTF32BigEndian } - if bytes.HasPrefix(ibs, utf32LEBOM) { + if bytes.HasPrefix(b, []byte{0xff, 0xfe, 0x00, 0x00}) { return UTF32LittleEndian } } - if bytes.HasPrefix(ibs, utf16BEBOM) { + if bytes.HasPrefix(b, []byte{0xfe, 0xff}) { return UTF16BigEndian } - if bytes.HasPrefix(ibs, utf16LEBOM) { + if bytes.HasPrefix(b, []byte{0xff, 0xfe}) { return UTF16LittleEndian } @@ -108,7 +101,7 @@ func (e Encoding) AnyOf(es ...Encoding) bool { return false } -// Strings returns human-readable name of encoding. +// String returns the human-readable name of the encoding. func (e Encoding) String() string { switch e { case UTF8: @@ -146,21 +139,21 @@ func (e Encoding) Bytes() []byte { default: return nil case UTF8: - return utf8BOM + return []byte{0xef, 0xbb, 0xbf} case UTF16BigEndian: - return utf16BEBOM + return []byte{0xfe, 0xff} case UTF16LittleEndian: - return utf16LEBOM + return []byte{0xff, 0xfe} case UTF32BigEndian: - return utf32BEBOM + return []byte{0x00, 0x00, 0xfe, 0xff} case UTF32LittleEndian: - return utf32LEBOM + return []byte{0xff, 0xfe, 0x00, 0x00} } } -// Trim removes the BOM prefix from the input `s` based on the encoding `enc`. +// Trim removes the BOM prefix from the input. // Supports string or []byte inputs and returns the same type without the BOM. -func Trim[T string | []byte](input T) (T, Encoding) { +func Trim[T ~string | ~[]byte](input T) (T, Encoding) { b := []byte(input) enc := DetectEncoding(b) @@ -174,7 +167,7 @@ func Trim[T string | []byte](input T) (T, Encoding) { // Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding // to the beginning of a string or byte slice. // If the provided encoding is Unknown, the input is returned unmodified. -func Prepend[T string | []byte](input T, enc Encoding) T { +func Prepend[T ~string | ~[]byte](input T, enc Encoding) T { if enc == Unknown { return input } @@ -190,6 +183,8 @@ func Prepend[T string | []byte](input T, enc Encoding) T { // Reader implements automatic BOM (Unicode Byte Order Mark) checking and // removing as necessary for an io.Reader object. +// +// Reader is not safe for concurrent use. type Reader struct { rd *bufio.Reader once sync.Once @@ -198,6 +193,7 @@ type Reader struct { } // NewReader wraps an incoming reader. +// Passing a nil reader will cause a panic on the first Read call. func NewReader(rd io.Reader) *Reader { return &Reader{ rd: bufio.NewReader(rd), @@ -207,10 +203,8 @@ func NewReader(rd io.Reader) *Reader { } // Read implements the io.Reader interface. -// On the first read call, it reads from the underlying Reader, detects and removes any Byte Order Mark (BOM). -// Subsequent calls delegate directly to the underlying Reader without BOM handling. -// Read is only safe for concurrent use during the first call due to sync.Once; after that, thread-safety -// depends on the underlying Reader. It is best to assume unsafe concurrent use. +// On the first call, it detects and removes any Byte Order Mark (BOM). +// Subsequent calls delegate directly to the underlying Reader. func (r *Reader) Read(buf []byte) (int, error) { const maxBOMLen = 4 diff --git a/utfbom_test.go b/utfbom_test.go index 012a6e8..1d0e669 100644 --- a/utfbom_test.go +++ b/utfbom_test.go @@ -443,6 +443,34 @@ func TestEncoding_Bytes(t *testing.T) { } } +// TestBytes_NoAliasing checks that bytes returned by Bytes() are immutable. +func TestBytes_NoAliasing(t *testing.T) { + t.Parallel() + + encodings := []utfbom.Encoding{ + utfbom.UTF8, + utfbom.UTF16BigEndian, + utfbom.UTF16LittleEndian, + utfbom.UTF32BigEndian, + utfbom.UTF32LittleEndian, + } + + for _, enc := range encodings { + t.Run(enc.String(), func(t *testing.T) { + t.Parallel() + + original := enc.Bytes() + originalCopy := make([]byte, len(original)) + copy(originalCopy, original) + + original[0] = 0x00 + + fresh := enc.Bytes() + be.Equal(t, fresh, originalCopy) + }) + } +} + func TestPrepend(t *testing.T) { t.Parallel() @@ -487,3 +515,71 @@ func TestPrepend(t *testing.T) { } }) } + +type CustomString string + +type CustomBytes []byte + +func TestDetectEncoding_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("\ufeffhello") + enc := utfbom.DetectEncoding(input) + be.Equal(t, enc, utfbom.UTF8) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}) + enc := utfbom.DetectEncoding(input) + be.Equal(t, enc, utfbom.UTF16BigEndian) + }) +} + +func TestTrim_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("\ufeffhello") + out, enc := utfbom.Trim(input) + be.Equal(t, enc, utfbom.UTF8) + be.Equal(t, out, CustomString("hello")) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}) + out, enc := utfbom.Trim(input) + be.Equal(t, enc, utfbom.UTF16BigEndian) + be.Equal(t, out, CustomBytes([]byte{'h', 'i'})) + }) +} + +func TestPrepend_TypeAliases(t *testing.T) { + t.Parallel() + + t.Run("custom_string", func(t *testing.T) { + input := CustomString("hello") + out := utfbom.Prepend(input, utfbom.UTF8) + be.Equal(t, out, CustomString("\ufeffhello")) + }) + + t.Run("custom_bytes", func(t *testing.T) { + input := CustomBytes([]byte{'h', 'i'}) + out := utfbom.Prepend(input, utfbom.UTF16BigEndian) + be.Equal(t, out, CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})) + }) +} + +func TestNewReader_NilPanics(t *testing.T) { + t.Parallel() + + rd := utfbom.NewReader(nil) + + defer func() { + r := recover() + be.True(t, r != nil) + }() + + buf := make([]byte, 10) + _, _ = rd.Read(buf) +}