diff --git a/utfbom.go b/utfbom.go index 1562960..fadb4eb 100644 --- a/utfbom.go +++ b/utfbom.go @@ -19,6 +19,8 @@ var _ io.Reader = (*Reader)(nil) // ErrRead helps to trace error origin. var ErrRead = errors.New("utfbom: I/O error during BOM processing") +const maxBOMLen = 4 + // Encoding is a character encoding standard. type Encoding int @@ -59,6 +61,10 @@ const ( // - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff) // - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00) func DetectEncoding[T ~string | ~[]byte](input T) Encoding { + if len(input) > maxBOMLen { + input = input[:maxBOMLen] + } + b := []byte(input) if len(b) < 2 { @@ -149,31 +155,28 @@ func (e Encoding) Bytes() []byte { // Trim removes the BOM prefix from the input. // Supports string or []byte inputs and returns the same type without the BOM. func Trim[T ~string | ~[]byte](input T) (T, Encoding) { - b := []byte(input) - enc := DetectEncoding(b) + enc := DetectEncoding(input) if enc == Unknown { return input, enc } - return T(b[enc.Len():]), enc + return input[enc.Len():], enc } // Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding // to the beginning of a string or byte slice. -// If the provided encoding is Unknown, the input is returned unmodified. +// The input is returned unmodified if enc is Unknown or if the input already has any BOM. func Prepend[T ~string | ~[]byte](input T, enc Encoding) T { if enc == Unknown { return input } - b := []byte(input) - - if DetectEncoding(b) != Unknown { + if DetectEncoding(input) != Unknown { return input } - return T(append(enc.Bytes(), b...)) + return T(append(enc.Bytes(), []byte(input)...)) } // Reader implements automatic BOM (Unicode Byte Order Mark) checking and @@ -201,8 +204,6 @@ func NewReader(rd io.Reader) *Reader { // On the first call, it detects and removes any Byte Order Mark (BOM). // Subsequent calls delegate directly to the underlying Reader. func (r *Reader) Read(buf []byte) (int, error) { - const maxBOMLen = 4 - if len(buf) == 0 { return 0, nil } diff --git a/utfbom_test.go b/utfbom_test.go index a89e3e6..15b135a 100644 --- a/utfbom_test.go +++ b/utfbom_test.go @@ -4,6 +4,7 @@ import ( "bytes" "encoding/csv" "encoding/hex" + "errors" "fmt" "io" "strings" @@ -218,41 +219,23 @@ func TestEncoding_Len(t *testing.T) { t.Parallel() testCases := []struct { + name string enc utfbom.Encoding expected int }{ - { - enc: utfbom.Unknown, - expected: 0, - }, - { - enc: utfbom.UTF8, - expected: 3, - }, - { - enc: utfbom.UTF16BigEndian, - expected: 2, - }, - { - enc: utfbom.UTF16LittleEndian, - expected: 2, - }, - { - enc: utfbom.UTF32BigEndian, - expected: 4, - }, - { - enc: utfbom.UTF32LittleEndian, - expected: 4, - }, - { - enc: 999, - expected: 0, - }, + {"Unknown", utfbom.Unknown, 0}, + {"UTF8", utfbom.UTF8, 3}, + {"UTF16BigEndian", utfbom.UTF16BigEndian, 2}, + {"UTF16LittleEndian", utfbom.UTF16LittleEndian, 2}, + {"UTF32BigEndian", utfbom.UTF32BigEndian, 4}, + {"UTF32LittleEndian", utfbom.UTF32LittleEndian, 4}, + {"InvalidEncoding", 999, 0}, } for _, tc := range testCases { - be.Equal(t, tc.enc.Len(), tc.expected) + t.Run(tc.name, func(t *testing.T) { + be.Equal(t, tc.enc.Len(), tc.expected) + }) } } @@ -313,16 +296,6 @@ func TestReader_StringWithoutBOM(t *testing.T) { be.Err(t, iotest.TestReader(rd, []byte(nobomstring)), nil) } -func TestReader_UsualReader(t *testing.T) { - t.Parallel() - - bomPrefixedStringReader := strings.NewReader(teststring) - - rd := utfbom.NewReader(bomPrefixedStringReader) - - be.Err(t, iotest.TestReader(rd, []byte(teststring[3:])), nil) -} - func TestReader_OneByteReader(t *testing.T) { t.Parallel() @@ -570,6 +543,19 @@ func TestPrepend_TypeAliases(t *testing.T) { }) } +// TestReader_UnderlyingReaderError verifies that when the underlying reader +// returns a non-EOF error during BOM detection, it is wrapped with ErrRead. +func TestReader_UnderlyingReaderError(t *testing.T) { + t.Parallel() + + rd := utfbom.NewReader(iotest.ErrReader(errors.New("disk failure"))) + + buf := make([]byte, 10) + n, err := rd.Read(buf) + be.Equal(t, 0, n) + be.True(t, errors.Is(err, utfbom.ErrRead)) +} + func TestNewReader_NilPanics(t *testing.T) { t.Parallel()