Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions utfbom.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ var _ io.Reader = (*Reader)(nil)
// ErrRead helps to trace error origin.
var ErrRead = errors.New("utfbom: I/O error during BOM processing")

// maxBOMLen is the byte length of the longest BOM handled here (the UTF-32
// marks are four bytes); DetectEncoding inspects at most this many bytes.
const maxBOMLen = 4

// Encoding is a character encoding standard.
type Encoding int

Expand Down Expand Up @@ -59,6 +61,10 @@ const (
// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff)
// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00)
func DetectEncoding[T ~string | ~[]byte](input T) Encoding {
if len(input) > maxBOMLen {
input = input[:maxBOMLen]
}

b := []byte(input)

if len(b) < 2 {
Expand Down Expand Up @@ -149,31 +155,28 @@ func (e Encoding) Bytes() []byte {
// Trim removes the BOM prefix from the input.
// Supports string or []byte inputs and returns the same type without the BOM.
//
// It returns the input with the detected BOM stripped, together with the
// detected Encoding. When no BOM is found, the input is returned unchanged
// and the encoding is Unknown.
func Trim[T ~string | ~[]byte](input T) (T, Encoding) {
	enc := DetectEncoding(input)

	if enc == Unknown {
		return input, enc
	}

	// Slice the input directly instead of round-tripping through []byte, so
	// the payload of a string input is not copied just to drop the prefix.
	return input[enc.Len():], enc
}

// Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding
// to the beginning of a string or byte slice.
// If the provided encoding is Unknown, the input is returned unmodified.
// The input is returned unmodified if enc is Unknown or if the input already has any BOM.
func Prepend[T ~string | ~[]byte](input T, enc Encoding) T {
if enc == Unknown {
return input
}

b := []byte(input)

if DetectEncoding(b) != Unknown {
if DetectEncoding(input) != Unknown {
return input
}

return T(append(enc.Bytes(), b...))
return T(append(enc.Bytes(), []byte(input)...))
}

// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
Expand Down Expand Up @@ -201,8 +204,6 @@ func NewReader(rd io.Reader) *Reader {
// On the first call, it detects and removes any Byte Order Mark (BOM).
// Subsequent calls delegate directly to the underlying Reader.
func (r *Reader) Read(buf []byte) (int, error) {
const maxBOMLen = 4

if len(buf) == 0 {
return 0, nil
}
Expand Down
64 changes: 25 additions & 39 deletions utfbom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"bytes"
"encoding/csv"
"encoding/hex"
"errors"
"fmt"
"io"
"strings"
Expand Down Expand Up @@ -218,41 +219,23 @@ func TestEncoding_Len(t *testing.T) {
t.Parallel()

testCases := []struct {
name string
enc utfbom.Encoding
expected int
}{
{
enc: utfbom.Unknown,
expected: 0,
},
{
enc: utfbom.UTF8,
expected: 3,
},
{
enc: utfbom.UTF16BigEndian,
expected: 2,
},
{
enc: utfbom.UTF16LittleEndian,
expected: 2,
},
{
enc: utfbom.UTF32BigEndian,
expected: 4,
},
{
enc: utfbom.UTF32LittleEndian,
expected: 4,
},
{
enc: 999,
expected: 0,
},
{"Unknown", utfbom.Unknown, 0},
{"UTF8", utfbom.UTF8, 3},
{"UTF16BigEndian", utfbom.UTF16BigEndian, 2},
{"UTF16LittleEndian", utfbom.UTF16LittleEndian, 2},
{"UTF32BigEndian", utfbom.UTF32BigEndian, 4},
{"UTF32LittleEndian", utfbom.UTF32LittleEndian, 4},
{"InvalidEncoding", 999, 0},
}

for _, tc := range testCases {
be.Equal(t, tc.enc.Len(), tc.expected)
t.Run(tc.name, func(t *testing.T) {
be.Equal(t, tc.enc.Len(), tc.expected)
})
}
}

Expand Down Expand Up @@ -313,16 +296,6 @@ func TestReader_StringWithoutBOM(t *testing.T) {
be.Err(t, iotest.TestReader(rd, []byte(nobomstring)), nil)
}

// TestReader_UsualReader checks that a Reader wrapping a plain strings.Reader
// over the BOM-prefixed test string yields that string minus its first three
// bytes.
func TestReader_UsualReader(t *testing.T) {
	t.Parallel()

	src := strings.NewReader(teststring)
	want := []byte(teststring[3:])

	be.Err(t, iotest.TestReader(utfbom.NewReader(src), want), nil)
}

func TestReader_OneByteReader(t *testing.T) {
t.Parallel()

Expand Down Expand Up @@ -570,6 +543,19 @@ func TestPrepend_TypeAliases(t *testing.T) {
})
}

// TestReader_UnderlyingReaderError verifies that when the underlying reader
// returns a non-EOF error during BOM detection, it is wrapped with ErrRead.
func TestReader_UnderlyingReaderError(t *testing.T) {
	t.Parallel()

	rd := utfbom.NewReader(iotest.ErrReader(errors.New("disk failure")))

	buf := make([]byte, 10)
	n, err := rd.Read(buf)

	// Pass the observed value first, matching the (got, want) order used by
	// the other be.Equal calls in this file so failure output is not inverted.
	be.Equal(t, n, 0)
	be.True(t, errors.Is(err, utfbom.ErrRead))
}

func TestNewReader_NilPanics(t *testing.T) {
t.Parallel()

Expand Down