Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 24 additions & 30 deletions utfbom.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,7 @@ import (
"sync"
)

var (
_ io.Reader = (*Reader)(nil)
utf8BOM = []byte{0xef, 0xbb, 0xbf}
utf16BEBOM = []byte{0xfe, 0xff}
utf16LEBOM = []byte{0xff, 0xfe}
utf32BEBOM = []byte{0x00, 0x00, 0xfe, 0xff}
utf32LEBOM = []byte{0xff, 0xfe, 0x00, 0x00}
)
var _ io.Reader = (*Reader)(nil)

// ErrRead helps to trace error origin.
var ErrRead = errors.New("utfbom: I/O error during BOM processing")
Expand Down Expand Up @@ -64,32 +57,32 @@ const (
// - UTF-16 Little Endian (BOM: 0xff 0xfe)
// - UTF-32 Big Endian (BOM: 0x00 0x00 0xfe 0xff)
// - UTF-32 Little Endian (BOM: 0xff 0xfe 0x00 0x00)
func DetectEncoding[T string | []byte](input T) Encoding {
ibs := []byte(input)
func DetectEncoding[T ~string | ~[]byte](input T) Encoding {
b := []byte(input)

if len(ibs) < 2 {
if len(b) < 2 {
return Unknown
}

if len(ibs) >= 3 && bytes.HasPrefix(ibs, utf8BOM) {
if len(b) >= 3 && bytes.HasPrefix(b, []byte{0xef, 0xbb, 0xbf}) {
return UTF8
}

if len(ibs) >= 4 {
if bytes.HasPrefix(ibs, utf32BEBOM) {
if len(b) >= 4 {
if bytes.HasPrefix(b, []byte{0x00, 0x00, 0xfe, 0xff}) {
return UTF32BigEndian
}

if bytes.HasPrefix(ibs, utf32LEBOM) {
if bytes.HasPrefix(b, []byte{0xff, 0xfe, 0x00, 0x00}) {
return UTF32LittleEndian
}
}

if bytes.HasPrefix(ibs, utf16BEBOM) {
if bytes.HasPrefix(b, []byte{0xfe, 0xff}) {
return UTF16BigEndian
}

if bytes.HasPrefix(ibs, utf16LEBOM) {
if bytes.HasPrefix(b, []byte{0xff, 0xfe}) {
return UTF16LittleEndian
}

Expand All @@ -108,7 +101,7 @@ func (e Encoding) AnyOf(es ...Encoding) bool {
return false
}

// Strings returns human-readable name of encoding.
// String returns the human-readable name of the encoding.
func (e Encoding) String() string {
switch e {
case UTF8:
Expand Down Expand Up @@ -146,21 +139,21 @@ func (e Encoding) Bytes() []byte {
default:
return nil
case UTF8:
return utf8BOM
return []byte{0xef, 0xbb, 0xbf}
case UTF16BigEndian:
return utf16BEBOM
return []byte{0xfe, 0xff}
case UTF16LittleEndian:
return utf16LEBOM
return []byte{0xff, 0xfe}
case UTF32BigEndian:
return utf32BEBOM
return []byte{0x00, 0x00, 0xfe, 0xff}
case UTF32LittleEndian:
return utf32LEBOM
return []byte{0xff, 0xfe, 0x00, 0x00}
}
}

// Trim removes the BOM prefix from the input `s` based on the encoding `enc`.
// Trim removes the BOM prefix from the input.
// Supports string or []byte inputs and returns the same type without the BOM.
func Trim[T string | []byte](input T) (T, Encoding) {
func Trim[T ~string | ~[]byte](input T) (T, Encoding) {
b := []byte(input)
enc := DetectEncoding(b)

Expand All @@ -174,7 +167,7 @@ func Trim[T string | []byte](input T) (T, Encoding) {
// Prepend adds the corresponding Byte Order Mark (BOM) for a given encoding
// to the beginning of a string or byte slice.
// If the provided encoding is Unknown, the input is returned unmodified.
func Prepend[T string | []byte](input T, enc Encoding) T {
func Prepend[T ~string | ~[]byte](input T, enc Encoding) T {
if enc == Unknown {
return input
}
Expand All @@ -190,6 +183,8 @@ func Prepend[T string | []byte](input T, enc Encoding) T {

// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
//
// Reader is not safe for concurrent use.
type Reader struct {
rd *bufio.Reader
once sync.Once
Expand All @@ -198,6 +193,7 @@ type Reader struct {
}

// NewReader wraps an incoming reader.
// Passing a nil reader will cause a panic on the first Read call.
func NewReader(rd io.Reader) *Reader {
return &Reader{
rd: bufio.NewReader(rd),
Expand All @@ -207,10 +203,8 @@ func NewReader(rd io.Reader) *Reader {
}

// Read implements the io.Reader interface.
// On the first read call, it reads from the underlying Reader, detects and removes any Byte Order Mark (BOM).
// Subsequent calls delegate directly to the underlying Reader without BOM handling.
// Read is only safe for concurrent use during the first call due to sync.Once; after that, thread-safety
// depends on the underlying Reader. It is best to assume unsafe concurrent use.
// On the first call, it detects and removes any Byte Order Mark (BOM).
// Subsequent calls delegate directly to the underlying Reader.
func (r *Reader) Read(buf []byte) (int, error) {
const maxBOMLen = 4

Expand Down
96 changes: 96 additions & 0 deletions utfbom_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,34 @@ func TestEncoding_Bytes(t *testing.T) {
}
}

// TestBytes_NoAliasing checks that bytes returned by Bytes() are immutable.
func TestBytes_NoAliasing(t *testing.T) {
t.Parallel()

encodings := []utfbom.Encoding{
utfbom.UTF8,
utfbom.UTF16BigEndian,
utfbom.UTF16LittleEndian,
utfbom.UTF32BigEndian,
utfbom.UTF32LittleEndian,
}

for _, enc := range encodings {
t.Run(enc.String(), func(t *testing.T) {
t.Parallel()

original := enc.Bytes()
originalCopy := make([]byte, len(original))
copy(originalCopy, original)

original[0] = 0x00

fresh := enc.Bytes()
be.Equal(t, fresh, originalCopy)
})
}
}

func TestPrepend(t *testing.T) {
t.Parallel()

Expand Down Expand Up @@ -487,3 +515,71 @@ func TestPrepend(t *testing.T) {
}
})
}

type CustomString string

type CustomBytes []byte

func TestDetectEncoding_TypeAliases(t *testing.T) {
t.Parallel()

t.Run("custom_string", func(t *testing.T) {
input := CustomString("\ufeffhello")
enc := utfbom.DetectEncoding(input)
be.Equal(t, enc, utfbom.UTF8)
})

t.Run("custom_bytes", func(t *testing.T) {
input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})
enc := utfbom.DetectEncoding(input)
be.Equal(t, enc, utfbom.UTF16BigEndian)
})
}

func TestTrim_TypeAliases(t *testing.T) {
t.Parallel()

t.Run("custom_string", func(t *testing.T) {
input := CustomString("\ufeffhello")
out, enc := utfbom.Trim(input)
be.Equal(t, enc, utfbom.UTF8)
be.Equal(t, out, CustomString("hello"))
})

t.Run("custom_bytes", func(t *testing.T) {
input := CustomBytes([]byte{0xfe, 0xff, 'h', 'i'})
out, enc := utfbom.Trim(input)
be.Equal(t, enc, utfbom.UTF16BigEndian)
be.Equal(t, out, CustomBytes([]byte{'h', 'i'}))
})
}

func TestPrepend_TypeAliases(t *testing.T) {
t.Parallel()

t.Run("custom_string", func(t *testing.T) {
input := CustomString("hello")
out := utfbom.Prepend(input, utfbom.UTF8)
be.Equal(t, out, CustomString("\ufeffhello"))
})

t.Run("custom_bytes", func(t *testing.T) {
input := CustomBytes([]byte{'h', 'i'})
out := utfbom.Prepend(input, utfbom.UTF16BigEndian)
be.Equal(t, out, CustomBytes([]byte{0xfe, 0xff, 'h', 'i'}))
})
}

func TestNewReader_NilPanics(t *testing.T) {
t.Parallel()

rd := utfbom.NewReader(nil)

defer func() {
r := recover()
be.True(t, r != nil)
}()

buf := make([]byte, 10)
_, _ = rd.Read(buf)
}