From e9e92526679b486f31fc4bd73433a331a922b0e9 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Tue, 27 Jan 2026 22:12:49 +0400 Subject: [PATCH 1/3] lib: use utf8 fast path for streaming TextDecoder --- lib/internal/encoding.js | 59 +++++++++++++-- lib/internal/encoding/util.js | 72 +++++++++++++++++++ test/parallel/test-bootstrap-modules.js | 1 + ...test-whatwg-encoding-custom-textdecoder.js | 53 ++++---------- 4 files changed, 138 insertions(+), 47 deletions(-) create mode 100644 lib/internal/encoding/util.js diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index 5523c70a157225..7d4747abc23bb9 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -28,11 +28,13 @@ const kHandle = Symbol('handle'); const kFlags = Symbol('flags'); const kEncoding = Symbol('encoding'); const kDecoder = Symbol('decoder'); +const kChunk = Symbol('chunk'); const kFatal = Symbol('kFatal'); const kUTF8FastPath = Symbol('kUTF8FastPath'); const kIgnoreBOM = Symbol('kIgnoreBOM'); const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte'); +const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util'); const { getConstructorOf, @@ -447,9 +449,11 @@ class TextDecoder { this[kUTF8FastPath] = false; this[kHandle] = undefined; this[kSingleByte] = undefined; // Does not care about streaming or BOM + this[kChunk] = null; // A copy of previous streaming tail or null if (enc === 'utf-8') { this[kUTF8FastPath] = true; + this[kBOMSeen] = false; } else if (isSinglebyteEncoding(enc)) { this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]); } else { @@ -458,7 +462,6 @@ class TextDecoder { } #prepareConverter() { - if (this[kHandle] !== undefined) return; if (hasIntl) { let icuEncoding = this[kEncoding]; if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. 
GBK's decoder is gb18030's decoder @@ -466,7 +469,7 @@ class TextDecoder { if (handle === undefined) throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]); this[kHandle] = handle; - } else if (this[kEncoding] === 'utf-8' || this[kEncoding] === 'utf-16le') { + } else if (this[kEncoding] === 'utf-16le') { if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option'); this[kHandle] = new (lazyStringDecoder())(this[kEncoding]); this[kBOMSeen] = false; @@ -483,11 +486,55 @@ class TextDecoder { const stream = options?.stream; if (this[kUTF8FastPath]) { - if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]); - this[kUTF8FastPath] = false; - } + const chunk = this[kChunk]; + const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen]; + if (!stream) { + this[kBOMSeen] = false; + if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]); + } + + let u = parseInput(input); + if (u.length === 0 && stream) return ''; // no state change + let prefix; + if (chunk) { + const merged = mergePrefixUtf8(u, this[kChunk]); + if (u.length < 3) { + u = merged; // Might be unfinished, but fully consumed old u + } else { + prefix = merged; // Stops at complete chunk + const add = prefix.length - this[kChunk].length; + if (add > 0) u = u.subarray(add); + } + + this[kChunk] = null; + } - this.#prepareConverter(); + if (stream) { + const trail = unfinishedBytesUtf8(u, u.length); + if (trail > 0) { + this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy + if (!prefix && trail === u.length) return ''; // No further state change + u = u.subarray(0, -trail); + } + } + + try { + const res = (prefix ? 
decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') + + decodeUTF8(u, ignoreBom || prefix, this[kFatal]); + + // "BOM seen" is set on the current decode call only if it did not error, + // in "serialize I/O queue" after decoding + // We don't get here if we had no complete data to process, + // and we don't want BOM processing after that if streaming + if (stream) this[kBOMSeen] = true; + + return res; + } catch (e) { + this[kChunk] = null; // Reset unfinished chunk on errors + // The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode + throw e; + } + } if (hasIntl) { const flags = stream ? 0 : CONVERTER_FLAGS_FLUSH; diff --git a/lib/internal/encoding/util.js b/lib/internal/encoding/util.js new file mode 100644 index 00000000000000..107a0f41b5d811 --- /dev/null +++ b/lib/internal/encoding/util.js @@ -0,0 +1,72 @@ +// From https://npmjs.com/package/@exodus/bytes +// Copyright Exodus Movement. Licensed under MIT License. + +'use strict'; + +const { + Uint8Array, +} = primordials; + + +/** + * Get a number of last bytes in an Uint8Array `data` ending at `len` that don't + * form a codepoint yet, but can be a part of a single codepoint on more data. + * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {number} len Position to look behind from + * @returns {number} Number of unfinished potentially valid UTF-8 bytes ending at position `len` + */ +function unfinishedBytesUtf8(data, len) { + // 0-3 + let pos = 0; + while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes + if (pos === len) return 0; // no space for lead + const lead = data[len - pos - 1]; + if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead + if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 
2-byte must return here + if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing + const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80; + const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf; + const next = data[len - pos]; + return next >= lower && next <= upper ? pos + 1 : 0; +} + +/** + * Merge prefix `chunk` with `data` and return new combined prefix. + * For data.length < 3, fully consumes data and can return unfinished data, + * otherwise returns a prefix with no unfinished bytes + * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {Uint8Array} chunk Prefix to prepend before `data` + * @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data` + * so that the result has no unfinished UTF-8 codepoints. If data.length < 3: concat(chunk, data). + */ +function mergePrefixUtf8(data, chunk) { + if (data.length === 0) return chunk; + if (data.length < 3) { + // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence + const res = new Uint8Array(data.length + chunk.length); + res.set(chunk); + res.set(data, chunk.length); + return res; + } + + // Slice off a small portion of data into prefix chunk so we can decode them separately without extending array size + const temp = new Uint8Array(chunk.length + 3); // We have 1-3 bytes and need 1-3 more bytes + temp.set(chunk); + temp.set(data.subarray(0, 3), chunk.length); + + // Stop at the first offset where unfinished bytes reaches 0 or fits into data + // If that doesn't happen (data too short), just concat chunk and data completely (above) + for (let i = 1; i <= 3; i++) { + const unfinished = unfinishedBytesUtf8(temp, chunk.length + i); // 0-3 + if (unfinished <= i) { + // Always reachable at 3, but we still need 'unfinished' value for it + const add = i - unfinished; // 0-3 + return add > 0 ? 
temp.subarray(0, chunk.length + add) : chunk; + } + } + + // Unreachable + return null; +} + +module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 }; diff --git a/test/parallel/test-bootstrap-modules.js b/test/parallel/test-bootstrap-modules.js index d69a299625d9f2..05d7830a2ab1d2 100644 --- a/test/parallel/test-bootstrap-modules.js +++ b/test/parallel/test-bootstrap-modules.js @@ -89,6 +89,7 @@ expected.beforePreExec = new Set([ 'Internal Binding fs', 'NativeModule internal/encoding', 'NativeModule internal/encoding/single-byte', + 'NativeModule internal/encoding/util', 'NativeModule internal/blob', 'NativeModule internal/fs/utils', 'NativeModule fs', diff --git a/test/parallel/test-whatwg-encoding-custom-textdecoder.js b/test/parallel/test-whatwg-encoding-custom-textdecoder.js index eabe54b36d7674..9734825b6b27a5 100644 --- a/test/parallel/test-whatwg-encoding-custom-textdecoder.js +++ b/test/parallel/test-whatwg-encoding-custom-textdecoder.js @@ -80,20 +80,8 @@ assert(TextDecoder); ['unicode-1-1-utf-8', 'utf8', 'utf-8'].forEach((i) => { const dec = new TextDecoder(i, { fatal: true }); - if (common.hasIntl) { - dec.decode(buf.slice(0, 8), { stream: true }); - dec.decode(buf.slice(8)); - } else { - assert.throws( - () => { - dec.decode(buf.slice(0, 8), { stream: true }); - }, - { - code: 'ERR_NO_ICU', - name: 'TypeError', - message: '"fatal" option is not supported on Node.js compiled without ICU' - }); - } + dec.decode(buf.slice(0, 8), { stream: true }); + dec.decode(buf.slice(8)); }); // Test TextDecoder, label undefined, options null @@ -122,33 +110,16 @@ if (common.hasIntl) { // Test TextDecoder inspect with hidden fields { const dec = new TextDecoder('utf-8', { ignoreBOM: true }); - if (common.hasIntl) { - assert.strictEqual( - util.inspect(dec, { showHidden: true }), - 'TextDecoder {\n' + - ' encoding: \'utf-8\',\n' + - ' fatal: false,\n' + - ' ignoreBOM: true,\n' + - ' Symbol(flags): 4,\n' + - ' Symbol(handle): undefined\n' + - '}' - ); - } else { - 
dec.decode(Uint8Array.of(0), { stream: true }); - assert.strictEqual( - util.inspect(dec, { showHidden: true }), - 'TextDecoder {\n' + - " encoding: 'utf-8',\n" + - ' fatal: false,\n' + - ' ignoreBOM: true,\n' + - ' Symbol(flags): 4,\n' + - ' Symbol(handle): StringDecoder {\n' + - " encoding: 'utf8',\n" + - ' Symbol(kNativeDecoder): \n' + - ' }\n' + - '}' - ); - } + assert.strictEqual( + util.inspect(dec, { showHidden: true }), + 'TextDecoder {\n' + + ' encoding: \'utf-8\',\n' + + ' fatal: false,\n' + + ' ignoreBOM: true,\n' + + ' Symbol(flags): 4,\n' + + ' Symbol(handle): undefined\n' + + '}' + ); } From 155995438c2a1ded915050365d88cc93d06d3f07 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Wed, 28 Jan 2026 03:08:19 +0400 Subject: [PATCH 2/3] benchmark: add streaming TextDecoder benchmark --- benchmark/util/text-decoder-stream.js | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 benchmark/util/text-decoder-stream.js diff --git a/benchmark/util/text-decoder-stream.js b/benchmark/util/text-decoder-stream.js new file mode 100644 index 00000000000000..16293b5b1375de --- /dev/null +++ b/benchmark/util/text-decoder-stream.js @@ -0,0 +1,55 @@ +'use strict'; + +const common = require('../common.js'); + +const bench = common.createBenchmark(main, { + encoding: ['utf-8', 'utf-16le'], + ignoreBOM: [0, 1], + fatal: [0, 1], + unicode: [0, 1], + len: [256, 1024 * 16, 1024 * 128], + chunks: [10], + n: [1e3], + type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'], +}); + +const UNICODE_ALPHA = 'Blåbærsyltetøy'; +const ASCII_ALPHA = 'Blueberry jam'; + +function main({ encoding, len, unicode, chunks, n, ignoreBOM, type, fatal }) { + const decoder = new TextDecoder(encoding, { ignoreBOM, fatal }); + let buf; + + const fill = Buffer.from(unicode ? 
UNICODE_ALPHA : ASCII_ALPHA, encoding); + + switch (type) { + case 'SharedArrayBuffer': { + buf = new SharedArrayBuffer(len); + Buffer.from(buf).fill(fill); + break; + } + case 'ArrayBuffer': { + buf = new ArrayBuffer(len); + Buffer.from(buf).fill(fill); + break; + } + case 'Buffer': { + buf = Buffer.alloc(len, fill); + break; + } + } + + const chunk = Math.ceil(len / chunks); + const max = len - chunk; + bench.start(); + for (let i = 0; i < n; i++) { + let pos = 0; + while (pos < max) { + decoder.decode(buf.slice(pos, pos + chunk), { stream: true }); + pos += chunk; + } + + decoder.decode(buf.slice(pos)); + } + bench.end(n); +} From 398a94edb09fa1cc73b19a7e03c52f4e887735ff Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Wed, 28 Jan 2026 10:12:20 +0400 Subject: [PATCH 3/3] lib: add utf16 fast path for TextDecoder --- lib/internal/encoding.js | 80 +++++++++---------- lib/internal/encoding/util.js | 57 ++++++++----- ...test-whatwg-encoding-custom-textdecoder.js | 2 +- 3 files changed, 74 insertions(+), 65 deletions(-) diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index 7d4747abc23bb9..5f1655426d5bd5 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -20,8 +20,8 @@ const { FastBuffer } = require('internal/buffer'); const { ERR_ENCODING_NOT_SUPPORTED, ERR_INVALID_ARG_TYPE, + ERR_ENCODING_INVALID_ENCODED_DATA, ERR_INVALID_THIS, - ERR_NO_ICU, } = require('internal/errors').codes; const kSingleByte = Symbol('single-byte'); const kHandle = Symbol('handle'); @@ -30,11 +30,11 @@ const kEncoding = Symbol('encoding'); const kDecoder = Symbol('decoder'); const kChunk = Symbol('chunk'); const kFatal = Symbol('kFatal'); -const kUTF8FastPath = Symbol('kUTF8FastPath'); +const kUnicode = Symbol('kUnicode'); const kIgnoreBOM = Symbol('kIgnoreBOM'); const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte'); -const { unfinishedBytesUtf8, mergePrefixUtf8 } = require('internal/encoding/util'); +const { 
unfinishedBytes, mergePrefix } = require('internal/encoding/util'); const { getConstructorOf, @@ -419,11 +419,24 @@ if (hasIntl) { const kBOMSeen = Symbol('BOM seen'); -let StringDecoder; -function lazyStringDecoder() { - if (StringDecoder === undefined) - ({ StringDecoder } = require('string_decoder')); - return StringDecoder; +function fixupDecodedString(res, ignoreBom, fatal, encoding) { + if (res.length === 0) return ''; + if (!ignoreBom && res[0] === '\ufeff') res = StringPrototypeSlice(res, 1); + if (!fatal) return res.toWellFormed(); + if (!res.isWellFormed()) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined); + return res; +} + +function decodeUTF16le(input, ignoreBom, fatal) { + return fixupDecodedString(parseInput(input).ucs2Slice(), ignoreBom, fatal, 'utf-16le'); +} + +function decodeUTF16be(input, ignoreBom, fatal) { + const be = parseInput(input); + const le = new FastBuffer(be.length); + le.set(be); + le.swap16(); + return fixupDecodedString(le.ucs2Slice(), ignoreBom, fatal, 'utf-16be'); } class TextDecoder { @@ -446,33 +459,29 @@ class TextDecoder { this[kEncoding] = enc; this[kIgnoreBOM] = Boolean(options?.ignoreBOM); this[kFatal] = Boolean(options?.fatal); - this[kUTF8FastPath] = false; + this[kUnicode] = undefined; this[kHandle] = undefined; this[kSingleByte] = undefined; // Does not care about streaming or BOM this[kChunk] = null; // A copy of previous streaming tail or null if (enc === 'utf-8') { - this[kUTF8FastPath] = true; + this[kUnicode] = decodeUTF8; + this[kBOMSeen] = false; + } else if (enc === 'utf-16le') { + this[kUnicode] = decodeUTF16le; + this[kBOMSeen] = false; + } else if (enc === 'utf-16be') { + this[kUnicode] = decodeUTF16be; this[kBOMSeen] = false; } else if (isSinglebyteEncoding(enc)) { this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]); - } else { - this.#prepareConverter(); // Need to throw early if we don't support the encoding - } - } - - #prepareConverter() { - if (hasIntl) { + } else if (hasIntl) { 
let icuEncoding = this[kEncoding]; if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. GBK's decoder is gb18030's decoder const handle = icuGetConverter(icuEncoding, this[kFlags]); if (handle === undefined) throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]); this[kHandle] = handle; - } else if (this[kEncoding] === 'utf-16le') { - if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option'); - this[kHandle] = new (lazyStringDecoder())(this[kEncoding]); - this[kBOMSeen] = false; } else { throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]); } @@ -485,19 +494,19 @@ class TextDecoder { if (this[kSingleByte]) return this[kSingleByte](parseInput(input)); const stream = options?.stream; - if (this[kUTF8FastPath]) { + if (this[kUnicode]) { const chunk = this[kChunk]; const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen]; if (!stream) { this[kBOMSeen] = false; - if (!chunk) return decodeUTF8(input, ignoreBom, this[kFatal]); + if (!chunk) return this[kUnicode](input, ignoreBom, this[kFatal]); } let u = parseInput(input); if (u.length === 0 && stream) return ''; // no state change let prefix; if (chunk) { - const merged = mergePrefixUtf8(u, this[kChunk]); + const merged = mergePrefix(u, this[kChunk], this[kEncoding]); if (u.length < 3) { u = merged; // Might be unfinished, but fully consumed old u } else { @@ -510,7 +519,7 @@ class TextDecoder { } if (stream) { - const trail = unfinishedBytesUtf8(u, u.length); + const trail = unfinishedBytes(u, u.length, this[kEncoding]); if (trail > 0) { this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy if (!prefix && trail === u.length) return ''; // No further state change @@ -519,8 +528,8 @@ class TextDecoder { } try { - const res = (prefix ? decodeUTF8(prefix, ignoreBom, this[kFatal]) : '') + - decodeUTF8(u, ignoreBom || prefix, this[kFatal]); + const res = (prefix ? 
this[kUnicode](prefix, ignoreBom, this[kFatal]) : '') + + this[kUnicode](u, ignoreBom || prefix, this[kFatal]); // "BOM seen" is set on the current decode call only if it did not error, // in "serialize I/O queue" after decoding @@ -541,22 +550,7 @@ class TextDecoder { return icuDecode(this[kHandle], input, flags, this[kEncoding]); } - input = parseInput(input); - - let result = stream ? this[kHandle].write(input) : this[kHandle].end(input); - - if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) { - // If the very first result in the stream is a BOM, and we are not - // explicitly told to ignore it, then we discard it. - if (result[0] === '\ufeff') { - result = StringPrototypeSlice(result, 1); - } - this[kBOMSeen] = true; - } - - if (!stream) this[kBOMSeen] = false; - - return result; + // Unreachable } } diff --git a/lib/internal/encoding/util.js b/lib/internal/encoding/util.js index 107a0f41b5d811..80d0cb9fc3028f 100644 --- a/lib/internal/encoding/util.js +++ b/lib/internal/encoding/util.js @@ -7,39 +7,54 @@ const { Uint8Array, } = primordials; - /** * Get a number of last bytes in an Uint8Array `data` ending at `len` that don't * form a codepoint yet, but can be a part of a single codepoint on more data. 
- * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes * @param {number} len Position to look behind from - * @returns {number} Number of unfinished potentially valid UTF-8 bytes ending at position `len` + * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be + * @returns {number} Number (0-3) of unfinished potentially valid UTF bytes ending at position `len` */ -function unfinishedBytesUtf8(data, len) { - // 0-3 - let pos = 0; - while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes - if (pos === len) return 0; // no space for lead - const lead = data[len - pos - 1]; - if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead - if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here - if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, or 3-byte or less and we already have 2 trailing - const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80; - const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 0x9f : 0xbf; - const next = data[len - pos]; - return next >= lower && next <= upper ? pos + 1 : 0; +function unfinishedBytes(data, len, enc) { + switch (enc) { + case 'utf-8': { + // 0-3 + let pos = 0; + while (pos < 2 && pos < len && (data[len - pos - 1] & 0xc0) === 0x80) pos++; // Go back 0-2 trailing bytes + if (pos === len) return 0; // no space for lead + const lead = data[len - pos - 1]; + if (lead < 0xc2 || lead > 0xf4) return 0; // not a lead + if (pos === 0) return 1; // Nothing to recheck, we have only lead, return it. 2-byte must return here + if (lead < 0xe0 || (lead < 0xf0 && pos >= 2)) return 0; // 2-byte, 3-byte or less and we already have 2 trailing + const lower = lead === 0xf0 ? 0x90 : lead === 0xe0 ? 0xa0 : 0x80; + const upper = lead === 0xf4 ? 0x8f : lead === 0xed ? 
0x9f : 0xbf; + const next = data[len - pos]; + return next >= lower && next <= upper ? pos + 1 : 0; + } + + case 'utf-16le': + case 'utf-16be': { + // 0-3 + const uneven = len % 2; // Uneven byte length adds 1 + if (len < 2) return uneven; + const l = len - uneven - 1; + const last = enc === 'utf-16le' ? (data[l] << 8) ^ data[l - 1] : (data[l - 1] << 8) ^ data[l]; + return last >= 0xd8_00 && last < 0xdc_00 ? uneven + 2 : uneven; // lone lead adds 2 + } + } } /** * Merge prefix `chunk` with `data` and return new combined prefix. * For data.length < 3, fully consumes data and can return unfinished data, * otherwise returns a prefix with no unfinished bytes - * @param {Uint8Array} data Uint8Array of potentially UTF-8 bytes + * @param {Uint8Array} data Uint8Array of potentially UTF-8/UTF-16 bytes * @param {Uint8Array} chunk Prefix to prepend before `data` + * @param {string} enc Encoding to use: utf-8, utf-16le, or utf-16be * @returns {Uint8Array} If data.length >= 3: an Uint8Array containing `chunk` and a slice of `data` - * so that the result has no unfinished UTF-8 codepoints. If data.length < 3: concat(chunk, data). + * so that the result has no unfinished codepoints. If data.length < 3: concat(chunk, data). 
*/ -function mergePrefixUtf8(data, chunk) { +function mergePrefix(data, chunk, enc) { if (data.length === 0) return chunk; if (data.length < 3) { // No reason to bruteforce offsets, also it's possible this doesn't yet end the sequence @@ -57,7 +72,7 @@ function mergePrefixUtf8(data, chunk) { // Stop at the first offset where unfinished bytes reaches 0 or fits into data // If that doesn't happen (data too short), just concat chunk and data completely (above) for (let i = 1; i <= 3; i++) { - const unfinished = unfinishedBytesUtf8(temp, chunk.length + i); // 0-3 + const unfinished = unfinishedBytes(temp, chunk.length + i, enc); // 0-3 if (unfinished <= i) { // Always reachable at 3, but we still need 'unfinished' value for it const add = i - unfinished; // 0-3 @@ -69,4 +84,4 @@ function mergePrefixUtf8(data, chunk) { return null; } -module.exports = { unfinishedBytesUtf8, mergePrefixUtf8 }; +module.exports = { unfinishedBytes, mergePrefix }; diff --git a/test/parallel/test-whatwg-encoding-custom-textdecoder.js b/test/parallel/test-whatwg-encoding-custom-textdecoder.js index 9734825b6b27a5..10ef410f5bf77b 100644 --- a/test/parallel/test-whatwg-encoding-custom-textdecoder.js +++ b/test/parallel/test-whatwg-encoding-custom-textdecoder.js @@ -101,7 +101,7 @@ assert(TextDecoder); } // Test TextDecoder, UTF-16be -if (common.hasIntl) { +{ const dec = new TextDecoder('utf-16be'); const res = dec.decode(Buffer.from('test€', 'utf-16le').swap16()); assert.strictEqual(res, 'test€');