Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions benchmark/util/text-decoder-stream.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
'use strict';

const common = require('../common.js');

// Benchmark matrix for TextDecoder streaming decode (decode() with
// { stream: true } over a buffer split into chunks, then a final flush).
const bench = common.createBenchmark(main, {
  encoding: ['utf-8', 'utf-16le'],
  ignoreBOM: [0, 1], // Forwarded to the TextDecoder constructor option.
  fatal: [0, 1], // Forwarded to the TextDecoder constructor option.
  unicode: [0, 1], // 1 = multi-byte fill pattern, 0 = pure ASCII fill.
  len: [256, 1024 * 16, 1024 * 128], // Total byte length of the input buffer.
  chunks: [10], // Number of streamed decode() calls per iteration.
  n: [1e3], // Iterations measured per configuration.
  type: ['SharedArrayBuffer', 'ArrayBuffer', 'Buffer'], // Input backing store.
});

// Fill patterns repeated across the input buffer; the first contains
// multi-byte characters, the second is ASCII-only.
const UNICODE_ALPHA = 'Blåbærsyltetøy';
const ASCII_ALPHA = 'Blueberry jam';

/**
 * Benchmark body: builds an input buffer of the requested backing type,
 * fills it with a repeating (ASCII or multi-byte) pattern, then measures
 * `n` rounds of chunked streaming decode plus a final flushing decode.
 * Results of decode() are intentionally discarded.
 */
function main({ encoding, len, unicode, chunks, n, ignoreBOM, type, fatal }) {
  const decoder = new TextDecoder(encoding, { ignoreBOM, fatal });
  const pattern = Buffer.from(unicode ? UNICODE_ALPHA : ASCII_ALPHA, encoding);

  let buf;
  if (type === 'SharedArrayBuffer') {
    buf = new SharedArrayBuffer(len);
    // Fill through a temporary Buffer view over the shared memory.
    Buffer.from(buf).fill(pattern);
  } else if (type === 'ArrayBuffer') {
    buf = new ArrayBuffer(len);
    Buffer.from(buf).fill(pattern);
  } else if (type === 'Buffer') {
    buf = Buffer.alloc(len, pattern);
  }

  const chunkSize = Math.ceil(len / chunks);
  const lastStart = len - chunkSize; // First offset handled by the flush call.

  bench.start();
  for (let iter = 0; iter < n; iter++) {
    let offset = 0;
    // Stream all but the tail, then decode the remainder without
    // { stream: true } so the decoder flushes its internal state.
    while (offset < lastStart) {
      decoder.decode(buf.slice(offset, offset + chunkSize), { stream: true });
      offset += chunkSize;
    }
    decoder.decode(buf.slice(offset));
  }
  bench.end(n);
}
125 changes: 83 additions & 42 deletions lib/internal/encoding.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,21 @@ const { FastBuffer } = require('internal/buffer');
const {
ERR_ENCODING_NOT_SUPPORTED,
ERR_INVALID_ARG_TYPE,
ERR_ENCODING_INVALID_ENCODED_DATA,
ERR_INVALID_THIS,
ERR_NO_ICU,
} = require('internal/errors').codes;
const kSingleByte = Symbol('single-byte');
const kHandle = Symbol('handle');
const kFlags = Symbol('flags');
const kEncoding = Symbol('encoding');
const kDecoder = Symbol('decoder');
const kChunk = Symbol('chunk');
const kFatal = Symbol('kFatal');
const kUTF8FastPath = Symbol('kUTF8FastPath');
const kUnicode = Symbol('kUnicode');
const kIgnoreBOM = Symbol('kIgnoreBOM');

const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte');
const { unfinishedBytes, mergePrefix } = require('internal/encoding/util');

const {
getConstructorOf,
Expand Down Expand Up @@ -417,11 +419,24 @@ if (hasIntl) {

const kBOMSeen = Symbol('BOM seen');

let StringDecoder;
function lazyStringDecoder() {
if (StringDecoder === undefined)
({ StringDecoder } = require('string_decoder'));
return StringDecoder;
function fixupDecodedString(res, ignoreBom, fatal, encoding) {
if (res.length === 0) return '';
if (!ignoreBom && res[0] === '\ufeff') res = StringPrototypeSlice(res, 1);
if (!fatal) return res.toWellFormed();
if (!res.isWellFormed()) throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
return res;
}

/**
 * Decode UTF-16LE input (non-streaming) into a string, applying BOM and
 * error-mode handling via fixupDecodedString().
 */
function decodeUTF16le(input, ignoreBom, fatal) {
  const bytes = parseInput(input);
  return fixupDecodedString(bytes.ucs2Slice(), ignoreBom, fatal, 'utf-16le');
}

/**
 * Decode UTF-16BE input (non-streaming) into a string. The bytes are copied
 * into a scratch buffer and byte-swapped pairwise (swap16) so the
 * little-endian decode path (ucs2Slice) can be reused, then BOM and
 * error-mode handling are applied via fixupDecodedString().
 */
function decodeUTF16be(input, ignoreBom, fatal) {
  const src = parseInput(input);
  const scratch = new FastBuffer(src.length);
  scratch.set(src); // Copy so the caller's bytes are left untouched.
  scratch.swap16(); // BE -> LE; callers guarantee an even byte length.
  return fixupDecodedString(scratch.ucs2Slice(), ignoreBom, fatal, 'utf-16be');
}

class TextDecoder {
Expand All @@ -444,32 +459,29 @@ class TextDecoder {
this[kEncoding] = enc;
this[kIgnoreBOM] = Boolean(options?.ignoreBOM);
this[kFatal] = Boolean(options?.fatal);
this[kUTF8FastPath] = false;
this[kUnicode] = undefined;
this[kHandle] = undefined;
this[kSingleByte] = undefined; // Does not care about streaming or BOM
this[kChunk] = null; // A copy of previous streaming tail or null

if (enc === 'utf-8') {
this[kUTF8FastPath] = true;
this[kUnicode] = decodeUTF8;
this[kBOMSeen] = false;
} else if (enc === 'utf-16le') {
this[kUnicode] = decodeUTF16le;
this[kBOMSeen] = false;
} else if (enc === 'utf-16be') {
this[kUnicode] = decodeUTF16be;
this[kBOMSeen] = false;
} else if (isSinglebyteEncoding(enc)) {
this[kSingleByte] = createSinglebyteDecoder(enc, this[kFatal]);
} else {
this.#prepareConverter(); // Need to throw early if we don't support the encoding
}
}

#prepareConverter() {
if (this[kHandle] !== undefined) return;
if (hasIntl) {
} if (hasIntl) {
let icuEncoding = this[kEncoding];
if (icuEncoding === 'gbk') icuEncoding = 'gb18030'; // 10.1.1. GBK's decoder is gb18030's decoder
const handle = icuGetConverter(icuEncoding, this[kFlags]);
if (handle === undefined)
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
this[kHandle] = handle;
} else if (this[kEncoding] === 'utf-8' || this[kEncoding] === 'utf-16le') {
if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option');
this[kHandle] = new (lazyStringDecoder())(this[kEncoding]);
this[kBOMSeen] = false;
} else {
throw new ERR_ENCODING_NOT_SUPPORTED(this[kEncoding]);
}
Expand All @@ -482,34 +494,63 @@ class TextDecoder {
if (this[kSingleByte]) return this[kSingleByte](parseInput(input));

const stream = options?.stream;
if (this[kUTF8FastPath]) {
if (!stream) return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]);
this[kUTF8FastPath] = false;
}

this.#prepareConverter();

if (hasIntl) {
const flags = stream ? 0 : CONVERTER_FLAGS_FLUSH;
return icuDecode(this[kHandle], input, flags, this[kEncoding]);
}
if (this[kUnicode]) {
const chunk = this[kChunk];
const ignoreBom = this[kIgnoreBOM] || this[kBOMSeen];
if (!stream) {
this[kBOMSeen] = false;
if (!chunk) return this[kUnicode](input, ignoreBom, this[kFatal]);
}

input = parseInput(input);
let u = parseInput(input);
if (u.length === 0 && stream) return ''; // no state change
let prefix;
if (chunk) {
const merged = mergePrefix(u, this[kChunk], this[kEncoding]);
if (u.length < 3) {
u = merged; // Might be unfinished, but fully consumed old u
} else {
prefix = merged; // Stops at complete chunk
const add = prefix.length - this[kChunk].length;
if (add > 0) u = u.subarray(add);
}

this[kChunk] = null;
}

let result = stream ? this[kHandle].write(input) : this[kHandle].end(input);
if (stream) {
const trail = unfinishedBytes(u, u.length, this[kEncoding]);
if (trail > 0) {
this[kChunk] = new FastBuffer(u.subarray(-trail)); // copy
if (!prefix && trail === u.length) return ''; // No further state change
u = u.subarray(0, -trail);
}
}

if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) {
// If the very first result in the stream is a BOM, and we are not
// explicitly told to ignore it, then we discard it.
if (result[0] === '\ufeff') {
result = StringPrototypeSlice(result, 1);
try {
const res = (prefix ? this[kUnicode](prefix, ignoreBom, this[kFatal]) : '') +
this[kUnicode](u, ignoreBom || prefix, this[kFatal]);

// "BOM seen" is set on the current decode call only if it did not error,
// in "serialize I/O queue" after decoding
// We don't get here if we had no complete data to process,
// and we don't want BOM processing after that if streaming
if (stream) this[kBOMSeen] = true;

return res;
} catch (e) {
this[kChunk] = null; // Reset unfinished chunk on errors
// The correct way per spec seems to be not destroying the decoder state (aka BOM here) in stream mode
throw e;
}
this[kBOMSeen] = true;
}

if (!stream) this[kBOMSeen] = false;
if (hasIntl) {
const flags = stream ? 0 : CONVERTER_FLAGS_FLUSH;
return icuDecode(this[kHandle], input, flags, this[kEncoding]);
}

return result;
// Unreachable
}
}

Expand Down
87 changes: 87 additions & 0 deletions lib/internal/encoding/util.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
// From https://npmjs.com/package/@exodus/bytes
// Copyright Exodus Movement. Licensed under MIT License.

'use strict';

const {
Uint8Array,
} = primordials;

/**
 * Count the bytes at the end of `data` (looking backwards from offset `len`)
 * that form the start of a code point which could still be completed by more
 * data — i.e. the undecodable tail of a streamed chunk.
 * @param {Uint8Array} data Bytes in the given encoding.
 * @param {number} len Exclusive end offset to look behind from.
 * @param {string} enc One of 'utf-8', 'utf-16le' or 'utf-16be'.
 * @returns {number} Count (0-3) of trailing bytes forming an incomplete but
 *   still potentially valid sequence; 0 when the tail is complete or invalid.
 */
function unfinishedBytes(data, len, enc) {
  if (enc === 'utf-8') {
    // Walk back over at most two continuation bytes (0b10xxxxxx).
    let trailing = 0;
    while (trailing < 2 && trailing < len &&
           (data[len - trailing - 1] & 0xc0) === 0x80) {
      trailing++;
    }
    if (trailing === len) return 0; // No room left for a lead byte.

    const lead = data[len - trailing - 1];
    if (lead < 0xc2 || lead > 0xf4) return 0; // Not a valid lead byte.
    if (trailing === 0) return 1; // A bare lead is always unfinished.

    // A 2-byte lead with one continuation, or a 3-byte lead with two
    // continuations, is already a complete sequence.
    if (lead < 0xe0) return 0;
    if (lead < 0xf0 && trailing >= 2) return 0;

    // The first continuation byte has a lead-dependent valid range, which
    // rejects overlongs, encoded surrogates and values above U+10FFFF.
    const lo = lead === 0xf0 ? 0x90 : (lead === 0xe0 ? 0xa0 : 0x80);
    const hi = lead === 0xf4 ? 0x8f : (lead === 0xed ? 0x9f : 0xbf);
    const first = data[len - trailing];
    return first >= lo && first <= hi ? trailing + 1 : 0;
  }

  if (enc === 'utf-16le' || enc === 'utf-16be') {
    const odd = len % 2; // A dangling single byte is always unfinished.
    if (len < 2) return odd;

    // Read the last complete 16-bit code unit in the requested byte order.
    const end = len - odd;
    const unit = enc === 'utf-16le' ?
      (data[end - 1] << 8) | data[end - 2] :
      (data[end - 2] << 8) | data[end - 1];

    // A trailing high (lead) surrogate still needs its low surrogate.
    return unit >= 0xd8_00 && unit < 0xdc_00 ? odd + 2 : odd;
  }
}

/**
 * Combine the saved streaming tail `chunk` with the start of `data` into a
 * prefix that can be decoded on its own. When data.length >= 3 the returned
 * prefix borrows just enough leading bytes of `data` that it contains no
 * unfinished code point; when data.length < 3 the whole of `data` is
 * consumed and the result may itself still be unfinished.
 * @param {Uint8Array} data Fresh bytes in the given encoding.
 * @param {Uint8Array} chunk Saved tail to prepend before `data`.
 * @param {string} enc One of 'utf-8', 'utf-16le' or 'utf-16be'.
 * @returns {Uint8Array} The combined prefix (may be `chunk` itself when no
 *   bytes of `data` were needed).
 */
function mergePrefix(data, chunk, enc) {
  if (data.length === 0) return chunk;

  if (data.length < 3) {
    // Too little new data to guarantee the sequence completes; hand back
    // the full concatenation instead of probing offsets.
    const joined = new Uint8Array(chunk.length + data.length);
    joined.set(chunk, 0);
    joined.set(data, chunk.length);
    return joined;
  }

  // Stage the old tail plus the first 3 new bytes in one scratch buffer:
  // the tail is 1-3 bytes short, so 3 borrowed bytes always suffice.
  const staged = new Uint8Array(chunk.length + 3);
  staged.set(chunk, 0);
  staged.set(data.subarray(0, 3), chunk.length);

  // Borrow the smallest number of bytes after which the still-unfinished
  // count fits inside what we borrowed, so the prefix ends on a boundary.
  for (let borrowed = 1; borrowed <= 3; borrowed++) {
    const pending = unfinishedBytes(staged, chunk.length + borrowed, enc); // 0-3
    if (pending <= borrowed) {
      const keep = borrowed - pending;
      return keep > 0 ? staged.subarray(0, chunk.length + keep) : chunk;
    }
  }

  return null; // Unreachable: at borrowed === 3, pending (0-3) always fits.
}

module.exports = { unfinishedBytes, mergePrefix };
1 change: 1 addition & 0 deletions test/parallel/test-bootstrap-modules.js
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ expected.beforePreExec = new Set([
'Internal Binding fs',
'NativeModule internal/encoding',
'NativeModule internal/encoding/single-byte',
'NativeModule internal/encoding/util',
'NativeModule internal/blob',
'NativeModule internal/fs/utils',
'NativeModule fs',
Expand Down
Loading