From efa5be8fcb51f3d050df134ef02bae76e383fea0 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Wed, 17 Dec 2025 15:25:40 +0400 Subject: [PATCH 1/3] lib: implement all 1-byte encodings in js --- lib/internal/encoding.js | 74 +++++---- lib/internal/encoding/single-byte.js | 155 ++++++++++++++++++ src/encoding_binding.cc | 74 --------- src/encoding_binding.h | 2 - test/parallel/test-bootstrap-modules.js | 1 + .../test-internal-encoding-binding.js | 53 ------ typings/internalBinding/encoding_binding.d.ts | 1 - 7 files changed, 196 insertions(+), 164 deletions(-) create mode 100644 lib/internal/encoding/single-byte.js delete mode 100644 test/parallel/test-internal-encoding-binding.js diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index 61f48f3395fba7..d30e68df4304a2 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -13,24 +13,27 @@ const { StringPrototypeSlice, Symbol, SymbolToStringTag, - Uint8Array, } = primordials; +const { FastBuffer } = require('internal/buffer'); + const { ERR_ENCODING_NOT_SUPPORTED, ERR_INVALID_ARG_TYPE, ERR_INVALID_THIS, ERR_NO_ICU, } = require('internal/errors').codes; +const kMethod = Symbol('method'); const kHandle = Symbol('handle'); const kFlags = Symbol('flags'); const kEncoding = Symbol('encoding'); const kDecoder = Symbol('decoder'); const kFatal = Symbol('kFatal'); const kUTF8FastPath = Symbol('kUTF8FastPath'); -const kWindows1252FastPath = Symbol('kWindows1252FastPath'); const kIgnoreBOM = Symbol('kIgnoreBOM'); +const { isSinglebyteEncoding, createSinglebyteDecoder } = require('internal/encoding/single-byte'); + const { getConstructorOf, customInspectSymbol: inspect, @@ -55,11 +58,8 @@ const { encodeIntoResults, encodeUtf8String, decodeUTF8, - decodeWindows1252, } = binding; -const { Buffer } = require('buffer'); - function validateDecoder(obj) { if (obj == null || obj[kDecoder] !== true) throw new ERR_INVALID_THIS('TextDecoder'); @@ -69,7 +69,7 @@ const CONVERTER_FLAGS_FLUSH = 0x1; const 
CONVERTER_FLAGS_FATAL = 0x2; const CONVERTER_FLAGS_IGNORE_BOM = 0x4; -const empty = new Uint8Array(0); +const empty = new FastBuffer(); const encodings = new SafeMap([ ['unicode-1-1-utf-8', 'utf-8'], @@ -387,6 +387,24 @@ ObjectDefineProperties( [SymbolToStringTag]: { __proto__: null, configurable: true, value: 'TextEncoder' }, }); +function parseInput(input) { + if (isAnyArrayBuffer(input)) { + try { + return new FastBuffer(input); + } catch { + return empty; + } + } else if (isArrayBufferView(input)) { + try { + return new FastBuffer(input.buffer, input.byteOffset, input.byteLength); + } catch { + return empty; + } + } else { + throw new ERR_INVALID_ARG_TYPE('input', ['ArrayBuffer', 'ArrayBufferView'], input); + } +} + const TextDecoder = internalBinding('config').hasIntl ? makeTextDecoderICU() : @@ -420,10 +438,12 @@ function makeTextDecoderICU() { this[kFatal] = Boolean(options?.fatal); // Only support fast path for UTF-8. this[kUTF8FastPath] = enc === 'utf-8'; - this[kWindows1252FastPath] = enc === 'windows-1252'; this[kHandle] = undefined; + this[kMethod] = undefined; - if (!this[kUTF8FastPath] && !this[kWindows1252FastPath]) { + if (isSinglebyteEncoding(this.encoding)) { + this[kMethod] = createSinglebyteDecoder(this.encoding, this[kFatal]); + } else if (!this[kUTF8FastPath]) { this.#prepareConverter(); } } @@ -438,22 +458,18 @@ function makeTextDecoderICU() { decode(input = empty, options = kEmptyObject) { validateDecoder(this); + validateObject(options, 'options', kValidateObjectAllowObjectsAndNull); + + if (this[kMethod]) return this[kMethod](parseInput(input)); this[kUTF8FastPath] &&= !(options?.stream); - this[kWindows1252FastPath] &&= !(options?.stream); if (this[kUTF8FastPath]) { return decodeUTF8(input, this[kIgnoreBOM], this[kFatal]); } - if (this[kWindows1252FastPath]) { - return decodeWindows1252(input, this[kIgnoreBOM], this[kFatal]); - } - this.#prepareConverter(); - validateObject(options, 'options', kValidateObjectAllowObjectsAndNull); - let 
flags = 0; if (options !== null) flags |= options.stream ? 0 : CONVERTER_FLAGS_FLUSH; @@ -476,7 +492,7 @@ function makeTextDecoderJS() { const kBOMSeen = Symbol('BOM seen'); function hasConverter(encoding) { - return encoding === 'utf-8' || encoding === 'utf-16le'; + return encoding === 'utf-8' || encoding === 'utf-16le' || isSinglebyteEncoding(encoding); } class TextDecoder { @@ -502,30 +518,20 @@ function makeTextDecoderJS() { this[kFlags] = flags; this[kEncoding] = enc; this[kBOMSeen] = false; + this[kMethod] = undefined; + + if (isSinglebyteEncoding(this.encoding)) { + this[kMethod] = createSinglebyteDecoder(this.encoding, this[kFatal]); + } } decode(input = empty, options = kEmptyObject) { validateDecoder(this); - if (isAnyArrayBuffer(input)) { - try { - input = Buffer.from(input); - } catch { - input = empty; - } - } else if (isArrayBufferView(input)) { - try { - input = Buffer.from(input.buffer, input.byteOffset, - input.byteLength); - } catch { - input = empty; - } - } else { - throw new ERR_INVALID_ARG_TYPE('input', - ['ArrayBuffer', 'ArrayBufferView'], - input); - } + input = parseInput(input); validateObject(options, 'options', kValidateObjectAllowObjectsAndNull); + if (this[kMethod]) return this[kMethod](input); + if (this[kFlags] & CONVERTER_FLAGS_FLUSH) { this[kBOMSeen] = false; } diff --git a/lib/internal/encoding/single-byte.js b/lib/internal/encoding/single-byte.js new file mode 100644 index 00000000000000..df8042e89ad969 --- /dev/null +++ b/lib/internal/encoding/single-byte.js @@ -0,0 +1,155 @@ +// Simplified version extracted from https://npmjs.com/package/@exodus/bytes codepath for 1-byte encodings +// Copyright Exodus Movement. Licensed under MIT License. 
+ +'use strict'; + +const { + Array, + ArrayPrototypeFill, + ObjectKeys, + ObjectPrototypeHasOwnProperty, + SafeArrayIterator, + SafeMap, + SafeSet, + StringPrototypeIncludes, + TypedArrayFrom, + TypedArrayOf, + TypedArrayPrototypeIncludes, + TypedArrayPrototypeSet, + Uint16Array, +} = primordials; + +const { isAscii } = require('buffer'); + +const { FastBuffer } = require('internal/buffer'); + +const { + ERR_ENCODING_NOT_SUPPORTED, + ERR_ENCODING_INVALID_ENCODED_DATA, +} = require('internal/errors').codes; + +const isBigEndian = new FastBuffer(TypedArrayOf(Uint16Array, 258).buffer)[1] === 2; + +const it = (x) => new SafeArrayIterator(x); + +/* fallback/single-byte.encodings.js */ + +const r = 0xfffd; +const e = (x) => it(ArrayPrototypeFill(new Array(x), 1)); +const h = (x) => it(ArrayPrototypeFill(new Array(x), r)); + +/* eslint-disable @stylistic/js/max-len */ + +// Index tables from https://encoding.spec.whatwg.org/#legacy-single-byte-encodings +// Each table in the spec lists only mapping from byte 0x80 onwards, as below that they are all ASCII and mapped as identity +// Here, 0xfffd (replacement charcode) designates a hole (unmapped offset), as not all encodings map all offsets +// All other numbers are deltas from the last seen mapped value, starting with 0x7f (127, highest ASCII) +// Thus, [0x80, 0x81, , 0x83] is stored as [1, 1, r, 2] +// Truncation (length < 128) means that all remaining ones are mapped as identity (offset i => codepoint i), not unmapped +const encodings = { + '__proto__': null, + 'ibm866': [913, ...e(47), 8530, 1, 1, -145, 34, 61, 1, -12, -1, 14, -18, 6, 6, -1, -1, -75, 4, 32, -8, -16, -28, 60, 34, 1, -5, -6, 21, -3, -6, -16, 28, -5, 1, -4, 1, -12, -1, -6, 1, 24, -1, -82, -12, 124, -4, 8, 4, -16, -8512, ...e(15), -78, 80, -77, 80, -77, 80, -73, 80, -942, 8553, -8546, 8547, -260, -8306, 9468, -9472], + 'iso-8859-10': [...e(33), 100, 14, 16, 8, -2, 14, -143, 148, -43, 80, 6, 23, -208, 189, -32, -154, 85, 14, 16, 8, -2, 14, -128, 133, -43,
80, 6, 23, 7831, -7850, -32, -75, -63, ...e(5), 104, -34, -67, 79, -77, 75, -73, 1, 1, 1, 117, 7, -121, 1, 1, 1, 146, -144, 154, -152, ...e(5), 34, -32, ...e(5), 73, -34, -36, 48, -46, 44, -42, 1, 1, 1, 86, 7, -90, 1, 1, 1, 115, -113, 123, -121, 1, 1, 1, 1, 58], + 'iso-8859-13': [...e(33), 8061, -8059, 1, 1, 8058, -8056, 1, 49, -47, 173, -171, 1, 1, 1, 24, -22, 1, 1, 1, 8041, -8039, 1, 1, 65, -63, 158, -156, 1, 1, 1, 40, 30, 42, -46, 6, -66, 1, 83, -6, -6, -67, 176, -99, 12, 20, -12, 17, 37, -29, 2, -114, 121, -119, 1, 1, 155, -49, 25, 16, -142, 159, 2, -158, 38, 42, -46, 6, -35, 1, 52, -6, -6, -36, 145, -99, 12, 20, -12, 17, 37, -29, 2, -83, 90, -88, 1, 1, 124, -49, 25, 16, -111, 128, 2, 7835], + 'iso-8859-14': [...e(33), 7522, 1, -7520, 103, 1, 7423, -7523, 7641, -7639, 7641, -119, 231, -7749, 1, 202, 7334, 1, -7423, 1, 7455, 1, -7563, 7584, 43, -42, 44, -35, 147, -111, 1, -36, -7585, ...e(15), 165, -163, ...e(5), 7572, -7570, ...e(5), 153, -151, ...e(16), 134, -132, ...e(5), 7541, -7539, ...e(5), 122], + 'iso-8859-15': [...e(33), 1, 1, 1, 8201, -8199, 187, -185, 186, -184, ...e(10), 202, -200, 1, 1, 199, -197, 1, 1, 151, 1, 37], + 'iso-8859-16': [...e(33), 100, 1, 60, 8043, -142, -7870, -185, 186, -184, 367, -365, 206, -204, 205, 1, -203, 1, 91, 54, 59, 7840, -8039, 1, 199, -113, 268, -350, 151, 1, 37, 4, -188, 1, 1, 64, -62, 66, -64, ...e(9), 65, 51, -113, 1, 1, 124, -122, 132, 22, -151, 1, 1, 1, 60, 258, -315, 1, 1, 1, 33, -31, 35, -33, ...e(9), 34, 51, -82, 1, 1, 93, -91, 101, 22, -120, 1, 1, 1, 29, 258], + 'iso-8859-2': [...e(33), 100, 468, -407, -157, 153, 29, -179, 1, 184, -2, 6, 21, -204, 208, -2, -203, 85, 470, -409, -142, 138, 29, 364, -527, 169, -2, 6, 21, 355, -351, -2, -40, -147, 1, 64, -62, 117, -51, -63, 69, -67, 79, -77, 79, -77, 1, 64, 2, 51, 4, -116, 1, 124, -122, 1, 129, 22, -148, 150, -148, 1, 133, -131, 118, -116, 1, 33, -31, 86, -51, -32, 38, -36, 48, -46, 48, -46, 1, 33, 2, 51, 4, -85, 1, 93, -91, 1, 98, 22, -117, 119, -117, 1, 102, 374], + 
'iso-8859-3': [...e(33), 134, 434, -565, 1, r, 128, -125, 1, 136, 46, -64, 22, -135, r, 206, -203, 119, -117, 1, 1, 1, 112, -110, 1, 121, 46, -64, 22, -120, r, 191, -188, 1, 1, r, 2, 70, -2, -65, ...e(8), r, 2, 1, 1, 1, 76, -74, 1, 69, -67, 1, 1, 1, 144, -16, -125, 1, 1, 1, r, 2, 39, -2, -34, ...e(8), r, 2, 1, 1, 1, 45, -43, 1, 38, -36, 1, 1, 1, 113, -16, 380], + 'iso-8859-4': [...e(33), 100, 52, 30, -178, 132, 19, -148, 1, 184, -78, 16, 68, -185, 208, -206, 1, 85, 470, -388, -163, 117, 19, 395, -527, 169, -78, 16, 68, -29, 52, -51, -75, -63, ...e(5), 104, -34, -67, 79, -77, 75, -73, 1, 92, -26, 53, 7, -22, -98, 1, 1, 1, 1, 154, -152, 1, 1, 140, 2, -139, 34, -32, ...e(5), 73, -34, -36, 48, -46, 44, -42, 1, 61, -26, 53, 7, -22, -67, 1, 1, 1, 1, 123, -121, 1, 1, 109, 2, 366], + 'iso-8859-5': [...e(33), 865, ...e(11), -863, 865, ...e(65), 7367, -7365, ...e(11), -949, 951, 1], + 'iso-8859-6': [...e(33), r, r, r, 4, ...h(7), 1384, -1375, ...h(13), 1390, r, r, r, 4, r, 2, ...e(25), r, r, r, r, r, 6, ...e(18), ...h(13)], + 'iso-8859-7': [...e(33), 8056, 1, -8054, 8201, 3, -8201, 1, 1, 1, 721, -719, 1, 1, r, 8040, -8037, 1, 1, 1, 721, 1, 1, -719, 721, 1, 1, -719, 721, -719, 721, ...e(19), r, 2, ...e(43), r], + 'iso-8859-8': [...e(33), r, 2, ...e(7), 46, -44, ...e(14), 62, -60, 1, 1, 1, ...h(32), 8025, -6727, ...e(26), r, r, 6692, 1, r], + 'koi8-r': [9345, 2, 10, 4, 4, 4, 4, 8, 8, 8, 8, 68, 4, 4, 4, 4, 1, 1, 1, -627, 640, -903, 1, 46, 28, 1, -8645, 8833, -8817, 2, 5, 64, 9305, 1, 1, -8449, 8450, ...e(14), -8544, 8545, ...e(10), -9411, 933, -30, 1, 21, -18, 1, 15, -17, 18, -13, ...e(7), 16, -15, 1, 1, 1, -13, -4, 26, -1, -20, 17, 5, -4, -2, 3, -28, -30, 1, 21, -18, 1, 15, -17, 18, -13, ...e(7), 16, -15, 1, 1, 1, -13, -4, 26, -1, -20, 17, 5, -4, -2, 3], + 'koi8-u': [9345, 2, 10, 4, 4, 4, 4, 8, 8, 8, 8, 68, 4, 4, 4, 4, 1, 1, 1, -627, 640, -903, 1, 46, 28, 1, -8645, 8833, -8817, 2, 5, 64, 9305, 1, 1, -8449, 3, 8448, -8446, 1, 8448, 1, 1, 1, 1, -8394, -51, 8448, 1, 1, 1, -8544, 
3, 8543, -8541, 1, 8543, 1, 1, 1, 1, -8410, -130, -869, 933, -30, 1, 21, -18, 1, 15, -17, 18, -13, ...e(7), 16, -15, 1, 1, 1, -13, -4, 26, -1, -20, 17, 5, -4, -2, 3, -28, -30, 1, 21, -18, 1, 15, -17, 18, -13, ...e(7), 16, -15, 1, 1, 1, -13, -4, 26, -1, -20, 17, 5, -4, -2, 3], + 'macintosh': [69, 1, 2, 2, 8, 5, 6, 5, -1, 2, 2, -1, 2, 2, 2, -1, 2, 1, 2, -1, 2, 1, 2, 2, -1, 2, 2, -1, 5, -1, 2, 1, 7972, -8048, -14, 1, 4, 8059, -8044, 41, -49, -5, 8313, -8302, -12, 8632, -8602, 18, 8518, -8557, 8627, 1, -8640, 16, 8525, 15, -2, -7759, 7787, -8577, 16, 751, -707, 18, -57, -30, 11, 8558, -8328, 8374, -66, -8539, 16, 8043, -8070, 32, 3, 18, 125, 1, 7872, 1, 8, 1, -5, 1, -7970, 9427, -9419, 121, 7884, 104, -115, 1, 56007, 1, -56033, -8042, 8035, 4, 18, -8046, 8, -9, 10, -3, 5, 1, 1, -3, 7, 1, 63531, -63533, 8, 1, -2, 88, 405, 22, -557, 553, 1, 1, -546, 549, -2, -20], + 'windows-1250': [8237, -8235, 8089, -8087, 8091, 8, -6, 1, -8089, 8104, -7888, 7897, -7903, 10, 25, -4, -233, 8072, 1, 3, 1, 5, -15, 1, -8060, 8330, -8129, 7897, -7903, 10, 25, -4, -218, 551, 17, -407, -157, 96, -94, 1, 1, 1, 181, -179, 1, 1, 1, 205, -203, 1, 554, -409, -142, 1, 1, 1, 1, 77, 90, -164, 130, 416, -415, 62, -40, -147, 1, 64, -62, 117, -51, -63, 69, -67, 79, -77, 79, -77, 1, 64, 2, 51, 4, -116, 1, 124, -122, 1, 129, 22, -148, 150, -148, 1, 133, -131, 118, -116, 1, 33, -31, 86, -51, -32, 38, -36, 48, -46, 48, -46, 1, 33, 2, 51, 4, -85, 1, 93, -91, 1, 98, 22, -117, 119, -117, 1, 102, 374], + 'windows-1251': [899, 1, 7191, -7111, 7115, 8, -6, 1, 139, -124, -7207, 7216, -7215, 2, -1, 4, 67, 7110, 1, 3, 1, 5, -15, 1, -8060, 8330, -7369, 7137, -7136, 2, -1, 4, -959, 878, 80, -86, -868, 1004, -1002, 1, 858, -856, 859, -857, 1, 1, 1, 857, -855, 1, 853, 80, 59, -988, 1, 1, 922, 7365, -7362, -921, 925, -83, 80, 2, -71, ...e(63)], + 'windows-1252': [8237, -8235, 8089, -7816, 7820, 8, -6, 1, -7515, 7530, -7888, 7897, -7911, -197, 240, -238, 1, 8072, 1, 3, 1, 5, -15, 1, -7480, 7750, -8129, 7897, -7911, -182, 
225, -6], + 'windows-1253': [8237, -8235, 8089, -7816, 7820, 8, -6, 1, -8089, 8104, -8102, 8111, -8109, 1, 1, 1, 1, 8072, 1, 3, 1, 5, -15, 1, -8060, 8330, -8328, 8096, -8094, 1, 1, 1, 1, 741, 1, -739, 1, 1, 1, 1, 1, 1, r, 2, 1, 1, 1, 8039, -8037, 1, 1, 1, 721, -719, 1, 1, 721, 1, 1, -719, 721, -719, 721, ...e(19), r, 2, ...e(43), r], + 'windows-1254': [8237, -8235, 8089, -7816, 7820, 8, -6, 1, -7515, 7530, -7888, 7897, -7911, -197, 1, 1, 1, 8072, 1, 3, 1, 5, -15, 1, -7480, 7750, -8129, 7897, -7911, -182, 1, 218, -216, ...e(47), 79, -77, ...e(11), 84, 46, -127, ...e(16), 48, -46, ...e(11), 53, 46], + 'windows-1255': [8237, -8235, 8089, -7816, 7820, 8, -6, 1, -7515, 7530, -8102, 8111, -8109, 1, 1, 1, 1, 8072, 1, 3, 1, 5, -15, 1, -7480, 7750, -8328, 8096, -8094, ...e(7), 8199, -8197, 1, 1, 1, 1, 46, -44, ...e(14), 62, -60, 1, 1, 1, 1, 1265, ...e(19), 45, 1, 1, 1, 1, ...h(7), -36, ...e(26), r, r, 6692, 1, r], + 'windows-1256': [8237, -6702, 6556, -7816, 7820, 8, -6, 1, -7515, 7530, -6583, 6592, -7911, 1332, 18, -16, 39, 6505, 1, 3, 1, 5, -15, 1, -6507, 6777, -6801, 6569, -7911, 7865, 1, -6483, -1562, 1388, -1386, ...e(7), 1557, -1555, ...e(14), 1378, -1376, 1, 1, 1, 1377, 162, -160, ...e(21), -1375, 1376, 1, 1, 1, 6, 1, 1, 1, -1379, 1380, -1378, 1379, 1, 1, 1, -1377, 1, 1, 1, 1, 1374, 1, -1372, 1, 1372, 1, 1, 1, -1370, 1371, 1, -1369, 1370, -1368, 1369, -1367, 1, 7954, 1, -6461], + 'windows-1257': [8237, -8235, 8089, -8087, 8091, 8, -6, 1, -8089, 8104, -8102, 8111, -8109, 28, 543, -527, -40, 8072, 1, 3, 1, 5, -15, 1, -8060, 8330, -8328, 8096, -8094, 19, 556, -572, 1, r, 2, 1, 1, r, 2, 1, 49, -47, 173, -171, 1, 1, 1, 24, -22, ...e(5), 1, 1, 65, -63, 158, -156, 1, 1, 1, 40, 30, 42, -46, 6, -66, 1, 83, -6, -6, -67, 176, -99, 12, 20, -12, 17, 37, -29, 2, -114, 121, -119, 1, 1, 155, -49, 25, 16, -142, 159, 2, -158, 38, 42, -46, 6, -35, 1, 52, -6, -6, -36, 145, -99, 12, 20, -12, 17, 37, -29, 2, -83, 90, -88, 1, 1, 124, -49, 25, 16, -111, 128, 2, 347], + 'windows-1258': 
[8237, -8235, 8089, -7816, 7820, 8, -6, 1, -7515, 7530, -8102, 8111, -7911, -197, 1, 1, 1, 8072, 1, 3, 1, 5, -15, 1, -7480, 7750, -8328, 8096, -7911, -182, 1, 218, -216, ...e(34), 64, -62, ...e(7), 565, -563, 1, 1, 65, -63, 568, -566, 1, 204, -202, 1, 1, 1, 1, 1, 1, 211, 340, -548, 1, 1, 1, 33, -31, ...e(7), 534, -532, 1, 1, 34, -32, 562, -560, 1, 173, -171, 1, 1, 1, 1, 1, 1, 180, 7931], + 'windows-874': [8237, -8235, 1, 1, 1, 8098, -8096, ...e(10), 8072, 1, 3, 1, 5, -15, 1, -8060, ...e(8), 3425, ...e(57), r, r, r, r, 5, ...e(28), r, r, r, r], + 'x-mac-cyrillic': [913, ...e(31), 7153, -8048, 992, -1005, 4, 8059, -8044, 848, -856, -5, 8313, -7456, 80, 7694, -7773, 80, 7627, -8557, 8627, 1, -7695, -929, 988, -137, -4, 80, -77, 80, -78, 80, -79, 80, -2, -83, -857, 8558, -8328, 8374, -66, -8539, 16, 8043, -8070, 875, 80, -79, 80, -7, 7102, 1, 8, 1, -5, 1, -7970, 7975, -7184, 80, -79, 80, 7351, -7445, 80, -2, -31, ...e(30), 7262], +}; + +/* eslint-enable @stylistic/js/max-len */ + +/* fallback/single-byte.js + single-byte.node.js, simplified */ + +const l256 = { __proto__: null, length: 256 }; + +function getEncoding(encoding) { + if (encoding === 'x-user-defined') { + // https://encoding.spec.whatwg.org/#x-user-defined-decoder, 14.5.1. x-user-defined decoder + return TypedArrayFrom(Uint16Array, l256, (_, i) => (i >= 0x80 ? 0xf700 + i : i)); + } + + if (!ObjectPrototypeHasOwnProperty(encodings, encoding)) { + throw new ERR_ENCODING_NOT_SUPPORTED(encoding); + } + + const map = TypedArrayFrom(Uint16Array, l256, (_, i) => i); // Unicode subset + let prev = 127; + map.set(TypedArrayFrom(Uint16Array, it(encodings[encoding]), (x) => (x === r ? 
x : (prev += x))), 128);
+  return map;
+}
+
+const supported = new SafeSet(it(ObjectKeys(encodings))).add('iso-8859-8-i').add('x-user-defined');
+const isSinglebyteEncoding = (enc) => supported.has(enc);
+
+const decodersLoose = new SafeMap();
+const decodersFatal = new SafeMap();
+
+function createSinglebyteDecoder(encoding, fatal) {
+  const id = encoding === 'iso-8859-8-i' ? 'iso-8859-8' : encoding; // This label reuses the iso-8859-8 table
+  const decoders = fatal ? decodersFatal : decodersLoose;
+  const cached = decoders.get(encoding); // Keyed by label: the decoder closure reports `encoding` in its error
+  if (cached) return cached;
+
+  const map = getEncoding(id);
+  const incomplete = TypedArrayPrototypeIncludes(map, r);
+
+  // Expects type-checked Buffer input, returns the decoded string; fatal decoders throw on unmapped bytes
+  const decoder = (buf) => {
+    if (buf.byteLength === 0) return '';
+    if (isAscii(buf)) return buf.latin1Slice(); // .latin1Slice is faster than .asciiSlice
+    const o = new Uint16Array(buf.length);
+    TypedArrayPrototypeSet(o, buf); // Copy to modify in-place, also those are 16-bit now
+
+    let i = 0;
+    for (const end7 = o.length - 7; i < end7; i += 8) {
+      o[i] = map[o[i]];
+      o[i + 1] = map[o[i + 1]];
+      o[i + 2] = map[o[i + 2]];
+      o[i + 3] = map[o[i + 3]];
+      o[i + 4] = map[o[i + 4]];
+      o[i + 5] = map[o[i + 5]];
+      o[i + 6] = map[o[i + 6]];
+      o[i + 7] = map[o[i + 7]];
+    }
+
+    for (const end = o.length; i < end; i++) o[i] = map[o[i]];
+
+    const b = new FastBuffer(o.buffer, o.byteOffset, o.byteLength);
+    if (isBigEndian) b.swap16();
+    const string = b.ucs2Slice();
+    if (fatal && incomplete && StringPrototypeIncludes(string, '\uFFFD')) {
+      throw new ERR_ENCODING_INVALID_ENCODED_DATA(encoding, undefined);
+    }
+    return string;
+  };
+
+  decoders.set(encoding, decoder);
+  return decoder;
+}
+
+module.exports = {
+  isSinglebyteEncoding,
+  createSinglebyteDecoder,
+  getEncoding, // for tests
+};
diff --git a/src/encoding_binding.cc b/src/encoding_binding.cc
index f68dd9522a0f69..1bf528de5f029f 100644
--- a/src/encoding_binding.cc
+++ b/src/encoding_binding.cc
@@ -1,7 +1,6 @@
 #include "encoding_binding.h"
 #include "ada.h"
 #include
"env-inl.h" -#include "node_buffer.h" #include "node_errors.h" #include "node_external_reference.h" #include "simdutf.h" @@ -414,8 +413,6 @@ void BindingData::CreatePerIsolateProperties(IsolateData* isolate_data, SetMethodNoSideEffect(isolate, target, "decodeUTF8", DecodeUTF8); SetMethodNoSideEffect(isolate, target, "toASCII", ToASCII); SetMethodNoSideEffect(isolate, target, "toUnicode", ToUnicode); - SetMethodNoSideEffect( - isolate, target, "decodeWindows1252", DecodeWindows1252); } void BindingData::CreatePerContextProperties(Local target, @@ -433,77 +430,6 @@ void BindingData::RegisterTimerExternalReferences( registry->Register(DecodeUTF8); registry->Register(ToASCII); registry->Register(ToUnicode); - registry->Register(DecodeWindows1252); -} - -void BindingData::DecodeWindows1252(const FunctionCallbackInfo& args) { - Environment* env = Environment::GetCurrent(args); - - CHECK_GE(args.Length(), 1); - if (!(args[0]->IsArrayBuffer() || args[0]->IsSharedArrayBuffer() || - args[0]->IsArrayBufferView())) { - return node::THROW_ERR_INVALID_ARG_TYPE( - env->isolate(), - "The \"input\" argument must be an instance of ArrayBuffer, " - "SharedArrayBuffer, or ArrayBufferView."); - } - - bool ignore_bom = args[1]->IsTrue(); - - ArrayBufferViewContents buffer(args[0]); - const uint8_t* data = buffer.data(); - size_t length = buffer.length(); - - if (ignore_bom && length > 0 && data[0] == 0xFF) { - data++; - length--; - } - - if (length == 0) { - return args.GetReturnValue().SetEmptyString(); - } - - // Windows-1252 specific mapping for bytes 128-159 - // These differ from Latin-1/ISO-8859-1 - static const uint16_t windows1252_mapping[32] = { - 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F - 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 - 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F - }; - - std::string result; - 
result.reserve(length * 3); // Reserve space for UTF-8 output - - for (size_t i = 0; i < length; i++) { - uint8_t byte = data[i]; - uint32_t codepoint; - - // Check if byte is in the special Windows-1252 range (128-159) - if (byte >= 0x80 && byte <= 0x9F) { - codepoint = windows1252_mapping[byte - 0x80]; - } else { - // For all other bytes, Windows-1252 is identical to Latin-1 - codepoint = byte; - } - - // Convert codepoint to UTF-8 - if (codepoint < 0x80) { - result.push_back(static_cast(codepoint)); - } else if (codepoint < 0x800) { - result.push_back(static_cast(0xC0 | (codepoint >> 6))); - result.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } else { - result.push_back(static_cast(0xE0 | (codepoint >> 12))); - result.push_back(static_cast(0x80 | ((codepoint >> 6) & 0x3F))); - result.push_back(static_cast(0x80 | (codepoint & 0x3F))); - } - } - - Local ret; - if (ToV8Value(env->context(), result, env->isolate()).ToLocal(&ret)) { - args.GetReturnValue().Set(ret); - } } } // namespace encoding_binding diff --git a/src/encoding_binding.h b/src/encoding_binding.h index 8393702cce855f..2690cb74f8a05b 100644 --- a/src/encoding_binding.h +++ b/src/encoding_binding.h @@ -31,8 +31,6 @@ class BindingData : public SnapshotableObject { static void EncodeInto(const v8::FunctionCallbackInfo& args); static void EncodeUtf8String(const v8::FunctionCallbackInfo& args); static void DecodeUTF8(const v8::FunctionCallbackInfo& args); - static void DecodeWindows1252( - const v8::FunctionCallbackInfo& args); static void ToASCII(const v8::FunctionCallbackInfo& args); static void ToUnicode(const v8::FunctionCallbackInfo& args); diff --git a/test/parallel/test-bootstrap-modules.js b/test/parallel/test-bootstrap-modules.js index b050f5bffde04a..d69a299625d9f2 100644 --- a/test/parallel/test-bootstrap-modules.js +++ b/test/parallel/test-bootstrap-modules.js @@ -88,6 +88,7 @@ expected.beforePreExec = new Set([ 'NativeModule internal/process/signal', 'Internal Binding fs', 
'NativeModule internal/encoding', + 'NativeModule internal/encoding/single-byte', 'NativeModule internal/blob', 'NativeModule internal/fs/utils', 'NativeModule fs', diff --git a/test/parallel/test-internal-encoding-binding.js b/test/parallel/test-internal-encoding-binding.js deleted file mode 100644 index 7d5397d213c205..00000000000000 --- a/test/parallel/test-internal-encoding-binding.js +++ /dev/null @@ -1,53 +0,0 @@ -// Flags: --expose-internals - -'use strict'; - -require('../common'); - -const assert = require('node:assert'); -const { internalBinding } = require('internal/test/binding'); -const binding = internalBinding('encoding_binding'); - -// Windows-1252 specific tests -{ - // Test Windows-1252 special characters in 128-159 range - // These differ from Latin-1 - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ'); -} - -{ - // Test Windows-1252 characters outside 128-159 range (same as Latin-1) - const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]); - assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó'); -} - -{ - // Empty input - const buf = Uint8Array.from([]); - assert.strictEqual(binding.decodeWindows1252(buf, false, false), ''); -} - -// Windows-1252 specific tests -{ - // Test Windows-1252 special characters in 128-159 range - // These differ from Latin-1 - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x80), false, false), '€'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x82), false, false), '‚'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x83), false, false), 'ƒ'); - assert.strictEqual(binding.decodeWindows1252(Uint8Array.of(0x9F), false, false), 'Ÿ'); -} - -{ - // Test Windows-1252 
characters outside 128-159 range (same as Latin-1) - const buf = Uint8Array.from([0xC1, 0xE9, 0xF3]); - assert.strictEqual(binding.decodeWindows1252(buf, false, false), 'Áéó'); -} - -{ - // Empty input - const buf = Uint8Array.from([]); - assert.strictEqual(binding.decodeWindows1252(buf, false, false), ''); -} diff --git a/typings/internalBinding/encoding_binding.d.ts b/typings/internalBinding/encoding_binding.d.ts index 6833c9ac0557b1..0774a21f25e21f 100644 --- a/typings/internalBinding/encoding_binding.d.ts +++ b/typings/internalBinding/encoding_binding.d.ts @@ -4,5 +4,4 @@ export interface EncodingBinding { decodeUTF8(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string; toASCII(input: string): string; toUnicode(input: string): string; - decodeWindows1252(buffer: ArrayBufferView | ArrayBuffer | SharedArrayBuffer, ignoreBOM?: boolean, hasFatal?: boolean): string; } From 3557542289b1196b504d7f4b62f7da413f9756cd Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Fri, 19 Dec 2025 03:19:45 +0400 Subject: [PATCH 2/3] test: add single-byte encoding tests --- test/fixtures/encoding/README.md | 4 + test/fixtures/encoding/encodings.json | 465 ++++++++++++++++++ .../encoding/single-byte/index-ibm866.txt | 134 +++++ .../single-byte/index-iso-8859-10.txt | 134 +++++ .../single-byte/index-iso-8859-13.txt | 134 +++++ .../single-byte/index-iso-8859-14.txt | 134 +++++ .../single-byte/index-iso-8859-15.txt | 134 +++++ .../single-byte/index-iso-8859-16.txt | 134 +++++ .../encoding/single-byte/index-iso-8859-2.txt | 134 +++++ .../encoding/single-byte/index-iso-8859-3.txt | 127 +++++ .../encoding/single-byte/index-iso-8859-4.txt | 134 +++++ .../encoding/single-byte/index-iso-8859-5.txt | 134 +++++ .../encoding/single-byte/index-iso-8859-6.txt | 89 ++++ .../encoding/single-byte/index-iso-8859-7.txt | 131 +++++ .../encoding/single-byte/index-iso-8859-8.txt | 98 ++++ .../encoding/single-byte/index-koi8-r.txt | 134 +++++ 
.../encoding/single-byte/index-koi8-u.txt | 134 +++++ .../encoding/single-byte/index-macintosh.txt | 134 +++++ .../single-byte/index-windows-1250.txt | 134 +++++ .../single-byte/index-windows-1251.txt | 134 +++++ .../single-byte/index-windows-1252.txt | 134 +++++ .../single-byte/index-windows-1253.txt | 131 +++++ .../single-byte/index-windows-1254.txt | 134 +++++ .../single-byte/index-windows-1255.txt | 124 +++++ .../single-byte/index-windows-1256.txt | 134 +++++ .../single-byte/index-windows-1257.txt | 132 +++++ .../single-byte/index-windows-1258.txt | 134 +++++ .../single-byte/index-windows-874.txt | 126 +++++ .../single-byte/index-x-mac-cyrillic.txt | 134 +++++ .../test-whatwg-encoding-singlebyte.mjs | 84 ++++ 30 files changed, 4057 insertions(+) create mode 100644 test/fixtures/encoding/README.md create mode 100644 test/fixtures/encoding/encodings.json create mode 100644 test/fixtures/encoding/single-byte/index-ibm866.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-10.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-13.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-14.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-15.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-16.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-2.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-3.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-4.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-5.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-6.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-7.txt create mode 100644 test/fixtures/encoding/single-byte/index-iso-8859-8.txt create mode 100644 test/fixtures/encoding/single-byte/index-koi8-r.txt create mode 100644 test/fixtures/encoding/single-byte/index-koi8-u.txt create mode 
100644 test/fixtures/encoding/single-byte/index-macintosh.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1250.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1251.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1252.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1253.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1254.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1255.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1256.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1257.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-1258.txt create mode 100644 test/fixtures/encoding/single-byte/index-windows-874.txt create mode 100644 test/fixtures/encoding/single-byte/index-x-mac-cyrillic.txt create mode 100644 test/parallel/test-whatwg-encoding-singlebyte.mjs diff --git a/test/fixtures/encoding/README.md b/test/fixtures/encoding/README.md new file mode 100644 index 00000000000000..be6602bbf6969c --- /dev/null +++ b/test/fixtures/encoding/README.md @@ -0,0 +1,4 @@ +1. +2. +3. +4. 
diff --git a/test/fixtures/encoding/encodings.json b/test/fixtures/encoding/encodings.json new file mode 100644 index 00000000000000..74019c09603e21 --- /dev/null +++ b/test/fixtures/encoding/encodings.json @@ -0,0 +1,465 @@ +[ + { + "encodings": [ + { + "labels": [ + "unicode-1-1-utf-8", + "unicode11utf8", + "unicode20utf8", + "utf-8", + "utf8", + "x-unicode20utf8" + ], + "name": "UTF-8" + } + ], + "heading": "The Encoding" + }, + { + "encodings": [ + { + "labels": [ + "866", + "cp866", + "csibm866", + "ibm866" + ], + "name": "IBM866" + }, + { + "labels": [ + "csisolatin2", + "iso-8859-2", + "iso-ir-101", + "iso8859-2", + "iso88592", + "iso_8859-2", + "iso_8859-2:1987", + "l2", + "latin2" + ], + "name": "ISO-8859-2" + }, + { + "labels": [ + "csisolatin3", + "iso-8859-3", + "iso-ir-109", + "iso8859-3", + "iso88593", + "iso_8859-3", + "iso_8859-3:1988", + "l3", + "latin3" + ], + "name": "ISO-8859-3" + }, + { + "labels": [ + "csisolatin4", + "iso-8859-4", + "iso-ir-110", + "iso8859-4", + "iso88594", + "iso_8859-4", + "iso_8859-4:1988", + "l4", + "latin4" + ], + "name": "ISO-8859-4" + }, + { + "labels": [ + "csisolatincyrillic", + "cyrillic", + "iso-8859-5", + "iso-ir-144", + "iso8859-5", + "iso88595", + "iso_8859-5", + "iso_8859-5:1988" + ], + "name": "ISO-8859-5" + }, + { + "labels": [ + "arabic", + "asmo-708", + "csiso88596e", + "csiso88596i", + "csisolatinarabic", + "ecma-114", + "iso-8859-6", + "iso-8859-6-e", + "iso-8859-6-i", + "iso-ir-127", + "iso8859-6", + "iso88596", + "iso_8859-6", + "iso_8859-6:1987" + ], + "name": "ISO-8859-6" + }, + { + "labels": [ + "csisolatingreek", + "ecma-118", + "elot_928", + "greek", + "greek8", + "iso-8859-7", + "iso-ir-126", + "iso8859-7", + "iso88597", + "iso_8859-7", + "iso_8859-7:1987", + "sun_eu_greek" + ], + "name": "ISO-8859-7" + }, + { + "labels": [ + "csiso88598e", + "csisolatinhebrew", + "hebrew", + "iso-8859-8", + "iso-8859-8-e", + "iso-ir-138", + "iso8859-8", + "iso88598", + "iso_8859-8", + "iso_8859-8:1988", + 
"visual" + ], + "name": "ISO-8859-8" + }, + { + "labels": [ + "csiso88598i", + "iso-8859-8-i", + "logical" + ], + "name": "ISO-8859-8-I" + }, + { + "labels": [ + "csisolatin6", + "iso-8859-10", + "iso-ir-157", + "iso8859-10", + "iso885910", + "l6", + "latin6" + ], + "name": "ISO-8859-10" + }, + { + "labels": [ + "iso-8859-13", + "iso8859-13", + "iso885913" + ], + "name": "ISO-8859-13" + }, + { + "labels": [ + "iso-8859-14", + "iso8859-14", + "iso885914" + ], + "name": "ISO-8859-14" + }, + { + "labels": [ + "csisolatin9", + "iso-8859-15", + "iso8859-15", + "iso885915", + "iso_8859-15", + "l9" + ], + "name": "ISO-8859-15" + }, + { + "labels": [ + "iso-8859-16" + ], + "name": "ISO-8859-16" + }, + { + "labels": [ + "cskoi8r", + "koi", + "koi8", + "koi8-r", + "koi8_r" + ], + "name": "KOI8-R" + }, + { + "labels": [ + "koi8-ru", + "koi8-u" + ], + "name": "KOI8-U" + }, + { + "labels": [ + "csmacintosh", + "mac", + "macintosh", + "x-mac-roman" + ], + "name": "macintosh" + }, + { + "labels": [ + "dos-874", + "iso-8859-11", + "iso8859-11", + "iso885911", + "tis-620", + "windows-874" + ], + "name": "windows-874" + }, + { + "labels": [ + "cp1250", + "windows-1250", + "x-cp1250" + ], + "name": "windows-1250" + }, + { + "labels": [ + "cp1251", + "windows-1251", + "x-cp1251" + ], + "name": "windows-1251" + }, + { + "labels": [ + "ansi_x3.4-1968", + "ascii", + "cp1252", + "cp819", + "csisolatin1", + "ibm819", + "iso-8859-1", + "iso-ir-100", + "iso8859-1", + "iso88591", + "iso_8859-1", + "iso_8859-1:1987", + "l1", + "latin1", + "us-ascii", + "windows-1252", + "x-cp1252" + ], + "name": "windows-1252" + }, + { + "labels": [ + "cp1253", + "windows-1253", + "x-cp1253" + ], + "name": "windows-1253" + }, + { + "labels": [ + "cp1254", + "csisolatin5", + "iso-8859-9", + "iso-ir-148", + "iso8859-9", + "iso88599", + "iso_8859-9", + "iso_8859-9:1989", + "l5", + "latin5", + "windows-1254", + "x-cp1254" + ], + "name": "windows-1254" + }, + { + "labels": [ + "cp1255", + "windows-1255", + 
"x-cp1255" + ], + "name": "windows-1255" + }, + { + "labels": [ + "cp1256", + "windows-1256", + "x-cp1256" + ], + "name": "windows-1256" + }, + { + "labels": [ + "cp1257", + "windows-1257", + "x-cp1257" + ], + "name": "windows-1257" + }, + { + "labels": [ + "cp1258", + "windows-1258", + "x-cp1258" + ], + "name": "windows-1258" + }, + { + "labels": [ + "x-mac-cyrillic", + "x-mac-ukrainian" + ], + "name": "x-mac-cyrillic" + } + ], + "heading": "Legacy single-byte encodings" + }, + { + "encodings": [ + { + "labels": [ + "chinese", + "csgb2312", + "csiso58gb231280", + "gb2312", + "gb_2312", + "gb_2312-80", + "gbk", + "iso-ir-58", + "x-gbk" + ], + "name": "GBK" + }, + { + "labels": [ + "gb18030" + ], + "name": "gb18030" + } + ], + "heading": "Legacy multi-byte Chinese (simplified) encodings" + }, + { + "encodings": [ + { + "labels": [ + "big5", + "big5-hkscs", + "cn-big5", + "csbig5", + "x-x-big5" + ], + "name": "Big5" + } + ], + "heading": "Legacy multi-byte Chinese (traditional) encodings" + }, + { + "encodings": [ + { + "labels": [ + "cseucpkdfmtjapanese", + "euc-jp", + "x-euc-jp" + ], + "name": "EUC-JP" + }, + { + "labels": [ + "csiso2022jp", + "iso-2022-jp" + ], + "name": "ISO-2022-JP" + }, + { + "labels": [ + "csshiftjis", + "ms932", + "ms_kanji", + "shift-jis", + "shift_jis", + "sjis", + "windows-31j", + "x-sjis" + ], + "name": "Shift_JIS" + } + ], + "heading": "Legacy multi-byte Japanese encodings" + }, + { + "encodings": [ + { + "labels": [ + "cseuckr", + "csksc56011987", + "euc-kr", + "iso-ir-149", + "korean", + "ks_c_5601-1987", + "ks_c_5601-1989", + "ksc5601", + "ksc_5601", + "windows-949" + ], + "name": "EUC-KR" + } + ], + "heading": "Legacy multi-byte Korean encodings" + }, + { + "encodings": [ + { + "labels": [ + "csiso2022kr", + "hz-gb-2312", + "iso-2022-cn", + "iso-2022-cn-ext", + "iso-2022-kr", + "replacement" + ], + "name": "replacement" + }, + { + "labels": [ + "unicodefffe", + "utf-16be" + ], + "name": "UTF-16BE" + }, + { + "labels": [ + 
"csunicode", + "iso-10646-ucs-2", + "ucs-2", + "unicode", + "unicodefeff", + "utf-16", + "utf-16le" + ], + "name": "UTF-16LE" + }, + { + "labels": [ + "x-user-defined" + ], + "name": "x-user-defined" + } + ], + "heading": "Legacy miscellaneous encodings" + } +] diff --git a/test/fixtures/encoding/single-byte/index-ibm866.txt b/test/fixtures/encoding/single-byte/index-ibm866.txt new file mode 100644 index 00000000000000..959f6071844435 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-ibm866.txt @@ -0,0 +1,134 @@ +# For details on index index-ibm866.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: db6fe14a559d1601a7667338d83704773d5708dbc641e1ad3c5e21405770f05e +# Date: 2024-09-18 + + 0 0x0410 А (CYRILLIC CAPITAL LETTER A) + 1 0x0411 Б (CYRILLIC CAPITAL LETTER BE) + 2 0x0412 В (CYRILLIC CAPITAL LETTER VE) + 3 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) + 4 0x0414 Д (CYRILLIC CAPITAL LETTER DE) + 5 0x0415 Е (CYRILLIC CAPITAL LETTER IE) + 6 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) + 7 0x0417 З (CYRILLIC CAPITAL LETTER ZE) + 8 0x0418 И (CYRILLIC CAPITAL LETTER I) + 9 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) + 10 0x041A К (CYRILLIC CAPITAL LETTER KA) + 11 0x041B Л (CYRILLIC CAPITAL LETTER EL) + 12 0x041C М (CYRILLIC CAPITAL LETTER EM) + 13 0x041D Н (CYRILLIC CAPITAL LETTER EN) + 14 0x041E О (CYRILLIC CAPITAL LETTER O) + 15 0x041F П (CYRILLIC CAPITAL LETTER PE) + 16 0x0420 Р (CYRILLIC CAPITAL LETTER ER) + 17 0x0421 С (CYRILLIC CAPITAL LETTER ES) + 18 0x0422 Т (CYRILLIC CAPITAL LETTER TE) + 19 0x0423 У (CYRILLIC CAPITAL LETTER U) + 20 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) + 21 0x0425 Х (CYRILLIC CAPITAL LETTER HA) + 22 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) + 23 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) + 24 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) + 25 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) + 26 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) + 27 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) + 28 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) 
+ 29 0x042D Э (CYRILLIC CAPITAL LETTER E) + 30 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 31 0x042F Я (CYRILLIC CAPITAL LETTER YA) + 32 0x0430 а (CYRILLIC SMALL LETTER A) + 33 0x0431 б (CYRILLIC SMALL LETTER BE) + 34 0x0432 в (CYRILLIC SMALL LETTER VE) + 35 0x0433 г (CYRILLIC SMALL LETTER GHE) + 36 0x0434 д (CYRILLIC SMALL LETTER DE) + 37 0x0435 е (CYRILLIC SMALL LETTER IE) + 38 0x0436 ж (CYRILLIC SMALL LETTER ZHE) + 39 0x0437 з (CYRILLIC SMALL LETTER ZE) + 40 0x0438 и (CYRILLIC SMALL LETTER I) + 41 0x0439 й (CYRILLIC SMALL LETTER SHORT I) + 42 0x043A к (CYRILLIC SMALL LETTER KA) + 43 0x043B л (CYRILLIC SMALL LETTER EL) + 44 0x043C м (CYRILLIC SMALL LETTER EM) + 45 0x043D н (CYRILLIC SMALL LETTER EN) + 46 0x043E о (CYRILLIC SMALL LETTER O) + 47 0x043F п (CYRILLIC SMALL LETTER PE) + 48 0x2591 ░ (LIGHT SHADE) + 49 0x2592 ▒ (MEDIUM SHADE) + 50 0x2593 ▓ (DARK SHADE) + 51 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) + 52 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) + 53 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) + 54 0x2562 ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE) + 55 0x2556 ╖ (BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE) + 56 0x2555 ╕ (BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE) + 57 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) + 58 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) + 59 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) + 60 0x255D ╝ (BOX DRAWINGS DOUBLE UP AND LEFT) + 61 0x255C ╜ (BOX DRAWINGS UP DOUBLE AND LEFT SINGLE) + 62 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) + 63 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) + 64 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) + 65 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) + 66 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) + 67 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) + 68 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) + 69 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) + 70 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) + 71 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) 
+ 72 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) + 73 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) + 74 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) + 75 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) + 76 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) + 77 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) + 78 0x256C ╬ (BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL) + 79 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) + 80 0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) + 81 0x2564 ╤ (BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE) + 82 0x2565 ╥ (BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE) + 83 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) + 84 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) + 85 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) + 86 0x2553 ╓ (BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE) + 87 0x256B ╫ (BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE) + 88 0x256A ╪ (BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE) + 89 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) + 90 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) + 91 0x2588 █ (FULL BLOCK) + 92 0x2584 ▄ (LOWER HALF BLOCK) + 93 0x258C ▌ (LEFT HALF BLOCK) + 94 0x2590 ▐ (RIGHT HALF BLOCK) + 95 0x2580 ▀ (UPPER HALF BLOCK) + 96 0x0440 р (CYRILLIC SMALL LETTER ER) + 97 0x0441 с (CYRILLIC SMALL LETTER ES) + 98 0x0442 т (CYRILLIC SMALL LETTER TE) + 99 0x0443 у (CYRILLIC SMALL LETTER U) +100 0x0444 ф (CYRILLIC SMALL LETTER EF) +101 0x0445 х (CYRILLIC SMALL LETTER HA) +102 0x0446 ц (CYRILLIC SMALL LETTER TSE) +103 0x0447 ч (CYRILLIC SMALL LETTER CHE) +104 0x0448 ш (CYRILLIC SMALL LETTER SHA) +105 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) +106 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) +107 0x044B ы (CYRILLIC SMALL LETTER YERU) +108 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) +109 0x044D э (CYRILLIC SMALL LETTER E) +110 0x044E ю (CYRILLIC SMALL LETTER YU) +111 0x044F я (CYRILLIC SMALL LETTER YA) +112 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) +113 0x0451 ё (CYRILLIC SMALL LETTER IO) 
+114 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) +115 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) +116 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) +117 0x0457 ї (CYRILLIC SMALL LETTER YI) +118 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) +119 0x045E ў (CYRILLIC SMALL LETTER SHORT U) +120 0x00B0 ° (DEGREE SIGN) +121 0x2219 ∙ (BULLET OPERATOR) +122 0x00B7 · (MIDDLE DOT) +123 0x221A √ (SQUARE ROOT) +124 0x2116 № (NUMERO SIGN) +125 0x00A4 ¤ (CURRENCY SIGN) +126 0x25A0 ■ (BLACK SQUARE) +127 0x00A0   (NO-BREAK SPACE) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-10.txt b/test/fixtures/encoding/single-byte/index-iso-8859-10.txt new file mode 100644 index 00000000000000..0097b42145eec5 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-10.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-10.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 02c2b5590d8ccda9931008c471f6ee2c590b2c8fe5e6ccb3b08638115d778507 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 34 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) + 35 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) + 36 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) + 37 0x0128 Ĩ (LATIN CAPITAL LETTER I WITH TILDE) + 38 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) + 39 0x00A7 § (SECTION SIGN) + 40 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) + 41 0x0110 Đ (LATIN CAPITAL LETTER 
D WITH STROKE) + 42 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 43 0x0166 Ŧ (LATIN CAPITAL LETTER T WITH STROKE) + 44 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) + 47 0x014A Ŋ (LATIN CAPITAL LETTER ENG) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 50 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) + 51 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) + 52 0x012B ī (LATIN SMALL LETTER I WITH MACRON) + 53 0x0129 ĩ (LATIN SMALL LETTER I WITH TILDE) + 54 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) + 55 0x00B7 · (MIDDLE DOT) + 56 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) + 57 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) + 58 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 59 0x0167 ŧ (LATIN SMALL LETTER T WITH STROKE) + 60 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 61 0x2015 ― (HORIZONTAL BAR) + 62 0x016B ū (LATIN SMALL LETTER U WITH MACRON) + 63 0x014B ŋ (LATIN SMALL LETTER ENG) + 64 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) + 81 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) + 82 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) + 83 0x00D3 Ó (LATIN CAPITAL 
LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x0168 Ũ (LATIN CAPITAL LETTER U WITH TILDE) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x012F į (LATIN SMALL LETTER I WITH OGONEK) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x00F0 ð (LATIN SMALL LETTER ETH) +113 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) +114 0x014D ō (LATIN SMALL LETTER O WITH MACRON) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x0169 ũ (LATIN SMALL LETTER U WITH TILDE) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH 
CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x00FE þ (LATIN SMALL LETTER THORN) +127 0x0138 ĸ (LATIN SMALL LETTER KRA) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-13.txt b/test/fixtures/encoding/single-byte/index-iso-8859-13.txt new file mode 100644 index 00000000000000..0e52de359b8286 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-13.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-13.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 40736338e964ab520407cebcb01329f8d450abf6ce12bf88b74b655b60e43300 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 53 0x00B5 µ (MICRO SIGN) + 54 
0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00E6 æ (LATIN SMALL LETTER AE) + 64 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 65 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) + 66 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) + 67 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 71 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) + 75 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) + 76 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) + 77 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) + 78 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) + 79 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) + 80 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) + 82 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) + 89 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) + 90 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) + 91 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 94 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 95 
0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 97 0x012F į (LATIN SMALL LETTER I WITH OGONEK) + 98 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) + 99 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +103 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) +107 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) +108 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) +109 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) +110 0x012B ī (LATIN SMALL LETTER I WITH MACRON) +111 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) +112 0x0161 š (LATIN SMALL LETTER S WITH CARON) +113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) +114 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x014D ō (LATIN SMALL LETTER O WITH MACRON) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) +121 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) +122 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) +123 0x016B ū (LATIN SMALL LETTER U WITH MACRON) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) +126 0x017E ž (LATIN SMALL LETTER Z WITH CARON) +127 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-14.txt b/test/fixtures/encoding/single-byte/index-iso-8859-14.txt new file mode 100644 index 00000000000000..d020592171b3be --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-14.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-14.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# 
Identifier: 2c8651cfc08b1f35b17919ee5379f2fa006af3ec809f11b3b7f470785580542b +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x1E02 Ḃ (LATIN CAPITAL LETTER B WITH DOT ABOVE) + 34 0x1E03 ḃ (LATIN SMALL LETTER B WITH DOT ABOVE) + 35 0x00A3 £ (POUND SIGN) + 36 0x010A Ċ (LATIN CAPITAL LETTER C WITH DOT ABOVE) + 37 0x010B ċ (LATIN SMALL LETTER C WITH DOT ABOVE) + 38 0x1E0A Ḋ (LATIN CAPITAL LETTER D WITH DOT ABOVE) + 39 0x00A7 § (SECTION SIGN) + 40 0x1E80 Ẁ (LATIN CAPITAL LETTER W WITH GRAVE) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x1E82 Ẃ (LATIN CAPITAL LETTER W WITH ACUTE) + 43 0x1E0B ḋ (LATIN SMALL LETTER D WITH DOT ABOVE) + 44 0x1EF2 Ỳ (LATIN CAPITAL LETTER Y WITH GRAVE) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 48 0x1E1E Ḟ (LATIN CAPITAL LETTER F WITH DOT ABOVE) + 49 0x1E1F ḟ (LATIN SMALL LETTER F WITH DOT ABOVE) + 50 0x0120 Ġ (LATIN CAPITAL LETTER G WITH DOT ABOVE) + 51 0x0121 ġ (LATIN SMALL LETTER G WITH DOT ABOVE) + 52 0x1E40 Ṁ (LATIN CAPITAL LETTER M WITH DOT ABOVE) + 53 0x1E41 ṁ (LATIN SMALL LETTER M WITH DOT ABOVE) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x1E56 Ṗ (LATIN CAPITAL LETTER P WITH DOT ABOVE) + 56 0x1E81 ẁ (LATIN SMALL LETTER W WITH GRAVE) + 57 0x1E57 ṗ (LATIN SMALL LETTER P WITH DOT ABOVE) + 58 0x1E83 ẃ (LATIN SMALL LETTER W WITH ACUTE) + 59 0x1E60 Ṡ (LATIN CAPITAL LETTER S WITH DOT ABOVE) + 60 0x1EF3 ỳ (LATIN SMALL LETTER Y WITH GRAVE) + 61 0x1E84 Ẅ 
(LATIN CAPITAL LETTER W WITH DIAERESIS) + 62 0x1E85 ẅ (LATIN SMALL LETTER W WITH DIAERESIS) + 63 0x1E61 ṡ (LATIN SMALL LETTER S WITH DOT ABOVE) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x0174 Ŵ (LATIN CAPITAL LETTER W WITH CIRCUMFLEX) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x1E6A Ṫ (LATIN CAPITAL LETTER T WITH DOT ABOVE) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x0176 Ŷ (LATIN CAPITAL LETTER Y WITH CIRCUMFLEX) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 
0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x0175 ŵ (LATIN SMALL LETTER W WITH CIRCUMFLEX) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x1E6B ṫ (LATIN SMALL LETTER T WITH DOT ABOVE) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x0177 ŷ (LATIN SMALL LETTER Y WITH CIRCUMFLEX) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-15.txt b/test/fixtures/encoding/single-byte/index-iso-8859-15.txt new file mode 100644 index 00000000000000..e673fa816b516d --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-15.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-15.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: a560aba47bccd7510a6ac77f671fe75dca3800f05cf6d676910c311a8f8ff079 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † 
() + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x20AC € (EURO SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 39 0x00A7 § (SECTION SIGN) + 40 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00AA ª (FEMININE ORDINAL INDICATOR) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00BA º (MASCULINE ORDINAL INDICATOR) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 61 0x0153 œ (LATIN SMALL LIGATURE OE) + 62 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 63 0x00BF ¿ (INVERTED QUESTION MARK) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 
0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x00F0 
ð (LATIN SMALL LETTER ETH) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x00FE þ (LATIN SMALL LETTER THORN) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-16.txt b/test/fixtures/encoding/single-byte/index-iso-8859-16.txt new file mode 100644 index 00000000000000..985176ff53676f --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-16.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-16.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 55676320d2d1b6e6909f5b3d741a7cf0cefc84e920aa4474afc091459111c2e3 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 34 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) + 36 0x20AC € (EURO SIGN) + 37 0x201E „ 
(DOUBLE LOW-9 QUOTATION MARK) + 38 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 39 0x00A7 § (SECTION SIGN) + 40 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x0218 Ș (LATIN CAPITAL LETTER S WITH COMMA BELOW) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) + 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) + 52 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 53 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 57 0x010D č (LATIN SMALL LETTER C WITH CARON) + 58 0x0219 ș (LATIN SMALL LETTER S WITH COMMA BELOW) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 61 0x0153 œ (LATIN SMALL LIGATURE OE) + 62 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH 
DIAERESIS) + 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) + 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) + 88 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 94 0x021A Ț (LATIN CAPITAL LETTER T WITH COMMA BELOW) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) +113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x015B ś 
(LATIN SMALL LETTER S WITH ACUTE) +120 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +126 0x021B ț (LATIN SMALL LETTER T WITH COMMA BELOW) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-2.txt b/test/fixtures/encoding/single-byte/index-iso-8859-2.txt new file mode 100644 index 00000000000000..3849517a980d92 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-2.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-2.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 9569c67f22d0b57790e1c407c6eecf227e4562322dc296de43cdab7a0152ec73 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 34 0x02D8 ˘ (BREVE) + 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x013D Ľ (LATIN CAPITAL LETTER L WITH CARON) + 38 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) + 43 0x0164 Ť (LATIN CAPITAL LETTER T WITH CARON) + 44 0x0179 Ź (LATIN CAPITAL LETTER 
Z WITH ACUTE) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 50 0x02DB ˛ (OGONEK) + 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x013E ľ (LATIN SMALL LETTER L WITH CARON) + 54 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) + 55 0x02C7 ˇ (CARON) + 56 0x00B8 ¸ (CEDILLA) + 57 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) + 59 0x0165 ť (LATIN SMALL LETTER T WITH CARON) + 60 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) + 61 0x02DD ˝ (DOUBLE ACUTE ACCENT) + 62 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) + 64 0x0154 Ŕ (LATIN CAPITAL LETTER R WITH ACUTE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x0139 Ĺ (LATIN CAPITAL LETTER L WITH ACUTE) + 70 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x011A Ě (LATIN CAPITAL LETTER E WITH CARON) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x010E Ď (LATIN CAPITAL LETTER D WITH CARON) + 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) + 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) + 82 0x0147 Ň (LATIN CAPITAL LETTER N WITH CARON) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × 
(MULTIPLICATION SIGN) + 88 0x0158 Ř (LATIN CAPITAL LETTER R WITH CARON) + 89 0x016E Ů (LATIN CAPITAL LETTER U WITH RING ABOVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x0162 Ţ (LATIN CAPITAL LETTER T WITH CEDILLA) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0155 ŕ (LATIN SMALL LETTER R WITH ACUTE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x013A ĺ (LATIN SMALL LETTER L WITH ACUTE) +102 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x011B ě (LATIN SMALL LETTER E WITH CARON) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x010F ď (LATIN SMALL LETTER D WITH CARON) +112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) +113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) +114 0x0148 ň (LATIN SMALL LETTER N WITH CARON) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x0159 ř (LATIN SMALL LETTER R WITH CARON) +121 0x016F ů (LATIN SMALL LETTER U WITH RING ABOVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x0163 ţ (LATIN SMALL LETTER T WITH CEDILLA) +127 0x02D9 ˙ (DOT ABOVE) diff --git 
a/test/fixtures/encoding/single-byte/index-iso-8859-3.txt b/test/fixtures/encoding/single-byte/index-iso-8859-3.txt new file mode 100644 index 00000000000000..e22c3b1099963c --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-3.txt @@ -0,0 +1,127 @@ +# For details on index index-iso-8859-3.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: af8f1e12df79b768322b5e83613698cdc619438270a2fc359554331c805054a3 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0126 Ħ (LATIN CAPITAL LETTER H WITH STROKE) + 34 0x02D8 ˘ (BREVE) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 38 0x0124 Ĥ (LATIN CAPITAL LETTER H WITH CIRCUMFLEX) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x0130 İ (LATIN CAPITAL LETTER I WITH DOT ABOVE) + 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) + 43 0x011E Ğ (LATIN CAPITAL LETTER G WITH BREVE) + 44 0x0134 Ĵ (LATIN CAPITAL LETTER J WITH CIRCUMFLEX) + 45 0x00AD ­ (SOFT HYPHEN) + 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x0127 ħ (LATIN SMALL LETTER H WITH STROKE) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x0125 ĥ (LATIN SMALL LETTER H WITH CIRCUMFLEX) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x0131 ı (LATIN SMALL LETTER DOTLESS I) + 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) + 59 0x011F ğ (LATIN 
SMALL LETTER G WITH BREVE) + 60 0x0135 ĵ (LATIN SMALL LETTER J WITH CIRCUMFLEX) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x010A Ċ (LATIN CAPITAL LETTER C WITH DOT ABOVE) + 70 0x0108 Ĉ (LATIN CAPITAL LETTER C WITH CIRCUMFLEX) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x0120 Ġ (LATIN CAPITAL LETTER G WITH DOT ABOVE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x011C Ĝ (LATIN CAPITAL LETTER G WITH CIRCUMFLEX) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x016C Ŭ (LATIN CAPITAL LETTER U WITH BREVE) + 94 0x015C Ŝ (LATIN CAPITAL LETTER S WITH CIRCUMFLEX) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x010B ċ (LATIN SMALL LETTER C WITH DOT ABOVE) +102 0x0109 ĉ (LATIN 
SMALL LETTER C WITH CIRCUMFLEX) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x0121 ġ (LATIN SMALL LETTER G WITH DOT ABOVE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x011D ĝ (LATIN SMALL LETTER G WITH CIRCUMFLEX) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x016D ŭ (LATIN SMALL LETTER U WITH BREVE) +126 0x015D ŝ (LATIN SMALL LETTER S WITH CIRCUMFLEX) +127 0x02D9 ˙ (DOT ABOVE) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-4.txt b/test/fixtures/encoding/single-byte/index-iso-8859-4.txt new file mode 100644 index 00000000000000..bd7f1270117c75 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-4.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-4.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 72f29c92344d351fe9e74a946e7e0468d76d542c6894ff82982cb652ebe0feb7 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ 
() + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 34 0x0138 ĸ (LATIN SMALL LETTER KRA) + 35 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x0128 Ĩ (LATIN CAPITAL LETTER I WITH TILDE) + 38 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 42 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) + 43 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) + 44 0x0166 Ŧ (LATIN CAPITAL LETTER T WITH STROKE) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 50 0x02DB ˛ (OGONEK) + 51 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x0129 ĩ (LATIN SMALL LETTER I WITH TILDE) + 54 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) + 55 0x02C7 ˇ (CARON) + 56 0x00B8 ¸ (CEDILLA) + 57 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 58 0x0113 ē (LATIN SMALL LETTER E WITH MACRON) + 59 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) + 60 0x0167 ŧ (LATIN SMALL LETTER T WITH STROKE) + 61 0x014A Ŋ (LATIN CAPITAL LETTER ENG) + 62 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 63 0x014B ŋ (LATIN SMALL LETTER ENG) + 64 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 
0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) + 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) + 81 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) + 82 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) + 83 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x0168 Ũ (LATIN CAPITAL LETTER U WITH TILDE) + 94 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x012F į (LATIN SMALL LETTER I WITH OGONEK) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x012B ī (LATIN SMALL LETTER I WITH MACRON) +112 0x0111 đ (LATIN SMALL LETTER D WITH 
STROKE) +113 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) +114 0x014D ō (LATIN SMALL LETTER O WITH MACRON) +115 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x0169 ũ (LATIN SMALL LETTER U WITH TILDE) +126 0x016B ū (LATIN SMALL LETTER U WITH MACRON) +127 0x02D9 ˙ (DOT ABOVE) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-5.txt b/test/fixtures/encoding/single-byte/index-iso-8859-5.txt new file mode 100644 index 00000000000000..fd096700881120 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-5.txt @@ -0,0 +1,134 @@ +# For details on index index-iso-8859-5.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: fa9b1f3f5242df43e2e7bca80e9b6997c67944f20a4af91ee06bacc4e132d9c9 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) + 34 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) + 35 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) + 36 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) + 37 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) 
+ 38 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) + 39 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) + 40 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) + 41 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) + 42 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) + 43 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) + 44 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) + 47 0x040F Џ (CYRILLIC CAPITAL LETTER DZHE) + 48 0x0410 А (CYRILLIC CAPITAL LETTER A) + 49 0x0411 Б (CYRILLIC CAPITAL LETTER BE) + 50 0x0412 В (CYRILLIC CAPITAL LETTER VE) + 51 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) + 52 0x0414 Д (CYRILLIC CAPITAL LETTER DE) + 53 0x0415 Е (CYRILLIC CAPITAL LETTER IE) + 54 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) + 55 0x0417 З (CYRILLIC CAPITAL LETTER ZE) + 56 0x0418 И (CYRILLIC CAPITAL LETTER I) + 57 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) + 58 0x041A К (CYRILLIC CAPITAL LETTER KA) + 59 0x041B Л (CYRILLIC CAPITAL LETTER EL) + 60 0x041C М (CYRILLIC CAPITAL LETTER EM) + 61 0x041D Н (CYRILLIC CAPITAL LETTER EN) + 62 0x041E О (CYRILLIC CAPITAL LETTER O) + 63 0x041F П (CYRILLIC CAPITAL LETTER PE) + 64 0x0420 Р (CYRILLIC CAPITAL LETTER ER) + 65 0x0421 С (CYRILLIC CAPITAL LETTER ES) + 66 0x0422 Т (CYRILLIC CAPITAL LETTER TE) + 67 0x0423 У (CYRILLIC CAPITAL LETTER U) + 68 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) + 69 0x0425 Х (CYRILLIC CAPITAL LETTER HA) + 70 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) + 71 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) + 72 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) + 73 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) + 74 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) + 75 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) + 76 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) + 77 0x042D Э (CYRILLIC CAPITAL LETTER E) + 78 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 79 0x042F Я (CYRILLIC CAPITAL LETTER YA) + 80 0x0430 а (CYRILLIC SMALL LETTER A) + 81 0x0431 б (CYRILLIC SMALL LETTER BE) + 82 0x0432 в (CYRILLIC SMALL LETTER VE) + 83 0x0433 г (CYRILLIC 
SMALL LETTER GHE) + 84 0x0434 д (CYRILLIC SMALL LETTER DE) + 85 0x0435 е (CYRILLIC SMALL LETTER IE) + 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) + 87 0x0437 з (CYRILLIC SMALL LETTER ZE) + 88 0x0438 и (CYRILLIC SMALL LETTER I) + 89 0x0439 й (CYRILLIC SMALL LETTER SHORT I) + 90 0x043A к (CYRILLIC SMALL LETTER KA) + 91 0x043B л (CYRILLIC SMALL LETTER EL) + 92 0x043C м (CYRILLIC SMALL LETTER EM) + 93 0x043D н (CYRILLIC SMALL LETTER EN) + 94 0x043E о (CYRILLIC SMALL LETTER O) + 95 0x043F п (CYRILLIC SMALL LETTER PE) + 96 0x0440 р (CYRILLIC SMALL LETTER ER) + 97 0x0441 с (CYRILLIC SMALL LETTER ES) + 98 0x0442 т (CYRILLIC SMALL LETTER TE) + 99 0x0443 у (CYRILLIC SMALL LETTER U) +100 0x0444 ф (CYRILLIC SMALL LETTER EF) +101 0x0445 х (CYRILLIC SMALL LETTER HA) +102 0x0446 ц (CYRILLIC SMALL LETTER TSE) +103 0x0447 ч (CYRILLIC SMALL LETTER CHE) +104 0x0448 ш (CYRILLIC SMALL LETTER SHA) +105 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) +106 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) +107 0x044B ы (CYRILLIC SMALL LETTER YERU) +108 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) +109 0x044D э (CYRILLIC SMALL LETTER E) +110 0x044E ю (CYRILLIC SMALL LETTER YU) +111 0x044F я (CYRILLIC SMALL LETTER YA) +112 0x2116 № (NUMERO SIGN) +113 0x0451 ё (CYRILLIC SMALL LETTER IO) +114 0x0452 ђ (CYRILLIC SMALL LETTER DJE) +115 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) +116 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) +117 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) +118 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) +119 0x0457 ї (CYRILLIC SMALL LETTER YI) +120 0x0458 ј (CYRILLIC SMALL LETTER JE) +121 0x0459 љ (CYRILLIC SMALL LETTER LJE) +122 0x045A њ (CYRILLIC SMALL LETTER NJE) +123 0x045B ћ (CYRILLIC SMALL LETTER TSHE) +124 0x045C ќ (CYRILLIC SMALL LETTER KJE) +125 0x00A7 § (SECTION SIGN) +126 0x045E ў (CYRILLIC SMALL LETTER SHORT U) +127 0x045F џ (CYRILLIC SMALL LETTER DZHE) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-6.txt b/test/fixtures/encoding/single-byte/index-iso-8859-6.txt new 
file mode 100644 index 00000000000000..8a6869100f0d12 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-6.txt @@ -0,0 +1,89 @@ +# For details on index index-iso-8859-6.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 85bb7b5c2dc75975afebe5743935ba4ed5a09c1e9e34e9bfb2ff80293f5d8bbc +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 36 0x00A4 ¤ (CURRENCY SIGN) + 44 0x060C ، (ARABIC COMMA) + 45 0x00AD ­ (SOFT HYPHEN) + 59 0x061B ؛ (ARABIC SEMICOLON) + 63 0x061F ؟ (ARABIC QUESTION MARK) + 65 0x0621 ء (ARABIC LETTER HAMZA) + 66 0x0622 آ (ARABIC LETTER ALEF WITH MADDA ABOVE) + 67 0x0623 أ (ARABIC LETTER ALEF WITH HAMZA ABOVE) + 68 0x0624 ؤ (ARABIC LETTER WAW WITH HAMZA ABOVE) + 69 0x0625 إ (ARABIC LETTER ALEF WITH HAMZA BELOW) + 70 0x0626 ئ (ARABIC LETTER YEH WITH HAMZA ABOVE) + 71 0x0627 ا (ARABIC LETTER ALEF) + 72 0x0628 ب (ARABIC LETTER BEH) + 73 0x0629 ة (ARABIC LETTER TEH MARBUTA) + 74 0x062A ت (ARABIC LETTER TEH) + 75 0x062B ث (ARABIC LETTER THEH) + 76 0x062C ج (ARABIC LETTER JEEM) + 77 0x062D ح (ARABIC LETTER HAH) + 78 0x062E خ (ARABIC LETTER KHAH) + 79 0x062F د (ARABIC LETTER DAL) + 80 0x0630 ذ (ARABIC LETTER THAL) + 81 0x0631 ر (ARABIC LETTER REH) + 82 0x0632 ز (ARABIC LETTER ZAIN) + 83 0x0633 س (ARABIC LETTER SEEN) + 84 0x0634 ش (ARABIC LETTER SHEEN) + 85 0x0635 ص (ARABIC LETTER SAD) + 86 0x0636 ض (ARABIC LETTER DAD) + 87 0x0637 ط (ARABIC LETTER TAH) + 88 0x0638 ظ (ARABIC LETTER 
ZAH) + 89 0x0639 ع (ARABIC LETTER AIN) + 90 0x063A غ (ARABIC LETTER GHAIN) + 96 0x0640 ـ (ARABIC TATWEEL) + 97 0x0641 ف (ARABIC LETTER FEH) + 98 0x0642 ق (ARABIC LETTER QAF) + 99 0x0643 ك (ARABIC LETTER KAF) +100 0x0644 ل (ARABIC LETTER LAM) +101 0x0645 م (ARABIC LETTER MEEM) +102 0x0646 ن (ARABIC LETTER NOON) +103 0x0647 ه (ARABIC LETTER HEH) +104 0x0648 و (ARABIC LETTER WAW) +105 0x0649 ى (ARABIC LETTER ALEF MAKSURA) +106 0x064A ي (ARABIC LETTER YEH) +107 0x064B ً (ARABIC FATHATAN) +108 0x064C ٌ (ARABIC DAMMATAN) +109 0x064D ٍ (ARABIC KASRATAN) +110 0x064E َ (ARABIC FATHA) +111 0x064F ُ (ARABIC DAMMA) +112 0x0650 ِ (ARABIC KASRA) +113 0x0651 ّ (ARABIC SHADDA) +114 0x0652 ْ (ARABIC SUKUN) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-7.txt b/test/fixtures/encoding/single-byte/index-iso-8859-7.txt new file mode 100644 index 00000000000000..484683ddcfd88a --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-7.txt @@ -0,0 +1,131 @@ +# For details on index index-iso-8859-7.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: f53d8aeba36314ef950eef02ffcf11dff540638ce27dfe7a86b6ccc6875afb24 +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 34 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 35 0x00A3 £ (POUND SIGN) + 36 0x20AC € (EURO SIGN) + 37 0x20AF ₯ (DRACHMA SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ 
(DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x037A ͺ (GREEK YPOGEGRAMMENI) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 47 0x2015 ― (HORIZONTAL BAR) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x0384 ΄ (GREEK TONOS) + 53 0x0385 ΅ (GREEK DIALYTIKA TONOS) + 54 0x0386 Ά (GREEK CAPITAL LETTER ALPHA WITH TONOS) + 55 0x00B7 · (MIDDLE DOT) + 56 0x0388 Έ (GREEK CAPITAL LETTER EPSILON WITH TONOS) + 57 0x0389 Ή (GREEK CAPITAL LETTER ETA WITH TONOS) + 58 0x038A Ί (GREEK CAPITAL LETTER IOTA WITH TONOS) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x038C Ό (GREEK CAPITAL LETTER OMICRON WITH TONOS) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x038E Ύ (GREEK CAPITAL LETTER UPSILON WITH TONOS) + 63 0x038F Ώ (GREEK CAPITAL LETTER OMEGA WITH TONOS) + 64 0x0390 ΐ (GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS) + 65 0x0391 Α (GREEK CAPITAL LETTER ALPHA) + 66 0x0392 Β (GREEK CAPITAL LETTER BETA) + 67 0x0393 Γ (GREEK CAPITAL LETTER GAMMA) + 68 0x0394 Δ (GREEK CAPITAL LETTER DELTA) + 69 0x0395 Ε (GREEK CAPITAL LETTER EPSILON) + 70 0x0396 Ζ (GREEK CAPITAL LETTER ZETA) + 71 0x0397 Η (GREEK CAPITAL LETTER ETA) + 72 0x0398 Θ (GREEK CAPITAL LETTER THETA) + 73 0x0399 Ι (GREEK CAPITAL LETTER IOTA) + 74 0x039A Κ (GREEK CAPITAL LETTER KAPPA) + 75 0x039B Λ (GREEK CAPITAL LETTER LAMDA) + 76 0x039C Μ (GREEK CAPITAL LETTER MU) + 77 0x039D Ν (GREEK CAPITAL LETTER NU) + 78 0x039E Ξ (GREEK CAPITAL LETTER XI) + 79 0x039F Ο (GREEK CAPITAL LETTER OMICRON) + 80 0x03A0 Π (GREEK CAPITAL LETTER PI) + 81 0x03A1 Ρ (GREEK CAPITAL LETTER RHO) + 83 0x03A3 Σ (GREEK CAPITAL LETTER SIGMA) + 84 0x03A4 Τ (GREEK CAPITAL LETTER TAU) + 85 0x03A5 Υ (GREEK CAPITAL LETTER UPSILON) + 86 0x03A6 Φ (GREEK CAPITAL LETTER PHI) + 87 0x03A7 Χ (GREEK CAPITAL LETTER CHI) + 88 0x03A8 Ψ (GREEK CAPITAL LETTER PSI) + 89 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) + 
90 0x03AA Ϊ (GREEK CAPITAL LETTER IOTA WITH DIALYTIKA) + 91 0x03AB Ϋ (GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA) + 92 0x03AC ά (GREEK SMALL LETTER ALPHA WITH TONOS) + 93 0x03AD έ (GREEK SMALL LETTER EPSILON WITH TONOS) + 94 0x03AE ή (GREEK SMALL LETTER ETA WITH TONOS) + 95 0x03AF ί (GREEK SMALL LETTER IOTA WITH TONOS) + 96 0x03B0 ΰ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS) + 97 0x03B1 α (GREEK SMALL LETTER ALPHA) + 98 0x03B2 β (GREEK SMALL LETTER BETA) + 99 0x03B3 γ (GREEK SMALL LETTER GAMMA) +100 0x03B4 δ (GREEK SMALL LETTER DELTA) +101 0x03B5 ε (GREEK SMALL LETTER EPSILON) +102 0x03B6 ζ (GREEK SMALL LETTER ZETA) +103 0x03B7 η (GREEK SMALL LETTER ETA) +104 0x03B8 θ (GREEK SMALL LETTER THETA) +105 0x03B9 ι (GREEK SMALL LETTER IOTA) +106 0x03BA κ (GREEK SMALL LETTER KAPPA) +107 0x03BB λ (GREEK SMALL LETTER LAMDA) +108 0x03BC μ (GREEK SMALL LETTER MU) +109 0x03BD ν (GREEK SMALL LETTER NU) +110 0x03BE ξ (GREEK SMALL LETTER XI) +111 0x03BF ο (GREEK SMALL LETTER OMICRON) +112 0x03C0 π (GREEK SMALL LETTER PI) +113 0x03C1 ρ (GREEK SMALL LETTER RHO) +114 0x03C2 ς (GREEK SMALL LETTER FINAL SIGMA) +115 0x03C3 σ (GREEK SMALL LETTER SIGMA) +116 0x03C4 τ (GREEK SMALL LETTER TAU) +117 0x03C5 υ (GREEK SMALL LETTER UPSILON) +118 0x03C6 φ (GREEK SMALL LETTER PHI) +119 0x03C7 χ (GREEK SMALL LETTER CHI) +120 0x03C8 ψ (GREEK SMALL LETTER PSI) +121 0x03C9 ω (GREEK SMALL LETTER OMEGA) +122 0x03CA ϊ (GREEK SMALL LETTER IOTA WITH DIALYTIKA) +123 0x03CB ϋ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA) +124 0x03CC ό (GREEK SMALL LETTER OMICRON WITH TONOS) +125 0x03CD ύ (GREEK SMALL LETTER UPSILON WITH TONOS) +126 0x03CE ώ (GREEK SMALL LETTER OMEGA WITH TONOS) diff --git a/test/fixtures/encoding/single-byte/index-iso-8859-8.txt b/test/fixtures/encoding/single-byte/index-iso-8859-8.txt new file mode 100644 index 00000000000000..b60cf8db324ed1 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-iso-8859-8.txt @@ -0,0 +1,98 @@ +# For details on index index-iso-8859-8.txt see 
the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 7657a9ca3fa875990da960d3f812eea28dcd0ae6ed55a18d5394303c86f5484b +# Date: 2024-09-18 + + 0 0x0080 € () + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x0085 … () + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x0091 ‘ () + 18 0x0092 ’ () + 19 0x0093 “ () + 20 0x0094 ” () + 21 0x0095 • () + 22 0x0096 – () + 23 0x0097 — () + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › () + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00D7 × (MULTIPLICATION SIGN) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00F7 ÷ (DIVISION SIGN) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 95 0x2017 ‗ (DOUBLE LOW LINE) + 96 0x05D0 א (HEBREW LETTER ALEF) + 97 0x05D1 ב (HEBREW LETTER BET) + 98 0x05D2 ג (HEBREW LETTER GIMEL) + 99 0x05D3 ד (HEBREW LETTER DALET) +100 0x05D4 ה (HEBREW LETTER HE) +101 0x05D5 ו (HEBREW LETTER VAV) +102 0x05D6 ז (HEBREW LETTER ZAYIN) +103 0x05D7 ח (HEBREW LETTER HET) +104 0x05D8 ט (HEBREW 
LETTER TET) +105 0x05D9 י (HEBREW LETTER YOD) +106 0x05DA ך (HEBREW LETTER FINAL KAF) +107 0x05DB כ (HEBREW LETTER KAF) +108 0x05DC ל (HEBREW LETTER LAMED) +109 0x05DD ם (HEBREW LETTER FINAL MEM) +110 0x05DE מ (HEBREW LETTER MEM) +111 0x05DF ן (HEBREW LETTER FINAL NUN) +112 0x05E0 נ (HEBREW LETTER NUN) +113 0x05E1 ס (HEBREW LETTER SAMEKH) +114 0x05E2 ע (HEBREW LETTER AYIN) +115 0x05E3 ף (HEBREW LETTER FINAL PE) +116 0x05E4 פ (HEBREW LETTER PE) +117 0x05E5 ץ (HEBREW LETTER FINAL TSADI) +118 0x05E6 צ (HEBREW LETTER TSADI) +119 0x05E7 ק (HEBREW LETTER QOF) +120 0x05E8 ר (HEBREW LETTER RESH) +121 0x05E9 ש (HEBREW LETTER SHIN) +122 0x05EA ת (HEBREW LETTER TAV) +125 0x200E ‎ (LEFT-TO-RIGHT MARK) +126 0x200F ‏ (RIGHT-TO-LEFT MARK) diff --git a/test/fixtures/encoding/single-byte/index-koi8-r.txt b/test/fixtures/encoding/single-byte/index-koi8-r.txt new file mode 100644 index 00000000000000..ef0b2d710b6504 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-koi8-r.txt @@ -0,0 +1,134 @@ +# For details on index index-koi8-r.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: c5497cd9071cb352c0e56b219154e539badf63de40b71578f09e2e11fe7d50ae +# Date: 2024-09-18 + + 0 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) + 1 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) + 2 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) + 3 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) + 4 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) + 5 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) + 6 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) + 7 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) + 8 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) + 9 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) + 10 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) + 11 0x2580 ▀ (UPPER HALF BLOCK) + 12 0x2584 ▄ (LOWER HALF BLOCK) + 13 0x2588 █ (FULL BLOCK) + 14 0x258C ▌ (LEFT HALF BLOCK) + 15 0x2590 ▐ (RIGHT HALF BLOCK) + 16 0x2591 ░ (LIGHT SHADE) + 17 0x2592 ▒ (MEDIUM SHADE) + 18 0x2593 ▓ (DARK SHADE) 
+ 19 0x2320 ⌠ (TOP HALF INTEGRAL) + 20 0x25A0 ■ (BLACK SQUARE) + 21 0x2219 ∙ (BULLET OPERATOR) + 22 0x221A √ (SQUARE ROOT) + 23 0x2248 ≈ (ALMOST EQUAL TO) + 24 0x2264 ≤ (LESS-THAN OR EQUAL TO) + 25 0x2265 ≥ (GREATER-THAN OR EQUAL TO) + 26 0x00A0   (NO-BREAK SPACE) + 27 0x2321 ⌡ (BOTTOM HALF INTEGRAL) + 28 0x00B0 ° (DEGREE SIGN) + 29 0x00B2 ² (SUPERSCRIPT TWO) + 30 0x00B7 · (MIDDLE DOT) + 31 0x00F7 ÷ (DIVISION SIGN) + 32 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) + 33 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) + 34 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) + 35 0x0451 ё (CYRILLIC SMALL LETTER IO) + 36 0x2553 ╓ (BOX DRAWINGS DOWN DOUBLE AND RIGHT SINGLE) + 37 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) + 38 0x2555 ╕ (BOX DRAWINGS DOWN SINGLE AND LEFT DOUBLE) + 39 0x2556 ╖ (BOX DRAWINGS DOWN DOUBLE AND LEFT SINGLE) + 40 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) + 41 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) + 42 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) + 43 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) + 44 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) + 45 0x255C ╜ (BOX DRAWINGS UP DOUBLE AND LEFT SINGLE) + 46 0x255D ╝ (BOX DRAWINGS DOUBLE UP AND LEFT) + 47 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) + 48 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) + 49 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) + 50 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) + 51 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) + 52 0x2562 ╢ (BOX DRAWINGS VERTICAL DOUBLE AND LEFT SINGLE) + 53 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) + 54 0x2564 ╤ (BOX DRAWINGS DOWN SINGLE AND HORIZONTAL DOUBLE) + 55 0x2565 ╥ (BOX DRAWINGS DOWN DOUBLE AND HORIZONTAL SINGLE) + 56 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) + 57 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) + 58 0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) + 59 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) + 60 0x256A ╪ (BOX DRAWINGS VERTICAL 
SINGLE AND HORIZONTAL DOUBLE) + 61 0x256B ╫ (BOX DRAWINGS VERTICAL DOUBLE AND HORIZONTAL SINGLE) + 62 0x256C ╬ (BOX DRAWINGS DOUBLE VERTICAL AND HORIZONTAL) + 63 0x00A9 © (COPYRIGHT SIGN) + 64 0x044E ю (CYRILLIC SMALL LETTER YU) + 65 0x0430 а (CYRILLIC SMALL LETTER A) + 66 0x0431 б (CYRILLIC SMALL LETTER BE) + 67 0x0446 ц (CYRILLIC SMALL LETTER TSE) + 68 0x0434 д (CYRILLIC SMALL LETTER DE) + 69 0x0435 е (CYRILLIC SMALL LETTER IE) + 70 0x0444 ф (CYRILLIC SMALL LETTER EF) + 71 0x0433 г (CYRILLIC SMALL LETTER GHE) + 72 0x0445 х (CYRILLIC SMALL LETTER HA) + 73 0x0438 и (CYRILLIC SMALL LETTER I) + 74 0x0439 й (CYRILLIC SMALL LETTER SHORT I) + 75 0x043A к (CYRILLIC SMALL LETTER KA) + 76 0x043B л (CYRILLIC SMALL LETTER EL) + 77 0x043C м (CYRILLIC SMALL LETTER EM) + 78 0x043D н (CYRILLIC SMALL LETTER EN) + 79 0x043E о (CYRILLIC SMALL LETTER O) + 80 0x043F п (CYRILLIC SMALL LETTER PE) + 81 0x044F я (CYRILLIC SMALL LETTER YA) + 82 0x0440 р (CYRILLIC SMALL LETTER ER) + 83 0x0441 с (CYRILLIC SMALL LETTER ES) + 84 0x0442 т (CYRILLIC SMALL LETTER TE) + 85 0x0443 у (CYRILLIC SMALL LETTER U) + 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) + 87 0x0432 в (CYRILLIC SMALL LETTER VE) + 88 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) + 89 0x044B ы (CYRILLIC SMALL LETTER YERU) + 90 0x0437 з (CYRILLIC SMALL LETTER ZE) + 91 0x0448 ш (CYRILLIC SMALL LETTER SHA) + 92 0x044D э (CYRILLIC SMALL LETTER E) + 93 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) + 94 0x0447 ч (CYRILLIC SMALL LETTER CHE) + 95 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) + 96 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 97 0x0410 А (CYRILLIC CAPITAL LETTER A) + 98 0x0411 Б (CYRILLIC CAPITAL LETTER BE) + 99 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) +100 0x0414 Д (CYRILLIC CAPITAL LETTER DE) +101 0x0415 Е (CYRILLIC CAPITAL LETTER IE) +102 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) +103 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) +104 0x0425 Х (CYRILLIC CAPITAL LETTER HA) +105 0x0418 И (CYRILLIC CAPITAL LETTER I) +106 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT 
I) +107 0x041A К (CYRILLIC CAPITAL LETTER KA) +108 0x041B Л (CYRILLIC CAPITAL LETTER EL) +109 0x041C М (CYRILLIC CAPITAL LETTER EM) +110 0x041D Н (CYRILLIC CAPITAL LETTER EN) +111 0x041E О (CYRILLIC CAPITAL LETTER O) +112 0x041F П (CYRILLIC CAPITAL LETTER PE) +113 0x042F Я (CYRILLIC CAPITAL LETTER YA) +114 0x0420 Р (CYRILLIC CAPITAL LETTER ER) +115 0x0421 С (CYRILLIC CAPITAL LETTER ES) +116 0x0422 Т (CYRILLIC CAPITAL LETTER TE) +117 0x0423 У (CYRILLIC CAPITAL LETTER U) +118 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) +119 0x0412 В (CYRILLIC CAPITAL LETTER VE) +120 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) +121 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) +122 0x0417 З (CYRILLIC CAPITAL LETTER ZE) +123 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) +124 0x042D Э (CYRILLIC CAPITAL LETTER E) +125 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) +126 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) +127 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) diff --git a/test/fixtures/encoding/single-byte/index-koi8-u.txt b/test/fixtures/encoding/single-byte/index-koi8-u.txt new file mode 100644 index 00000000000000..bfd717758c2edb --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-koi8-u.txt @@ -0,0 +1,134 @@ +# For details on index index-koi8-u.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 19a4da2c3f245118bbc8019326f45a07832949938ff903f03d62ac4da1f61f40 +# Date: 2024-09-18 + + 0 0x2500 ─ (BOX DRAWINGS LIGHT HORIZONTAL) + 1 0x2502 │ (BOX DRAWINGS LIGHT VERTICAL) + 2 0x250C ┌ (BOX DRAWINGS LIGHT DOWN AND RIGHT) + 3 0x2510 ┐ (BOX DRAWINGS LIGHT DOWN AND LEFT) + 4 0x2514 └ (BOX DRAWINGS LIGHT UP AND RIGHT) + 5 0x2518 ┘ (BOX DRAWINGS LIGHT UP AND LEFT) + 6 0x251C ├ (BOX DRAWINGS LIGHT VERTICAL AND RIGHT) + 7 0x2524 ┤ (BOX DRAWINGS LIGHT VERTICAL AND LEFT) + 8 0x252C ┬ (BOX DRAWINGS LIGHT DOWN AND HORIZONTAL) + 9 0x2534 ┴ (BOX DRAWINGS LIGHT UP AND HORIZONTAL) + 10 0x253C ┼ (BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL) + 11 0x2580 ▀ (UPPER HALF BLOCK) + 12 0x2584 ▄ 
(LOWER HALF BLOCK) + 13 0x2588 █ (FULL BLOCK) + 14 0x258C ▌ (LEFT HALF BLOCK) + 15 0x2590 ▐ (RIGHT HALF BLOCK) + 16 0x2591 ░ (LIGHT SHADE) + 17 0x2592 ▒ (MEDIUM SHADE) + 18 0x2593 ▓ (DARK SHADE) + 19 0x2320 ⌠ (TOP HALF INTEGRAL) + 20 0x25A0 ■ (BLACK SQUARE) + 21 0x2219 ∙ (BULLET OPERATOR) + 22 0x221A √ (SQUARE ROOT) + 23 0x2248 ≈ (ALMOST EQUAL TO) + 24 0x2264 ≤ (LESS-THAN OR EQUAL TO) + 25 0x2265 ≥ (GREATER-THAN OR EQUAL TO) + 26 0x00A0   (NO-BREAK SPACE) + 27 0x2321 ⌡ (BOTTOM HALF INTEGRAL) + 28 0x00B0 ° (DEGREE SIGN) + 29 0x00B2 ² (SUPERSCRIPT TWO) + 30 0x00B7 · (MIDDLE DOT) + 31 0x00F7 ÷ (DIVISION SIGN) + 32 0x2550 ═ (BOX DRAWINGS DOUBLE HORIZONTAL) + 33 0x2551 ║ (BOX DRAWINGS DOUBLE VERTICAL) + 34 0x2552 ╒ (BOX DRAWINGS DOWN SINGLE AND RIGHT DOUBLE) + 35 0x0451 ё (CYRILLIC SMALL LETTER IO) + 36 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) + 37 0x2554 ╔ (BOX DRAWINGS DOUBLE DOWN AND RIGHT) + 38 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) + 39 0x0457 ї (CYRILLIC SMALL LETTER YI) + 40 0x2557 ╗ (BOX DRAWINGS DOUBLE DOWN AND LEFT) + 41 0x2558 ╘ (BOX DRAWINGS UP SINGLE AND RIGHT DOUBLE) + 42 0x2559 ╙ (BOX DRAWINGS UP DOUBLE AND RIGHT SINGLE) + 43 0x255A ╚ (BOX DRAWINGS DOUBLE UP AND RIGHT) + 44 0x255B ╛ (BOX DRAWINGS UP SINGLE AND LEFT DOUBLE) + 45 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) + 46 0x045E ў (CYRILLIC SMALL LETTER SHORT U) + 47 0x255E ╞ (BOX DRAWINGS VERTICAL SINGLE AND RIGHT DOUBLE) + 48 0x255F ╟ (BOX DRAWINGS VERTICAL DOUBLE AND RIGHT SINGLE) + 49 0x2560 ╠ (BOX DRAWINGS DOUBLE VERTICAL AND RIGHT) + 50 0x2561 ╡ (BOX DRAWINGS VERTICAL SINGLE AND LEFT DOUBLE) + 51 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) + 52 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) + 53 0x2563 ╣ (BOX DRAWINGS DOUBLE VERTICAL AND LEFT) + 54 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) + 55 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) + 56 0x2566 ╦ (BOX DRAWINGS DOUBLE DOWN AND HORIZONTAL) + 57 0x2567 ╧ (BOX DRAWINGS UP SINGLE AND HORIZONTAL DOUBLE) + 58 
0x2568 ╨ (BOX DRAWINGS UP DOUBLE AND HORIZONTAL SINGLE) + 59 0x2569 ╩ (BOX DRAWINGS DOUBLE UP AND HORIZONTAL) + 60 0x256A ╪ (BOX DRAWINGS VERTICAL SINGLE AND HORIZONTAL DOUBLE) + 61 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) + 62 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) + 63 0x00A9 © (COPYRIGHT SIGN) + 64 0x044E ю (CYRILLIC SMALL LETTER YU) + 65 0x0430 а (CYRILLIC SMALL LETTER A) + 66 0x0431 б (CYRILLIC SMALL LETTER BE) + 67 0x0446 ц (CYRILLIC SMALL LETTER TSE) + 68 0x0434 д (CYRILLIC SMALL LETTER DE) + 69 0x0435 е (CYRILLIC SMALL LETTER IE) + 70 0x0444 ф (CYRILLIC SMALL LETTER EF) + 71 0x0433 г (CYRILLIC SMALL LETTER GHE) + 72 0x0445 х (CYRILLIC SMALL LETTER HA) + 73 0x0438 и (CYRILLIC SMALL LETTER I) + 74 0x0439 й (CYRILLIC SMALL LETTER SHORT I) + 75 0x043A к (CYRILLIC SMALL LETTER KA) + 76 0x043B л (CYRILLIC SMALL LETTER EL) + 77 0x043C м (CYRILLIC SMALL LETTER EM) + 78 0x043D н (CYRILLIC SMALL LETTER EN) + 79 0x043E о (CYRILLIC SMALL LETTER O) + 80 0x043F п (CYRILLIC SMALL LETTER PE) + 81 0x044F я (CYRILLIC SMALL LETTER YA) + 82 0x0440 р (CYRILLIC SMALL LETTER ER) + 83 0x0441 с (CYRILLIC SMALL LETTER ES) + 84 0x0442 т (CYRILLIC SMALL LETTER TE) + 85 0x0443 у (CYRILLIC SMALL LETTER U) + 86 0x0436 ж (CYRILLIC SMALL LETTER ZHE) + 87 0x0432 в (CYRILLIC SMALL LETTER VE) + 88 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) + 89 0x044B ы (CYRILLIC SMALL LETTER YERU) + 90 0x0437 з (CYRILLIC SMALL LETTER ZE) + 91 0x0448 ш (CYRILLIC SMALL LETTER SHA) + 92 0x044D э (CYRILLIC SMALL LETTER E) + 93 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) + 94 0x0447 ч (CYRILLIC SMALL LETTER CHE) + 95 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) + 96 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 97 0x0410 А (CYRILLIC CAPITAL LETTER A) + 98 0x0411 Б (CYRILLIC CAPITAL LETTER BE) + 99 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) +100 0x0414 Д (CYRILLIC CAPITAL LETTER DE) +101 0x0415 Е (CYRILLIC CAPITAL LETTER IE) +102 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) +103 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) +104 
0x0425 Х (CYRILLIC CAPITAL LETTER HA) +105 0x0418 И (CYRILLIC CAPITAL LETTER I) +106 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) +107 0x041A К (CYRILLIC CAPITAL LETTER KA) +108 0x041B Л (CYRILLIC CAPITAL LETTER EL) +109 0x041C М (CYRILLIC CAPITAL LETTER EM) +110 0x041D Н (CYRILLIC CAPITAL LETTER EN) +111 0x041E О (CYRILLIC CAPITAL LETTER O) +112 0x041F П (CYRILLIC CAPITAL LETTER PE) +113 0x042F Я (CYRILLIC CAPITAL LETTER YA) +114 0x0420 Р (CYRILLIC CAPITAL LETTER ER) +115 0x0421 С (CYRILLIC CAPITAL LETTER ES) +116 0x0422 Т (CYRILLIC CAPITAL LETTER TE) +117 0x0423 У (CYRILLIC CAPITAL LETTER U) +118 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) +119 0x0412 В (CYRILLIC CAPITAL LETTER VE) +120 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) +121 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) +122 0x0417 З (CYRILLIC CAPITAL LETTER ZE) +123 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) +124 0x042D Э (CYRILLIC CAPITAL LETTER E) +125 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) +126 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) +127 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) diff --git a/test/fixtures/encoding/single-byte/index-macintosh.txt b/test/fixtures/encoding/single-byte/index-macintosh.txt new file mode 100644 index 00000000000000..fb166d4dc13cb6 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-macintosh.txt @@ -0,0 +1,134 @@ +# For details on index index-macintosh.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: f2c6a4f6406b3e86a50a5dba4d2b7dd48e2e33c0d82aefe764535c934ec11764 +# Date: 2024-09-18 + + 0 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 1 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 2 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 3 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 4 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 5 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 6 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 7 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 8 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 9 
0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 10 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) + 11 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) + 12 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) + 13 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) + 14 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) + 15 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) + 16 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) + 17 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) + 18 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) + 19 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) + 20 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) + 21 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) + 22 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) + 23 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) + 24 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) + 25 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) + 26 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) + 27 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) + 28 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) + 29 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) + 30 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) + 31 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) + 32 0x2020 † (DAGGER) + 33 0x00B0 ° (DEGREE SIGN) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A7 § (SECTION SIGN) + 37 0x2022 • (BULLET) + 38 0x00B6 ¶ (PILCROW SIGN) + 39 0x00DF ß (LATIN SMALL LETTER SHARP S) + 40 0x00AE ® (REGISTERED SIGN) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x2122 ™ (TRADE MARK SIGN) + 43 0x00B4 ´ (ACUTE ACCENT) + 44 0x00A8 ¨ (DIAERESIS) + 45 0x2260 ≠ (NOT EQUAL TO) + 46 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 47 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 48 0x221E ∞ (INFINITY) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x2264 ≤ (LESS-THAN OR EQUAL TO) + 51 0x2265 ≥ (GREATER-THAN OR EQUAL TO) + 52 0x00A5 ¥ (YEN SIGN) + 53 0x00B5 µ (MICRO SIGN) + 54 0x2202 ∂ (PARTIAL DIFFERENTIAL) + 55 0x2211 ∑ (N-ARY SUMMATION) + 56 0x220F ∏ (N-ARY PRODUCT) + 57 0x03C0 π (GREEK SMALL LETTER PI) + 58 0x222B ∫ (INTEGRAL) 
+ 59 0x00AA ª (FEMININE ORDINAL INDICATOR) + 60 0x00BA º (MASCULINE ORDINAL INDICATOR) + 61 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) + 62 0x00E6 æ (LATIN SMALL LETTER AE) + 63 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) + 64 0x00BF ¿ (INVERTED QUESTION MARK) + 65 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 66 0x00AC ¬ (NOT SIGN) + 67 0x221A √ (SQUARE ROOT) + 68 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 69 0x2248 ≈ (ALMOST EQUAL TO) + 70 0x2206 ∆ (INCREMENT) + 71 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 72 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 73 0x2026 … (HORIZONTAL ELLIPSIS) + 74 0x00A0   (NO-BREAK SPACE) + 75 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 76 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 77 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 78 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 79 0x0153 œ (LATIN SMALL LIGATURE OE) + 80 0x2013 – (EN DASH) + 81 0x2014 — (EM DASH) + 82 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 83 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 84 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 85 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 86 0x00F7 ÷ (DIVISION SIGN) + 87 0x25CA ◊ (LOZENGE) + 88 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) + 89 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 90 0x2044 ⁄ (FRACTION SLASH) + 91 0x20AC € (EURO SIGN) + 92 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 93 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 94 0xFB01 fi (LATIN SMALL LIGATURE FI) + 95 0xFB02 fl (LATIN SMALL LIGATURE FL) + 96 0x2021 ‡ (DOUBLE DAGGER) + 97 0x00B7 · (MIDDLE DOT) + 98 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 99 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) +100 0x2030 ‰ (PER MILLE SIGN) +101 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) +102 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) +103 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) +104 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) +105 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) +106 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) 
+107 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) +108 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) +109 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) +110 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) +111 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) +112 0xF8FF  () +113 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) +114 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) +115 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) +116 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) +117 0x0131 ı (LATIN SMALL LETTER DOTLESS I) +118 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) +119 0x02DC ˜ (SMALL TILDE) +120 0x00AF ¯ (MACRON) +121 0x02D8 ˘ (BREVE) +122 0x02D9 ˙ (DOT ABOVE) +123 0x02DA ˚ (RING ABOVE) +124 0x00B8 ¸ (CEDILLA) +125 0x02DD ˝ (DOUBLE ACUTE ACCENT) +126 0x02DB ˛ (OGONEK) +127 0x02C7 ˇ (CARON) diff --git a/test/fixtures/encoding/single-byte/index-windows-1250.txt b/test/fixtures/encoding/single-byte/index-windows-1250.txt new file mode 100644 index 00000000000000..24fa6c9a89e3cb --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1250.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1250.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 0669455a7a1c70ba6003ea737991e8ee9adc455125c13cfe6705a361358de5fa +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0083 ƒ () + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x0088 ˆ () + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) + 13 0x0164 Ť (LATIN CAPITAL LETTER T WITH CARON) + 14 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 15 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) 
+ 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x0098 ˜ () + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) + 29 0x0165 ť (LATIN SMALL LETTER T WITH CARON) + 30 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 31 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) + 32 0x00A0   (NO-BREAK SPACE) + 33 0x02C7 ˇ (CARON) + 34 0x02D8 ˘ (BREVE) + 35 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x02DB ˛ (OGONEK) + 51 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 58 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x013D Ľ (LATIN CAPITAL LETTER L WITH CARON) + 61 0x02DD ˝ (DOUBLE ACUTE ACCENT) + 62 0x013E ľ (LATIN SMALL LETTER L WITH CARON) + 63 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) + 64 0x0154 Ŕ (LATIN CAPITAL LETTER R WITH ACUTE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x0139 Ĺ (LATIN CAPITAL LETTER L WITH ACUTE) + 70 
0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x011A Ě (LATIN CAPITAL LETTER E WITH CARON) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x010E Ď (LATIN CAPITAL LETTER D WITH CARON) + 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) + 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) + 82 0x0147 Ň (LATIN CAPITAL LETTER N WITH CARON) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x0150 Ő (LATIN CAPITAL LETTER O WITH DOUBLE ACUTE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x0158 Ř (LATIN CAPITAL LETTER R WITH CARON) + 89 0x016E Ů (LATIN CAPITAL LETTER U WITH RING ABOVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x0170 Ű (LATIN CAPITAL LETTER U WITH DOUBLE ACUTE) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x0162 Ţ (LATIN CAPITAL LETTER T WITH CEDILLA) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0155 ŕ (LATIN SMALL LETTER R WITH ACUTE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x013A ĺ (LATIN SMALL LETTER L WITH ACUTE) +102 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x011B ě (LATIN SMALL LETTER E WITH CARON) +109 0x00ED í (LATIN SMALL LETTER I 
WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x010F ď (LATIN SMALL LETTER D WITH CARON) +112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) +113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) +114 0x0148 ň (LATIN SMALL LETTER N WITH CARON) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x0151 ő (LATIN SMALL LETTER O WITH DOUBLE ACUTE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x0159 ř (LATIN SMALL LETTER R WITH CARON) +121 0x016F ů (LATIN SMALL LETTER U WITH RING ABOVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x0171 ű (LATIN SMALL LETTER U WITH DOUBLE ACUTE) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x0163 ţ (LATIN SMALL LETTER T WITH CEDILLA) +127 0x02D9 ˙ (DOT ABOVE) diff --git a/test/fixtures/encoding/single-byte/index-windows-1251.txt b/test/fixtures/encoding/single-byte/index-windows-1251.txt new file mode 100644 index 00000000000000..ee4533c51ce761 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1251.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1251.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 7592ef921679ba168b00a9e9afa3b4eebd67bf13dc7e84c4b6e120de856826e0 +# Date: 2024-09-18 + + 0 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) + 1 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x20AC € (EURO SIGN) + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) + 13 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) + 14 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) + 15 0x040F Џ (CYRILLIC CAPITAL 
LETTER DZHE) + 16 0x0452 ђ (CYRILLIC SMALL LETTER DJE) + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x0098 ˜ () + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x0459 љ (CYRILLIC SMALL LETTER LJE) + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x045A њ (CYRILLIC SMALL LETTER NJE) + 29 0x045C ќ (CYRILLIC SMALL LETTER KJE) + 30 0x045B ћ (CYRILLIC SMALL LETTER TSHE) + 31 0x045F џ (CYRILLIC SMALL LETTER DZHE) + 32 0x00A0   (NO-BREAK SPACE) + 33 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) + 34 0x045E ў (CYRILLIC SMALL LETTER SHORT U) + 35 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) + 51 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) + 52 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x0451 ё (CYRILLIC SMALL LETTER IO) + 57 0x2116 № (NUMERO SIGN) + 58 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x0458 ј (CYRILLIC SMALL LETTER JE) + 61 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) + 62 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) + 63 0x0457 ї (CYRILLIC SMALL LETTER YI) + 64 0x0410 А (CYRILLIC CAPITAL LETTER A) + 65 0x0411 Б (CYRILLIC CAPITAL LETTER BE) 
+ 66 0x0412 В (CYRILLIC CAPITAL LETTER VE) + 67 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) + 68 0x0414 Д (CYRILLIC CAPITAL LETTER DE) + 69 0x0415 Е (CYRILLIC CAPITAL LETTER IE) + 70 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) + 71 0x0417 З (CYRILLIC CAPITAL LETTER ZE) + 72 0x0418 И (CYRILLIC CAPITAL LETTER I) + 73 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) + 74 0x041A К (CYRILLIC CAPITAL LETTER KA) + 75 0x041B Л (CYRILLIC CAPITAL LETTER EL) + 76 0x041C М (CYRILLIC CAPITAL LETTER EM) + 77 0x041D Н (CYRILLIC CAPITAL LETTER EN) + 78 0x041E О (CYRILLIC CAPITAL LETTER O) + 79 0x041F П (CYRILLIC CAPITAL LETTER PE) + 80 0x0420 Р (CYRILLIC CAPITAL LETTER ER) + 81 0x0421 С (CYRILLIC CAPITAL LETTER ES) + 82 0x0422 Т (CYRILLIC CAPITAL LETTER TE) + 83 0x0423 У (CYRILLIC CAPITAL LETTER U) + 84 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) + 85 0x0425 Х (CYRILLIC CAPITAL LETTER HA) + 86 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) + 87 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) + 88 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) + 89 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) + 90 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) + 91 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) + 92 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) + 93 0x042D Э (CYRILLIC CAPITAL LETTER E) + 94 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 95 0x042F Я (CYRILLIC CAPITAL LETTER YA) + 96 0x0430 а (CYRILLIC SMALL LETTER A) + 97 0x0431 б (CYRILLIC SMALL LETTER BE) + 98 0x0432 в (CYRILLIC SMALL LETTER VE) + 99 0x0433 г (CYRILLIC SMALL LETTER GHE) +100 0x0434 д (CYRILLIC SMALL LETTER DE) +101 0x0435 е (CYRILLIC SMALL LETTER IE) +102 0x0436 ж (CYRILLIC SMALL LETTER ZHE) +103 0x0437 з (CYRILLIC SMALL LETTER ZE) +104 0x0438 и (CYRILLIC SMALL LETTER I) +105 0x0439 й (CYRILLIC SMALL LETTER SHORT I) +106 0x043A к (CYRILLIC SMALL LETTER KA) +107 0x043B л (CYRILLIC SMALL LETTER EL) +108 0x043C м (CYRILLIC SMALL LETTER EM) +109 0x043D н (CYRILLIC SMALL LETTER EN) +110 0x043E о (CYRILLIC SMALL LETTER O) +111 0x043F п (CYRILLIC SMALL LETTER PE) +112 0x0440 р (CYRILLIC 
SMALL LETTER ER) +113 0x0441 с (CYRILLIC SMALL LETTER ES) +114 0x0442 т (CYRILLIC SMALL LETTER TE) +115 0x0443 у (CYRILLIC SMALL LETTER U) +116 0x0444 ф (CYRILLIC SMALL LETTER EF) +117 0x0445 х (CYRILLIC SMALL LETTER HA) +118 0x0446 ц (CYRILLIC SMALL LETTER TSE) +119 0x0447 ч (CYRILLIC SMALL LETTER CHE) +120 0x0448 ш (CYRILLIC SMALL LETTER SHA) +121 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) +122 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) +123 0x044B ы (CYRILLIC SMALL LETTER YERU) +124 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) +125 0x044D э (CYRILLIC SMALL LETTER E) +126 0x044E ю (CYRILLIC SMALL LETTER YU) +127 0x044F я (CYRILLIC SMALL LETTER YA) diff --git a/test/fixtures/encoding/single-byte/index-windows-1252.txt b/test/fixtures/encoding/single-byte/index-windows-1252.txt new file mode 100644 index 00000000000000..af1edbcd19cede --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1252.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1252.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: e56d49d9176e9a412283cf29ac9bd613f5620462f2a080a84eceaf974cfa18b7 +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 13 0x008D  () + 14 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x02DC ˜ 
(SMALL TILDE) + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x0153 œ (LATIN SMALL LIGATURE OE) + 29 0x009D  () + 30 0x017E ž (LATIN SMALL LETTER Z WITH CARON) + 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 32 0x00A0   (NO-BREAK SPACE) + 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00AA ª (FEMININE ORDINAL INDICATOR) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00BA º (MASCULINE ORDINAL INDICATOR) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00BF ¿ (INVERTED QUESTION MARK) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 
76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x00D0 Ð (LATIN CAPITAL LETTER ETH) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x00DD Ý (LATIN CAPITAL LETTER Y WITH ACUTE) + 94 0x00DE Þ (LATIN CAPITAL LETTER THORN) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x00F0 ð (LATIN SMALL LETTER ETH) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O 
WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x00FD ý (LATIN SMALL LETTER Y WITH ACUTE) +126 0x00FE þ (LATIN SMALL LETTER THORN) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-windows-1253.txt b/test/fixtures/encoding/single-byte/index-windows-1253.txt new file mode 100644 index 00000000000000..1b025ce1588d0e --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1253.txt @@ -0,0 +1,131 @@ +# For details on index index-windows-1253.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 49fdc881a3488904dd1e8dfba9aef3258454249958b611bcded1d4c981ab5561 +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x0088 ˆ () + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x008A Š () + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x0098 ˜ () + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x009A š () + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0385 ΅ (GREEK 
DIALYTIKA TONOS) + 34 0x0386 Ά (GREEK CAPITAL LETTER ALPHA WITH TONOS) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x2015 ― (HORIZONTAL BAR) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x0384 ΄ (GREEK TONOS) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x0388 Έ (GREEK CAPITAL LETTER EPSILON WITH TONOS) + 57 0x0389 Ή (GREEK CAPITAL LETTER ETA WITH TONOS) + 58 0x038A Ί (GREEK CAPITAL LETTER IOTA WITH TONOS) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x038C Ό (GREEK CAPITAL LETTER OMICRON WITH TONOS) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x038E Ύ (GREEK CAPITAL LETTER UPSILON WITH TONOS) + 63 0x038F Ώ (GREEK CAPITAL LETTER OMEGA WITH TONOS) + 64 0x0390 ΐ (GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS) + 65 0x0391 Α (GREEK CAPITAL LETTER ALPHA) + 66 0x0392 Β (GREEK CAPITAL LETTER BETA) + 67 0x0393 Γ (GREEK CAPITAL LETTER GAMMA) + 68 0x0394 Δ (GREEK CAPITAL LETTER DELTA) + 69 0x0395 Ε (GREEK CAPITAL LETTER EPSILON) + 70 0x0396 Ζ (GREEK CAPITAL LETTER ZETA) + 71 0x0397 Η (GREEK CAPITAL LETTER ETA) + 72 0x0398 Θ (GREEK CAPITAL LETTER THETA) + 73 0x0399 Ι (GREEK CAPITAL LETTER IOTA) + 74 0x039A Κ (GREEK CAPITAL LETTER KAPPA) + 75 0x039B Λ (GREEK CAPITAL LETTER LAMDA) + 76 0x039C Μ (GREEK CAPITAL LETTER MU) + 77 0x039D Ν (GREEK CAPITAL LETTER NU) + 78 0x039E Ξ (GREEK CAPITAL LETTER XI) + 79 0x039F Ο (GREEK CAPITAL LETTER OMICRON) + 80 0x03A0 Π (GREEK CAPITAL LETTER PI) + 81 0x03A1 Ρ (GREEK CAPITAL LETTER RHO) + 83 0x03A3 Σ (GREEK CAPITAL LETTER SIGMA) + 84 0x03A4 Τ (GREEK CAPITAL LETTER TAU) + 85 0x03A5 Υ (GREEK CAPITAL 
LETTER UPSILON) + 86 0x03A6 Φ (GREEK CAPITAL LETTER PHI) + 87 0x03A7 Χ (GREEK CAPITAL LETTER CHI) + 88 0x03A8 Ψ (GREEK CAPITAL LETTER PSI) + 89 0x03A9 Ω (GREEK CAPITAL LETTER OMEGA) + 90 0x03AA Ϊ (GREEK CAPITAL LETTER IOTA WITH DIALYTIKA) + 91 0x03AB Ϋ (GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA) + 92 0x03AC ά (GREEK SMALL LETTER ALPHA WITH TONOS) + 93 0x03AD έ (GREEK SMALL LETTER EPSILON WITH TONOS) + 94 0x03AE ή (GREEK SMALL LETTER ETA WITH TONOS) + 95 0x03AF ί (GREEK SMALL LETTER IOTA WITH TONOS) + 96 0x03B0 ΰ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS) + 97 0x03B1 α (GREEK SMALL LETTER ALPHA) + 98 0x03B2 β (GREEK SMALL LETTER BETA) + 99 0x03B3 γ (GREEK SMALL LETTER GAMMA) +100 0x03B4 δ (GREEK SMALL LETTER DELTA) +101 0x03B5 ε (GREEK SMALL LETTER EPSILON) +102 0x03B6 ζ (GREEK SMALL LETTER ZETA) +103 0x03B7 η (GREEK SMALL LETTER ETA) +104 0x03B8 θ (GREEK SMALL LETTER THETA) +105 0x03B9 ι (GREEK SMALL LETTER IOTA) +106 0x03BA κ (GREEK SMALL LETTER KAPPA) +107 0x03BB λ (GREEK SMALL LETTER LAMDA) +108 0x03BC μ (GREEK SMALL LETTER MU) +109 0x03BD ν (GREEK SMALL LETTER NU) +110 0x03BE ξ (GREEK SMALL LETTER XI) +111 0x03BF ο (GREEK SMALL LETTER OMICRON) +112 0x03C0 π (GREEK SMALL LETTER PI) +113 0x03C1 ρ (GREEK SMALL LETTER RHO) +114 0x03C2 ς (GREEK SMALL LETTER FINAL SIGMA) +115 0x03C3 σ (GREEK SMALL LETTER SIGMA) +116 0x03C4 τ (GREEK SMALL LETTER TAU) +117 0x03C5 υ (GREEK SMALL LETTER UPSILON) +118 0x03C6 φ (GREEK SMALL LETTER PHI) +119 0x03C7 χ (GREEK SMALL LETTER CHI) +120 0x03C8 ψ (GREEK SMALL LETTER PSI) +121 0x03C9 ω (GREEK SMALL LETTER OMEGA) +122 0x03CA ϊ (GREEK SMALL LETTER IOTA WITH DIALYTIKA) +123 0x03CB ϋ (GREEK SMALL LETTER UPSILON WITH DIALYTIKA) +124 0x03CC ό (GREEK SMALL LETTER OMICRON WITH TONOS) +125 0x03CD ύ (GREEK SMALL LETTER UPSILON WITH TONOS) +126 0x03CE ώ (GREEK SMALL LETTER OMEGA WITH TONOS) diff --git a/test/fixtures/encoding/single-byte/index-windows-1254.txt b/test/fixtures/encoding/single-byte/index-windows-1254.txt new file 
mode 100644 index 00000000000000..4195f3dbc029e2 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1254.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1254.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: e80a27adf377438be8ba5bd223875ea56d6a4d47f958cce1c957a2c446825caa +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x02DC ˜ (SMALL TILDE) + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x0161 š (LATIN SMALL LETTER S WITH CARON) + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x0153 œ (LATIN SMALL LIGATURE OE) + 29 0x009D  () + 30 0x009E ž () + 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 32 0x00A0   (NO-BREAK SPACE) + 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00AA ª (FEMININE ORDINAL INDICATOR) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 
± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00BA º (MASCULINE ORDINAL INDICATOR) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00BF ¿ (INVERTED QUESTION MARK) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x00C3 à (LATIN CAPITAL LETTER A WITH TILDE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x00CC Ì (LATIN CAPITAL LETTER I WITH GRAVE) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x011E Ğ (LATIN CAPITAL LETTER G WITH BREVE) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x00D2 Ò (LATIN CAPITAL LETTER O WITH GRAVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH 
DIAERESIS) + 93 0x0130 İ (LATIN CAPITAL LETTER I WITH DOT ABOVE) + 94 0x015E Ş (LATIN CAPITAL LETTER S WITH CEDILLA) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x00E3 ã (LATIN SMALL LETTER A WITH TILDE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x00EC ì (LATIN SMALL LETTER I WITH GRAVE) +109 0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x011F ğ (LATIN SMALL LETTER G WITH BREVE) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x00F2 ò (LATIN SMALL LETTER O WITH GRAVE) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x0131 ı (LATIN SMALL LETTER DOTLESS I) +126 0x015F ş (LATIN SMALL LETTER S WITH CEDILLA) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-windows-1255.txt b/test/fixtures/encoding/single-byte/index-windows-1255.txt new file mode 100644 index 00000000000000..3efcbc74e8c89f --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1255.txt @@ -0,0 
+1,124 @@ +# For details on index index-windows-1255.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: cd7fb43c97eefa1651084d92d02af53ad668bd848528c18c3b1af5c06b499651 +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x008A Š () + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x02DC ˜ (SMALL TILDE) + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x009A š () + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x20AA ₪ (NEW SHEQEL SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00D7 × (MULTIPLICATION SIGN) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00F7 ÷ (DIVISION SIGN) + 59 
0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00BF ¿ (INVERTED QUESTION MARK) + 64 0x05B0 ְ (HEBREW POINT SHEVA) + 65 0x05B1 ֱ (HEBREW POINT HATAF SEGOL) + 66 0x05B2 ֲ (HEBREW POINT HATAF PATAH) + 67 0x05B3 ֳ (HEBREW POINT HATAF QAMATS) + 68 0x05B4 ִ (HEBREW POINT HIRIQ) + 69 0x05B5 ֵ (HEBREW POINT TSERE) + 70 0x05B6 ֶ (HEBREW POINT SEGOL) + 71 0x05B7 ַ (HEBREW POINT PATAH) + 72 0x05B8 ָ (HEBREW POINT QAMATS) + 73 0x05B9 ֹ (HEBREW POINT HOLAM) + 74 0x05BA ֺ (HEBREW POINT HOLAM HASER FOR VAV) + 75 0x05BB ֻ (HEBREW POINT QUBUTS) + 76 0x05BC ּ (HEBREW POINT DAGESH OR MAPIQ) + 77 0x05BD ֽ (HEBREW POINT METEG) + 78 0x05BE ־ (HEBREW PUNCTUATION MAQAF) + 79 0x05BF ֿ (HEBREW POINT RAFE) + 80 0x05C0 ׀ (HEBREW PUNCTUATION PASEQ) + 81 0x05C1 ׁ (HEBREW POINT SHIN DOT) + 82 0x05C2 ׂ (HEBREW POINT SIN DOT) + 83 0x05C3 ׃ (HEBREW PUNCTUATION SOF PASUQ) + 84 0x05F0 װ (HEBREW LIGATURE YIDDISH DOUBLE VAV) + 85 0x05F1 ױ (HEBREW LIGATURE YIDDISH VAV YOD) + 86 0x05F2 ײ (HEBREW LIGATURE YIDDISH DOUBLE YOD) + 87 0x05F3 ׳ (HEBREW PUNCTUATION GERESH) + 88 0x05F4 ״ (HEBREW PUNCTUATION GERSHAYIM) + 96 0x05D0 א (HEBREW LETTER ALEF) + 97 0x05D1 ב (HEBREW LETTER BET) + 98 0x05D2 ג (HEBREW LETTER GIMEL) + 99 0x05D3 ד (HEBREW LETTER DALET) +100 0x05D4 ה (HEBREW LETTER HE) +101 0x05D5 ו (HEBREW LETTER VAV) +102 0x05D6 ז (HEBREW LETTER ZAYIN) +103 0x05D7 ח (HEBREW LETTER HET) +104 0x05D8 ט (HEBREW LETTER TET) +105 0x05D9 י (HEBREW LETTER YOD) +106 0x05DA ך (HEBREW LETTER FINAL KAF) +107 0x05DB כ (HEBREW LETTER KAF) +108 0x05DC ל (HEBREW LETTER LAMED) +109 0x05DD ם (HEBREW LETTER FINAL MEM) +110 0x05DE מ (HEBREW LETTER MEM) +111 0x05DF ן (HEBREW LETTER FINAL NUN) +112 0x05E0 נ (HEBREW LETTER NUN) +113 0x05E1 ס (HEBREW LETTER SAMEKH) +114 0x05E2 ע (HEBREW LETTER AYIN) +115 0x05E3 ף (HEBREW LETTER FINAL PE) +116 0x05E4 פ (HEBREW LETTER PE) +117 0x05E5 ץ 
(HEBREW LETTER FINAL TSADI) +118 0x05E6 צ (HEBREW LETTER TSADI) +119 0x05E7 ק (HEBREW LETTER QOF) +120 0x05E8 ר (HEBREW LETTER RESH) +121 0x05E9 ש (HEBREW LETTER SHIN) +122 0x05EA ת (HEBREW LETTER TAV) +125 0x200E ‎ (LEFT-TO-RIGHT MARK) +126 0x200F ‏ (RIGHT-TO-LEFT MARK) diff --git a/test/fixtures/encoding/single-byte/index-windows-1256.txt b/test/fixtures/encoding/single-byte/index-windows-1256.txt new file mode 100644 index 00000000000000..8baa3fa8f5a318 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1256.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1256.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 161bdb381f16408e8bebcc8f5310c4190af0e359de8d9bbaa3628ce2f0875509 +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x067E پ (ARABIC LETTER PEH) + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x0679 ٹ (ARABIC LETTER TTEH) + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 13 0x0686 چ (ARABIC LETTER TCHEH) + 14 0x0698 ژ (ARABIC LETTER JEH) + 15 0x0688 ڈ (ARABIC LETTER DDAL) + 16 0x06AF گ (ARABIC LETTER GAF) + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x06A9 ک (ARABIC LETTER KEHEH) + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x0691 ڑ (ARABIC LETTER RREH) + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x0153 œ (LATIN SMALL LIGATURE OE) + 29 0x200C ‌ (ZERO WIDTH NON-JOINER) + 30 0x200D ‍ (ZERO WIDTH JOINER) + 31 0x06BA ں (ARABIC LETTER NOON GHUNNA) + 32 0x00A0   (NO-BREAK SPACE) + 33 
0x060C ، (ARABIC COMMA) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x06BE ھ (ARABIC LETTER HEH DOACHASHMEE) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x061B ؛ (ARABIC SEMICOLON) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x061F ؟ (ARABIC QUESTION MARK) + 64 0x06C1 ہ (ARABIC LETTER HEH GOAL) + 65 0x0621 ء (ARABIC LETTER HAMZA) + 66 0x0622 آ (ARABIC LETTER ALEF WITH MADDA ABOVE) + 67 0x0623 أ (ARABIC LETTER ALEF WITH HAMZA ABOVE) + 68 0x0624 ؤ (ARABIC LETTER WAW WITH HAMZA ABOVE) + 69 0x0625 إ (ARABIC LETTER ALEF WITH HAMZA BELOW) + 70 0x0626 ئ (ARABIC LETTER YEH WITH HAMZA ABOVE) + 71 0x0627 ا (ARABIC LETTER ALEF) + 72 0x0628 ب (ARABIC LETTER BEH) + 73 0x0629 ة (ARABIC LETTER TEH MARBUTA) + 74 0x062A ت (ARABIC LETTER TEH) + 75 0x062B ث (ARABIC LETTER THEH) + 76 0x062C ج (ARABIC LETTER JEEM) + 77 0x062D ح (ARABIC LETTER HAH) + 78 0x062E خ (ARABIC LETTER KHAH) + 79 0x062F د (ARABIC LETTER DAL) + 80 0x0630 ذ (ARABIC LETTER THAL) + 81 0x0631 ر (ARABIC LETTER REH) + 82 0x0632 ز (ARABIC LETTER ZAIN) + 83 0x0633 س (ARABIC LETTER SEEN) + 84 0x0634 ش (ARABIC LETTER SHEEN) + 85 0x0635 ص (ARABIC LETTER SAD) + 86 0x0636 ض (ARABIC LETTER DAD) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x0637 ط (ARABIC LETTER TAH) + 89 0x0638 ظ (ARABIC 
LETTER ZAH) + 90 0x0639 ع (ARABIC LETTER AIN) + 91 0x063A غ (ARABIC LETTER GHAIN) + 92 0x0640 ـ (ARABIC TATWEEL) + 93 0x0641 ف (ARABIC LETTER FEH) + 94 0x0642 ق (ARABIC LETTER QAF) + 95 0x0643 ك (ARABIC LETTER KAF) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x0644 ل (ARABIC LETTER LAM) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x0645 م (ARABIC LETTER MEEM) +100 0x0646 ن (ARABIC LETTER NOON) +101 0x0647 ه (ARABIC LETTER HEH) +102 0x0648 و (ARABIC LETTER WAW) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x0649 ى (ARABIC LETTER ALEF MAKSURA) +109 0x064A ي (ARABIC LETTER YEH) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x064B ً (ARABIC FATHATAN) +113 0x064C ٌ (ARABIC DAMMATAN) +114 0x064D ٍ (ARABIC KASRATAN) +115 0x064E َ (ARABIC FATHA) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x064F ُ (ARABIC DAMMA) +118 0x0650 ِ (ARABIC KASRA) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x0651 ّ (ARABIC SHADDA) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x0652 ْ (ARABIC SUKUN) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x200E ‎ (LEFT-TO-RIGHT MARK) +126 0x200F ‏ (RIGHT-TO-LEFT MARK) +127 0x06D2 ے (ARABIC LETTER YEH BARREE) diff --git a/test/fixtures/encoding/single-byte/index-windows-1257.txt b/test/fixtures/encoding/single-byte/index-windows-1257.txt new file mode 100644 index 00000000000000..ffbf66120894cb --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1257.txt @@ -0,0 +1,132 @@ +# For details on index index-windows-1257.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: cc7256bdd10a5b8dc7fb6f994659f307dfcae60def9aa6c29d811f85e2842c47 +# Date: 
2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0083 ƒ () + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x0088 ˆ () + 9 0x2030 ‰ (PER MILLE SIGN) + 10 0x008A Š () + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x008C Œ () + 13 0x00A8 ¨ (DIAERESIS) + 14 0x02C7 ˇ (CARON) + 15 0x00B8 ¸ (CEDILLA) + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x0098 ˜ () + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x009A š () + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x009C œ () + 29 0x00AF ¯ (MACRON) + 30 0x02DB ˛ (OGONEK) + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x0156 Ŗ (LATIN CAPITAL LETTER R WITH CEDILLA) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x0157 ŗ (LATIN SMALL LETTER R WITH CEDILLA) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00E6 æ (LATIN SMALL LETTER 
AE) + 64 0x0104 Ą (LATIN CAPITAL LETTER A WITH OGONEK) + 65 0x012E Į (LATIN CAPITAL LETTER I WITH OGONEK) + 66 0x0100 Ā (LATIN CAPITAL LETTER A WITH MACRON) + 67 0x0106 Ć (LATIN CAPITAL LETTER C WITH ACUTE) + 68 0x00C4 Ä (LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x0118 Ę (LATIN CAPITAL LETTER E WITH OGONEK) + 71 0x0112 Ē (LATIN CAPITAL LETTER E WITH MACRON) + 72 0x010C Č (LATIN CAPITAL LETTER C WITH CARON) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x0179 Ź (LATIN CAPITAL LETTER Z WITH ACUTE) + 75 0x0116 Ė (LATIN CAPITAL LETTER E WITH DOT ABOVE) + 76 0x0122 Ģ (LATIN CAPITAL LETTER G WITH CEDILLA) + 77 0x0136 Ķ (LATIN CAPITAL LETTER K WITH CEDILLA) + 78 0x012A Ī (LATIN CAPITAL LETTER I WITH MACRON) + 79 0x013B Ļ (LATIN CAPITAL LETTER L WITH CEDILLA) + 80 0x0160 Š (LATIN CAPITAL LETTER S WITH CARON) + 81 0x0143 Ń (LATIN CAPITAL LETTER N WITH ACUTE) + 82 0x0145 Ņ (LATIN CAPITAL LETTER N WITH CEDILLA) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x014C Ō (LATIN CAPITAL LETTER O WITH MACRON) + 85 0x00D5 Õ (LATIN CAPITAL LETTER O WITH TILDE) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x0172 Ų (LATIN CAPITAL LETTER U WITH OGONEK) + 89 0x0141 Ł (LATIN CAPITAL LETTER L WITH STROKE) + 90 0x015A Ś (LATIN CAPITAL LETTER S WITH ACUTE) + 91 0x016A Ū (LATIN CAPITAL LETTER U WITH MACRON) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x017B Ż (LATIN CAPITAL LETTER Z WITH DOT ABOVE) + 94 0x017D Ž (LATIN CAPITAL LETTER Z WITH CARON) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x0105 ą (LATIN SMALL LETTER A WITH OGONEK) + 97 0x012F į (LATIN SMALL LETTER I WITH OGONEK) + 98 0x0101 ā (LATIN SMALL LETTER A WITH MACRON) + 99 0x0107 ć (LATIN SMALL LETTER C WITH ACUTE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x0119 ę (LATIN SMALL LETTER E WITH OGONEK) +103 0x0113 ē (LATIN 
SMALL LETTER E WITH MACRON) +104 0x010D č (LATIN SMALL LETTER C WITH CARON) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x017A ź (LATIN SMALL LETTER Z WITH ACUTE) +107 0x0117 ė (LATIN SMALL LETTER E WITH DOT ABOVE) +108 0x0123 ģ (LATIN SMALL LETTER G WITH CEDILLA) +109 0x0137 ķ (LATIN SMALL LETTER K WITH CEDILLA) +110 0x012B ī (LATIN SMALL LETTER I WITH MACRON) +111 0x013C ļ (LATIN SMALL LETTER L WITH CEDILLA) +112 0x0161 š (LATIN SMALL LETTER S WITH CARON) +113 0x0144 ń (LATIN SMALL LETTER N WITH ACUTE) +114 0x0146 ņ (LATIN SMALL LETTER N WITH CEDILLA) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x014D ō (LATIN SMALL LETTER O WITH MACRON) +117 0x00F5 õ (LATIN SMALL LETTER O WITH TILDE) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x0173 ų (LATIN SMALL LETTER U WITH OGONEK) +121 0x0142 ł (LATIN SMALL LETTER L WITH STROKE) +122 0x015B ś (LATIN SMALL LETTER S WITH ACUTE) +123 0x016B ū (LATIN SMALL LETTER U WITH MACRON) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x017C ż (LATIN SMALL LETTER Z WITH DOT ABOVE) +126 0x017E ž (LATIN SMALL LETTER Z WITH CARON) +127 0x02D9 ˙ (DOT ABOVE) diff --git a/test/fixtures/encoding/single-byte/index-windows-1258.txt b/test/fixtures/encoding/single-byte/index-windows-1258.txt new file mode 100644 index 00000000000000..568141350ab0fb --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-1258.txt @@ -0,0 +1,134 @@ +# For details on index index-windows-1258.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 198bacedfcf24390e219240a7b776b6cec34cff070330b08a601a69c67f7eb24 +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x201A ‚ (SINGLE LOW-9 QUOTATION MARK) + 3 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 4 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x2020 † (DAGGER) + 7 0x2021 ‡ (DOUBLE DAGGER) + 8 0x02C6 ˆ (MODIFIER LETTER CIRCUMFLEX ACCENT) + 9 0x2030 ‰ (PER 
MILLE SIGN) + 10 0x008A Š () + 11 0x2039 ‹ (SINGLE LEFT-POINTING ANGLE QUOTATION MARK) + 12 0x0152 Œ (LATIN CAPITAL LIGATURE OE) + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x02DC ˜ (SMALL TILDE) + 25 0x2122 ™ (TRADE MARK SIGN) + 26 0x009A š () + 27 0x203A › (SINGLE RIGHT-POINTING ANGLE QUOTATION MARK) + 28 0x0153 œ (LATIN SMALL LIGATURE OE) + 29 0x009D  () + 30 0x009E ž () + 31 0x0178 Ÿ (LATIN CAPITAL LETTER Y WITH DIAERESIS) + 32 0x00A0   (NO-BREAK SPACE) + 33 0x00A1 ¡ (INVERTED EXCLAMATION MARK) + 34 0x00A2 ¢ (CENT SIGN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A4 ¤ (CURRENCY SIGN) + 37 0x00A5 ¥ (YEN SIGN) + 38 0x00A6 ¦ (BROKEN BAR) + 39 0x00A7 § (SECTION SIGN) + 40 0x00A8 ¨ (DIAERESIS) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x00AA ª (FEMININE ORDINAL INDICATOR) + 43 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 44 0x00AC ¬ (NOT SIGN) + 45 0x00AD ­ (SOFT HYPHEN) + 46 0x00AE ® (REGISTERED SIGN) + 47 0x00AF ¯ (MACRON) + 48 0x00B0 ° (DEGREE SIGN) + 49 0x00B1 ± (PLUS-MINUS SIGN) + 50 0x00B2 ² (SUPERSCRIPT TWO) + 51 0x00B3 ³ (SUPERSCRIPT THREE) + 52 0x00B4 ´ (ACUTE ACCENT) + 53 0x00B5 µ (MICRO SIGN) + 54 0x00B6 ¶ (PILCROW SIGN) + 55 0x00B7 · (MIDDLE DOT) + 56 0x00B8 ¸ (CEDILLA) + 57 0x00B9 ¹ (SUPERSCRIPT ONE) + 58 0x00BA º (MASCULINE ORDINAL INDICATOR) + 59 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 60 0x00BC ¼ (VULGAR FRACTION ONE QUARTER) + 61 0x00BD ½ (VULGAR FRACTION ONE HALF) + 62 0x00BE ¾ (VULGAR FRACTION THREE QUARTERS) + 63 0x00BF ¿ (INVERTED QUESTION MARK) + 64 0x00C0 À (LATIN CAPITAL LETTER A WITH GRAVE) + 65 0x00C1 Á (LATIN CAPITAL LETTER A WITH ACUTE) + 66 0x00C2  (LATIN CAPITAL LETTER A WITH CIRCUMFLEX) + 67 0x0102 Ă (LATIN CAPITAL LETTER A WITH BREVE) + 68 0x00C4 Ä 
(LATIN CAPITAL LETTER A WITH DIAERESIS) + 69 0x00C5 Å (LATIN CAPITAL LETTER A WITH RING ABOVE) + 70 0x00C6 Æ (LATIN CAPITAL LETTER AE) + 71 0x00C7 Ç (LATIN CAPITAL LETTER C WITH CEDILLA) + 72 0x00C8 È (LATIN CAPITAL LETTER E WITH GRAVE) + 73 0x00C9 É (LATIN CAPITAL LETTER E WITH ACUTE) + 74 0x00CA Ê (LATIN CAPITAL LETTER E WITH CIRCUMFLEX) + 75 0x00CB Ë (LATIN CAPITAL LETTER E WITH DIAERESIS) + 76 0x0300 ̀ (COMBINING GRAVE ACCENT) + 77 0x00CD Í (LATIN CAPITAL LETTER I WITH ACUTE) + 78 0x00CE Î (LATIN CAPITAL LETTER I WITH CIRCUMFLEX) + 79 0x00CF Ï (LATIN CAPITAL LETTER I WITH DIAERESIS) + 80 0x0110 Đ (LATIN CAPITAL LETTER D WITH STROKE) + 81 0x00D1 Ñ (LATIN CAPITAL LETTER N WITH TILDE) + 82 0x0309 ̉ (COMBINING HOOK ABOVE) + 83 0x00D3 Ó (LATIN CAPITAL LETTER O WITH ACUTE) + 84 0x00D4 Ô (LATIN CAPITAL LETTER O WITH CIRCUMFLEX) + 85 0x01A0 Ơ (LATIN CAPITAL LETTER O WITH HORN) + 86 0x00D6 Ö (LATIN CAPITAL LETTER O WITH DIAERESIS) + 87 0x00D7 × (MULTIPLICATION SIGN) + 88 0x00D8 Ø (LATIN CAPITAL LETTER O WITH STROKE) + 89 0x00D9 Ù (LATIN CAPITAL LETTER U WITH GRAVE) + 90 0x00DA Ú (LATIN CAPITAL LETTER U WITH ACUTE) + 91 0x00DB Û (LATIN CAPITAL LETTER U WITH CIRCUMFLEX) + 92 0x00DC Ü (LATIN CAPITAL LETTER U WITH DIAERESIS) + 93 0x01AF Ư (LATIN CAPITAL LETTER U WITH HORN) + 94 0x0303 ̃ (COMBINING TILDE) + 95 0x00DF ß (LATIN SMALL LETTER SHARP S) + 96 0x00E0 à (LATIN SMALL LETTER A WITH GRAVE) + 97 0x00E1 á (LATIN SMALL LETTER A WITH ACUTE) + 98 0x00E2 â (LATIN SMALL LETTER A WITH CIRCUMFLEX) + 99 0x0103 ă (LATIN SMALL LETTER A WITH BREVE) +100 0x00E4 ä (LATIN SMALL LETTER A WITH DIAERESIS) +101 0x00E5 å (LATIN SMALL LETTER A WITH RING ABOVE) +102 0x00E6 æ (LATIN SMALL LETTER AE) +103 0x00E7 ç (LATIN SMALL LETTER C WITH CEDILLA) +104 0x00E8 è (LATIN SMALL LETTER E WITH GRAVE) +105 0x00E9 é (LATIN SMALL LETTER E WITH ACUTE) +106 0x00EA ê (LATIN SMALL LETTER E WITH CIRCUMFLEX) +107 0x00EB ë (LATIN SMALL LETTER E WITH DIAERESIS) +108 0x0301 ́ (COMBINING ACUTE ACCENT) +109 
0x00ED í (LATIN SMALL LETTER I WITH ACUTE) +110 0x00EE î (LATIN SMALL LETTER I WITH CIRCUMFLEX) +111 0x00EF ï (LATIN SMALL LETTER I WITH DIAERESIS) +112 0x0111 đ (LATIN SMALL LETTER D WITH STROKE) +113 0x00F1 ñ (LATIN SMALL LETTER N WITH TILDE) +114 0x0323 ̣ (COMBINING DOT BELOW) +115 0x00F3 ó (LATIN SMALL LETTER O WITH ACUTE) +116 0x00F4 ô (LATIN SMALL LETTER O WITH CIRCUMFLEX) +117 0x01A1 ơ (LATIN SMALL LETTER O WITH HORN) +118 0x00F6 ö (LATIN SMALL LETTER O WITH DIAERESIS) +119 0x00F7 ÷ (DIVISION SIGN) +120 0x00F8 ø (LATIN SMALL LETTER O WITH STROKE) +121 0x00F9 ù (LATIN SMALL LETTER U WITH GRAVE) +122 0x00FA ú (LATIN SMALL LETTER U WITH ACUTE) +123 0x00FB û (LATIN SMALL LETTER U WITH CIRCUMFLEX) +124 0x00FC ü (LATIN SMALL LETTER U WITH DIAERESIS) +125 0x01B0 ư (LATIN SMALL LETTER U WITH HORN) +126 0x20AB ₫ (DONG SIGN) +127 0x00FF ÿ (LATIN SMALL LETTER Y WITH DIAERESIS) diff --git a/test/fixtures/encoding/single-byte/index-windows-874.txt b/test/fixtures/encoding/single-byte/index-windows-874.txt new file mode 100644 index 00000000000000..57315c20ff16c9 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-windows-874.txt @@ -0,0 +1,126 @@ +# For details on index index-windows-874.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: b416583ce125e38474381b31b401a98b19ecf2e57e0998e78a1e18b14894905d +# Date: 2024-09-18 + + 0 0x20AC € (EURO SIGN) + 1 0x0081  () + 2 0x0082 ‚ () + 3 0x0083 ƒ () + 4 0x0084 „ () + 5 0x2026 … (HORIZONTAL ELLIPSIS) + 6 0x0086 † () + 7 0x0087 ‡ () + 8 0x0088 ˆ () + 9 0x0089 ‰ () + 10 0x008A Š () + 11 0x008B ‹ () + 12 0x008C Œ () + 13 0x008D  () + 14 0x008E Ž () + 15 0x008F  () + 16 0x0090  () + 17 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 18 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 19 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 20 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 21 0x2022 • (BULLET) + 22 0x2013 – (EN DASH) + 23 0x2014 — (EM DASH) + 24 0x0098 ˜ () + 25 0x0099 ™ () + 26 0x009A š () + 27 0x009B › 
() + 28 0x009C œ () + 29 0x009D  () + 30 0x009E ž () + 31 0x009F Ÿ () + 32 0x00A0   (NO-BREAK SPACE) + 33 0x0E01 ก (THAI CHARACTER KO KAI) + 34 0x0E02 ข (THAI CHARACTER KHO KHAI) + 35 0x0E03 ฃ (THAI CHARACTER KHO KHUAT) + 36 0x0E04 ค (THAI CHARACTER KHO KHWAI) + 37 0x0E05 ฅ (THAI CHARACTER KHO KHON) + 38 0x0E06 ฆ (THAI CHARACTER KHO RAKHANG) + 39 0x0E07 ง (THAI CHARACTER NGO NGU) + 40 0x0E08 จ (THAI CHARACTER CHO CHAN) + 41 0x0E09 ฉ (THAI CHARACTER CHO CHING) + 42 0x0E0A ช (THAI CHARACTER CHO CHANG) + 43 0x0E0B ซ (THAI CHARACTER SO SO) + 44 0x0E0C ฌ (THAI CHARACTER CHO CHOE) + 45 0x0E0D ญ (THAI CHARACTER YO YING) + 46 0x0E0E ฎ (THAI CHARACTER DO CHADA) + 47 0x0E0F ฏ (THAI CHARACTER TO PATAK) + 48 0x0E10 ฐ (THAI CHARACTER THO THAN) + 49 0x0E11 ฑ (THAI CHARACTER THO NANGMONTHO) + 50 0x0E12 ฒ (THAI CHARACTER THO PHUTHAO) + 51 0x0E13 ณ (THAI CHARACTER NO NEN) + 52 0x0E14 ด (THAI CHARACTER DO DEK) + 53 0x0E15 ต (THAI CHARACTER TO TAO) + 54 0x0E16 ถ (THAI CHARACTER THO THUNG) + 55 0x0E17 ท (THAI CHARACTER THO THAHAN) + 56 0x0E18 ธ (THAI CHARACTER THO THONG) + 57 0x0E19 น (THAI CHARACTER NO NU) + 58 0x0E1A บ (THAI CHARACTER BO BAIMAI) + 59 0x0E1B ป (THAI CHARACTER PO PLA) + 60 0x0E1C ผ (THAI CHARACTER PHO PHUNG) + 61 0x0E1D ฝ (THAI CHARACTER FO FA) + 62 0x0E1E พ (THAI CHARACTER PHO PHAN) + 63 0x0E1F ฟ (THAI CHARACTER FO FAN) + 64 0x0E20 ภ (THAI CHARACTER PHO SAMPHAO) + 65 0x0E21 ม (THAI CHARACTER MO MA) + 66 0x0E22 ย (THAI CHARACTER YO YAK) + 67 0x0E23 ร (THAI CHARACTER RO RUA) + 68 0x0E24 ฤ (THAI CHARACTER RU) + 69 0x0E25 ล (THAI CHARACTER LO LING) + 70 0x0E26 ฦ (THAI CHARACTER LU) + 71 0x0E27 ว (THAI CHARACTER WO WAEN) + 72 0x0E28 ศ (THAI CHARACTER SO SALA) + 73 0x0E29 ษ (THAI CHARACTER SO RUSI) + 74 0x0E2A ส (THAI CHARACTER SO SUA) + 75 0x0E2B ห (THAI CHARACTER HO HIP) + 76 0x0E2C ฬ (THAI CHARACTER LO CHULA) + 77 0x0E2D อ (THAI CHARACTER O ANG) + 78 0x0E2E ฮ (THAI CHARACTER HO NOKHUK) + 79 0x0E2F ฯ (THAI CHARACTER PAIYANNOI) + 80 0x0E30 ะ (THAI CHARACTER SARA A) + 81 
0x0E31 ั (THAI CHARACTER MAI HAN-AKAT) + 82 0x0E32 า (THAI CHARACTER SARA AA) + 83 0x0E33 ำ (THAI CHARACTER SARA AM) + 84 0x0E34 ิ (THAI CHARACTER SARA I) + 85 0x0E35 ี (THAI CHARACTER SARA II) + 86 0x0E36 ึ (THAI CHARACTER SARA UE) + 87 0x0E37 ื (THAI CHARACTER SARA UEE) + 88 0x0E38 ุ (THAI CHARACTER SARA U) + 89 0x0E39 ู (THAI CHARACTER SARA UU) + 90 0x0E3A ฺ (THAI CHARACTER PHINTHU) + 95 0x0E3F ฿ (THAI CURRENCY SYMBOL BAHT) + 96 0x0E40 เ (THAI CHARACTER SARA E) + 97 0x0E41 แ (THAI CHARACTER SARA AE) + 98 0x0E42 โ (THAI CHARACTER SARA O) + 99 0x0E43 ใ (THAI CHARACTER SARA AI MAIMUAN) +100 0x0E44 ไ (THAI CHARACTER SARA AI MAIMALAI) +101 0x0E45 ๅ (THAI CHARACTER LAKKHANGYAO) +102 0x0E46 ๆ (THAI CHARACTER MAIYAMOK) +103 0x0E47 ็ (THAI CHARACTER MAITAIKHU) +104 0x0E48 ่ (THAI CHARACTER MAI EK) +105 0x0E49 ้ (THAI CHARACTER MAI THO) +106 0x0E4A ๊ (THAI CHARACTER MAI TRI) +107 0x0E4B ๋ (THAI CHARACTER MAI CHATTAWA) +108 0x0E4C ์ (THAI CHARACTER THANTHAKHAT) +109 0x0E4D ํ (THAI CHARACTER NIKHAHIT) +110 0x0E4E ๎ (THAI CHARACTER YAMAKKAN) +111 0x0E4F ๏ (THAI CHARACTER FONGMAN) +112 0x0E50 ๐ (THAI DIGIT ZERO) +113 0x0E51 ๑ (THAI DIGIT ONE) +114 0x0E52 ๒ (THAI DIGIT TWO) +115 0x0E53 ๓ (THAI DIGIT THREE) +116 0x0E54 ๔ (THAI DIGIT FOUR) +117 0x0E55 ๕ (THAI DIGIT FIVE) +118 0x0E56 ๖ (THAI DIGIT SIX) +119 0x0E57 ๗ (THAI DIGIT SEVEN) +120 0x0E58 ๘ (THAI DIGIT EIGHT) +121 0x0E59 ๙ (THAI DIGIT NINE) +122 0x0E5A ๚ (THAI CHARACTER ANGKHANKHU) +123 0x0E5B ๛ (THAI CHARACTER KHOMUT) diff --git a/test/fixtures/encoding/single-byte/index-x-mac-cyrillic.txt b/test/fixtures/encoding/single-byte/index-x-mac-cyrillic.txt new file mode 100644 index 00000000000000..333b361dfce689 --- /dev/null +++ b/test/fixtures/encoding/single-byte/index-x-mac-cyrillic.txt @@ -0,0 +1,134 @@ +# For details on index index-x-mac-cyrillic.txt see the Encoding Standard +# https://encoding.spec.whatwg.org/ +# +# Identifier: 73e8e7642c6fa9de29d42819b47fba55b58666fb1e339faeb4a89a0bd7c24d43 +# Date: 2024-09-18 + + 0 
0x0410 А (CYRILLIC CAPITAL LETTER A) + 1 0x0411 Б (CYRILLIC CAPITAL LETTER BE) + 2 0x0412 В (CYRILLIC CAPITAL LETTER VE) + 3 0x0413 Г (CYRILLIC CAPITAL LETTER GHE) + 4 0x0414 Д (CYRILLIC CAPITAL LETTER DE) + 5 0x0415 Е (CYRILLIC CAPITAL LETTER IE) + 6 0x0416 Ж (CYRILLIC CAPITAL LETTER ZHE) + 7 0x0417 З (CYRILLIC CAPITAL LETTER ZE) + 8 0x0418 И (CYRILLIC CAPITAL LETTER I) + 9 0x0419 Й (CYRILLIC CAPITAL LETTER SHORT I) + 10 0x041A К (CYRILLIC CAPITAL LETTER KA) + 11 0x041B Л (CYRILLIC CAPITAL LETTER EL) + 12 0x041C М (CYRILLIC CAPITAL LETTER EM) + 13 0x041D Н (CYRILLIC CAPITAL LETTER EN) + 14 0x041E О (CYRILLIC CAPITAL LETTER O) + 15 0x041F П (CYRILLIC CAPITAL LETTER PE) + 16 0x0420 Р (CYRILLIC CAPITAL LETTER ER) + 17 0x0421 С (CYRILLIC CAPITAL LETTER ES) + 18 0x0422 Т (CYRILLIC CAPITAL LETTER TE) + 19 0x0423 У (CYRILLIC CAPITAL LETTER U) + 20 0x0424 Ф (CYRILLIC CAPITAL LETTER EF) + 21 0x0425 Х (CYRILLIC CAPITAL LETTER HA) + 22 0x0426 Ц (CYRILLIC CAPITAL LETTER TSE) + 23 0x0427 Ч (CYRILLIC CAPITAL LETTER CHE) + 24 0x0428 Ш (CYRILLIC CAPITAL LETTER SHA) + 25 0x0429 Щ (CYRILLIC CAPITAL LETTER SHCHA) + 26 0x042A Ъ (CYRILLIC CAPITAL LETTER HARD SIGN) + 27 0x042B Ы (CYRILLIC CAPITAL LETTER YERU) + 28 0x042C Ь (CYRILLIC CAPITAL LETTER SOFT SIGN) + 29 0x042D Э (CYRILLIC CAPITAL LETTER E) + 30 0x042E Ю (CYRILLIC CAPITAL LETTER YU) + 31 0x042F Я (CYRILLIC CAPITAL LETTER YA) + 32 0x2020 † (DAGGER) + 33 0x00B0 ° (DEGREE SIGN) + 34 0x0490 Ґ (CYRILLIC CAPITAL LETTER GHE WITH UPTURN) + 35 0x00A3 £ (POUND SIGN) + 36 0x00A7 § (SECTION SIGN) + 37 0x2022 • (BULLET) + 38 0x00B6 ¶ (PILCROW SIGN) + 39 0x0406 І (CYRILLIC CAPITAL LETTER BYELORUSSIAN-UKRAINIAN I) + 40 0x00AE ® (REGISTERED SIGN) + 41 0x00A9 © (COPYRIGHT SIGN) + 42 0x2122 ™ (TRADE MARK SIGN) + 43 0x0402 Ђ (CYRILLIC CAPITAL LETTER DJE) + 44 0x0452 ђ (CYRILLIC SMALL LETTER DJE) + 45 0x2260 ≠ (NOT EQUAL TO) + 46 0x0403 Ѓ (CYRILLIC CAPITAL LETTER GJE) + 47 0x0453 ѓ (CYRILLIC SMALL LETTER GJE) + 48 0x221E ∞ (INFINITY) + 49 0x00B1 
± (PLUS-MINUS SIGN) + 50 0x2264 ≤ (LESS-THAN OR EQUAL TO) + 51 0x2265 ≥ (GREATER-THAN OR EQUAL TO) + 52 0x0456 і (CYRILLIC SMALL LETTER BYELORUSSIAN-UKRAINIAN I) + 53 0x00B5 µ (MICRO SIGN) + 54 0x0491 ґ (CYRILLIC SMALL LETTER GHE WITH UPTURN) + 55 0x0408 Ј (CYRILLIC CAPITAL LETTER JE) + 56 0x0404 Є (CYRILLIC CAPITAL LETTER UKRAINIAN IE) + 57 0x0454 є (CYRILLIC SMALL LETTER UKRAINIAN IE) + 58 0x0407 Ї (CYRILLIC CAPITAL LETTER YI) + 59 0x0457 ї (CYRILLIC SMALL LETTER YI) + 60 0x0409 Љ (CYRILLIC CAPITAL LETTER LJE) + 61 0x0459 љ (CYRILLIC SMALL LETTER LJE) + 62 0x040A Њ (CYRILLIC CAPITAL LETTER NJE) + 63 0x045A њ (CYRILLIC SMALL LETTER NJE) + 64 0x0458 ј (CYRILLIC SMALL LETTER JE) + 65 0x0405 Ѕ (CYRILLIC CAPITAL LETTER DZE) + 66 0x00AC ¬ (NOT SIGN) + 67 0x221A √ (SQUARE ROOT) + 68 0x0192 ƒ (LATIN SMALL LETTER F WITH HOOK) + 69 0x2248 ≈ (ALMOST EQUAL TO) + 70 0x2206 ∆ (INCREMENT) + 71 0x00AB « (LEFT-POINTING DOUBLE ANGLE QUOTATION MARK) + 72 0x00BB » (RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK) + 73 0x2026 … (HORIZONTAL ELLIPSIS) + 74 0x00A0   (NO-BREAK SPACE) + 75 0x040B Ћ (CYRILLIC CAPITAL LETTER TSHE) + 76 0x045B ћ (CYRILLIC SMALL LETTER TSHE) + 77 0x040C Ќ (CYRILLIC CAPITAL LETTER KJE) + 78 0x045C ќ (CYRILLIC SMALL LETTER KJE) + 79 0x0455 ѕ (CYRILLIC SMALL LETTER DZE) + 80 0x2013 – (EN DASH) + 81 0x2014 — (EM DASH) + 82 0x201C “ (LEFT DOUBLE QUOTATION MARK) + 83 0x201D ” (RIGHT DOUBLE QUOTATION MARK) + 84 0x2018 ‘ (LEFT SINGLE QUOTATION MARK) + 85 0x2019 ’ (RIGHT SINGLE QUOTATION MARK) + 86 0x00F7 ÷ (DIVISION SIGN) + 87 0x201E „ (DOUBLE LOW-9 QUOTATION MARK) + 88 0x040E Ў (CYRILLIC CAPITAL LETTER SHORT U) + 89 0x045E ў (CYRILLIC SMALL LETTER SHORT U) + 90 0x040F Џ (CYRILLIC CAPITAL LETTER DZHE) + 91 0x045F џ (CYRILLIC SMALL LETTER DZHE) + 92 0x2116 № (NUMERO SIGN) + 93 0x0401 Ё (CYRILLIC CAPITAL LETTER IO) + 94 0x0451 ё (CYRILLIC SMALL LETTER IO) + 95 0x044F я (CYRILLIC SMALL LETTER YA) + 96 0x0430 а (CYRILLIC SMALL LETTER A) + 97 0x0431 б (CYRILLIC SMALL LETTER 
BE) + 98 0x0432 в (CYRILLIC SMALL LETTER VE) + 99 0x0433 г (CYRILLIC SMALL LETTER GHE) +100 0x0434 д (CYRILLIC SMALL LETTER DE) +101 0x0435 е (CYRILLIC SMALL LETTER IE) +102 0x0436 ж (CYRILLIC SMALL LETTER ZHE) +103 0x0437 з (CYRILLIC SMALL LETTER ZE) +104 0x0438 и (CYRILLIC SMALL LETTER I) +105 0x0439 й (CYRILLIC SMALL LETTER SHORT I) +106 0x043A к (CYRILLIC SMALL LETTER KA) +107 0x043B л (CYRILLIC SMALL LETTER EL) +108 0x043C м (CYRILLIC SMALL LETTER EM) +109 0x043D н (CYRILLIC SMALL LETTER EN) +110 0x043E о (CYRILLIC SMALL LETTER O) +111 0x043F п (CYRILLIC SMALL LETTER PE) +112 0x0440 р (CYRILLIC SMALL LETTER ER) +113 0x0441 с (CYRILLIC SMALL LETTER ES) +114 0x0442 т (CYRILLIC SMALL LETTER TE) +115 0x0443 у (CYRILLIC SMALL LETTER U) +116 0x0444 ф (CYRILLIC SMALL LETTER EF) +117 0x0445 х (CYRILLIC SMALL LETTER HA) +118 0x0446 ц (CYRILLIC SMALL LETTER TSE) +119 0x0447 ч (CYRILLIC SMALL LETTER CHE) +120 0x0448 ш (CYRILLIC SMALL LETTER SHA) +121 0x0449 щ (CYRILLIC SMALL LETTER SHCHA) +122 0x044A ъ (CYRILLIC SMALL LETTER HARD SIGN) +123 0x044B ы (CYRILLIC SMALL LETTER YERU) +124 0x044C ь (CYRILLIC SMALL LETTER SOFT SIGN) +125 0x044D э (CYRILLIC SMALL LETTER E) +126 0x044E ю (CYRILLIC SMALL LETTER YU) +127 0x20AC € (EURO SIGN) diff --git a/test/parallel/test-whatwg-encoding-singlebyte.mjs b/test/parallel/test-whatwg-encoding-singlebyte.mjs new file mode 100644 index 00000000000000..89aa7c62964c54 --- /dev/null +++ b/test/parallel/test-whatwg-encoding-singlebyte.mjs @@ -0,0 +1,84 @@ +// Test that single-byte encodings mappings are correct and match spec +// Works without Intl + +// From: https://github.com/ExodusOSS/bytes/blob/master/tests/single-byte.test.js +// Copyright Exodus Movement. Licensed under MIT License. 
+ +import '../common/index.mjs'; +import { readFileSync } from 'node:fs'; +import { join } from 'node:path'; +import { test, describe } from 'node:test'; +import raw from '../fixtures/encoding/encodings.json' with { type: 'json' }; + +const groups = new Map(raw.map((x) => [x.heading, x.encodings.map((x) => x.name.toLowerCase())])); +const encodings = groups.get('Legacy single-byte encodings'); + +describe('single-byte encodings are supersets of ascii', () => { + for (const encoding of encodings) { + test(encoding, (t) => { + const loose = new TextDecoder(encoding); + const fatal = new TextDecoder(encoding, { fatal: true }); + for (let i = 0; i < 128; i++) { + const str = String.fromCodePoint(i); + t.assert.strictEqual(loose.decode(Uint8Array.of(i)), str, i); + t.assert.strictEqual(fatal.decode(Uint8Array.of(i)), str, i); + } + }); + } +}); + +describe('single-byte encodings index', () => { + for (const encoding of encodings) { + test(encoding, (t) => { + const loose = new TextDecoder(encoding); + const fatal = new TextDecoder(encoding, { fatal: true }); + const file = encoding === 'iso-8859-8-i' ? 
`index-iso-8859-8.txt` : `index-${encoding}.txt`; + const text = readFileSync(join(import.meta.dirname, '../fixtures/encoding/single-byte', file), 'utf8'); + const rows = text + .split('\n') + .map((x) => x.trim()) + .filter((x) => x && x[0] !== '#') + .map((x) => x.split('\t')) + .map(([istr, codeHex, description]) => { + const i = Number(istr); + t.assert.ok(i < 128); + const code = parseInt(codeHex.slice(2), 16); + t.assert.strictEqual(`${i}`, istr); + t.assert.strictEqual('0x' + code.toString(16).padStart(4, '0').toUpperCase(), codeHex); + t.assert.ok(code && code !== 0xff_fd && code <= 0xff_ff); // Can't be a replacement char, has to be <= 16-bit + t.assert.ok(code < 0xd8_00 || code >= 0xe0_00); // not a surrogate + return [i, { i, code, description }]; + }); + + t.assert.ok(rows.length <= 128); + const known = new Map(rows); + t.assert.strictEqual(rows.length, known.size); // all unique + + for (let i = 0; i < 128; i++) { + const row = known.get(i); + const byte = i + 128; + if (row) { + t.assert.strictEqual(i, row.i); + const str = String.fromCodePoint(row.code); + t.assert.strictEqual(fatal.decode(Uint8Array.of(byte)), str, row.description); + t.assert.strictEqual(loose.decode(Uint8Array.of(byte)), str, row.description); + } else { + t.assert.throws(() => fatal.decode(Uint8Array.of(byte)), TypeError); + t.assert.strictEqual(loose.decode(Uint8Array.of(byte)), '\uFFFD'); + } + } + }); + } +}); + +// https://encoding.spec.whatwg.org/#x-user-defined-decoder +test('x-user-defined', (t) => { + const encoding = 'x-user-defined'; + const loose = new TextDecoder(encoding); + const fatal = new TextDecoder(encoding, { fatal: true }); + for (let byte = 0; byte < 256; byte++) { + const str = String.fromCodePoint(byte >= 0x80 ? 
0xF780 + byte - 0x80 : byte); + t.assert.strictEqual(fatal.decode(Uint8Array.of(byte)), str, byte); + t.assert.strictEqual(loose.decode(Uint8Array.of(byte)), str, byte); + } +}); From d7f8c9630911948b8f0e70166d35fff2eb8fc777 Mon Sep 17 00:00:00 2001 From: Nikita Skovoroda Date: Mon, 22 Dec 2025 13:25:01 +0400 Subject: [PATCH 3/3] unify non-icu TextDecoder wih icu-based a bit --- lib/internal/encoding.js | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/internal/encoding.js b/lib/internal/encoding.js index d30e68df4304a2..27787067470aaf 100644 --- a/lib/internal/encoding.js +++ b/lib/internal/encoding.js @@ -492,7 +492,7 @@ function makeTextDecoderJS() { const kBOMSeen = Symbol('BOM seen'); function hasConverter(encoding) { - return encoding === 'utf-8' || encoding === 'utf-16le' || isSinglebyteEncoding(encoding); + return encoding === 'utf-8' || encoding === 'utf-16le'; } class TextDecoder { @@ -501,27 +501,30 @@ function makeTextDecoderJS() { validateObject(options, 'options', kValidateObjectAllowObjectsAndNull); const enc = getEncodingFromLabel(encoding); - if (enc === undefined || !hasConverter(enc)) + if (enc === undefined) throw new ERR_ENCODING_NOT_SUPPORTED(encoding); let flags = 0; if (options !== null) { - if (options.fatal) { - throw new ERR_NO_ICU('"fatal" option'); - } + flags |= options.fatal ? CONVERTER_FLAGS_FATAL : 0; flags |= options.ignoreBOM ? CONVERTER_FLAGS_IGNORE_BOM : 0; } this[kDecoder] = true; - // StringDecoder will normalize WHATWG encoding to Node.js encoding. 
- this[kHandle] = new (lazyStringDecoder())(enc); this[kFlags] = flags; this[kEncoding] = enc; + this[kIgnoreBOM] = Boolean(options?.ignoreBOM); + this[kFatal] = Boolean(options?.fatal); this[kBOMSeen] = false; this[kMethod] = undefined; - if (isSinglebyteEncoding(this.encoding)) { - this[kMethod] = createSinglebyteDecoder(this.encoding, this[kFatal]); + if (isSinglebyteEncoding(enc)) { + this[kMethod] = createSinglebyteDecoder(enc, this[kFatal]); + } else { + if (!hasConverter(enc)) throw new ERR_ENCODING_NOT_SUPPORTED(encoding); + if (this[kFatal]) throw new ERR_NO_ICU('"fatal" option'); + // StringDecoder will normalize WHATWG encoding to Node.js encoding. + this[kHandle] = new (lazyStringDecoder())(enc); } } @@ -546,9 +549,7 @@ function makeTextDecoderJS() { this[kHandle].end(input) : this[kHandle].write(input); - if (result.length > 0 && - !this[kBOMSeen] && - !(this[kFlags] & CONVERTER_FLAGS_IGNORE_BOM)) { + if (result.length > 0 && !this[kBOMSeen] && !this[kIgnoreBOM]) { // If the very first result in the stream is a BOM, and we are not // explicitly told to ignore it, then we discard it. if (result[0] === '\ufeff') {