Skip to content

Commit ba71243

Browse files
lib: correct windows-1252 decoding in TextDecoder
The TextDecoder was incorrectly using the Latin-1 fast path for windows-1252 encoding, which caused incorrect decoding of bytes in the 0x80-0x9F range. The issue occurs because windows-1252 differs from ISO-8859-1 (Latin-1) in this byte range. The simdutf library's convert_latin1_to_utf8 function directly maps bytes to Unicode codepoints (e.g., 0x92 → U+0092), which is correct for ISO-8859-1 but incorrect for windows-1252, where 0x92 should map to U+2019 (RIGHT SINGLE QUOTATION MARK ’). This fix disables the Latin-1 fast path for windows-1252, forcing the decoder to use the ICU converter which correctly handles the windows-1252 specific character mappings according to the WHATWG Encoding Standard. The fix includes comprehensive tests for all 32 affected characters (bytes 0x80-0x9F) to prevent regression. Fixes: #56542 Refs: https://encoding.spec.whatwg.org/#windows-1252
1 parent 478a5e6 commit ba71243

File tree

2 files changed

+138
-1
lines changed

2 files changed

+138
-1
lines changed

lib/internal/encoding.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -420,7 +420,12 @@ function makeTextDecoderICU() {
420420
this[kFatal] = Boolean(options?.fatal);
421421
// Only support fast path for UTF-8.
422422
this[kUTF8FastPath] = enc === 'utf-8';
423-
this[kLatin1FastPath] = enc === 'windows-1252';
423+
// Disable Latin-1 fast path for windows-1252 as it differs from ISO-8859-1
424+
// in the 0x80-0x9F range. The fast path uses simdutf which directly maps
425+
// bytes to Unicode codepoints (e.g., 0x92 → U+0092), but windows-1252
426+
// requires different mappings (e.g., 0x92 → U+2019 ’). The ICU decoder
427+
// handles these mappings correctly.
428+
this[kLatin1FastPath] = false;
424429
this[kHandle] = undefined;
425430

426431
if (!this[kUTF8FastPath] && !this[kLatin1FastPath]) {
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
'use strict';
2+
3+
// Tests for Windows-1252 encoding, specifically the 0x80-0x9F range
4+
// where it differs from ISO-8859-1 (Latin-1).
5+
// Refs: https://github.com/nodejs/node/issues/56542
6+
// Refs: https://encoding.spec.whatwg.org/#windows-1252
7+
8+
require('../common');
9+
10+
const assert = require('assert');
11+
12+
// Test specific case from issue #56542
13+
{
14+
const decoder = new TextDecoder('windows-1252');
15+
const decoded = decoder.decode(new Uint8Array([0x92]));
16+
assert.strictEqual(
17+
decoded.charCodeAt(0),
18+
0x2019,
19+
'Byte 0x92 should decode to U+2019 (’) not U+0092'
20+
);
21+
assert.strictEqual(decoded, '\u2019', 'Expected right single quotation mark');
22+
}
23+
24+
// Test all 32 bytes in the 0x80-0x9F range. Windows-1252 remaps 27 of them
25+
// relative to ISO-8859-1; the other 5 are undefined and decode to the C1
26+
// control code point of the same value, per the WHATWG Encoding Standard.
27+
// Source: https://encoding.spec.whatwg.org/#index-windows-1252
28+
{
29+
const testCases = [
30+
[0x80, 0x20AC, '€'], // EURO SIGN
31+
[0x81, 0x0081, '\u0081'], // Undefined (maps to itself)
32+
[0x82, 0x201A, '‚'], // SINGLE LOW-9 QUOTATION MARK
33+
[0x83, 0x0192, 'ƒ'], // LATIN SMALL LETTER F WITH HOOK
34+
[0x84, 0x201E, '„'], // DOUBLE LOW-9 QUOTATION MARK
35+
[0x85, 0x2026, '…'], // HORIZONTAL ELLIPSIS
36+
[0x86, 0x2020, '†'], // DAGGER
37+
[0x87, 0x2021, '‡'], // DOUBLE DAGGER
38+
[0x88, 0x02C6, 'ˆ'], // MODIFIER LETTER CIRCUMFLEX ACCENT
39+
[0x89, 0x2030, '‰'], // PER MILLE SIGN
40+
[0x8A, 0x0160, 'Š'], // LATIN CAPITAL LETTER S WITH CARON
41+
[0x8B, 0x2039, '‹'], // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
42+
[0x8C, 0x0152, 'Œ'], // LATIN CAPITAL LIGATURE OE
43+
[0x8D, 0x008D, '\u008D'], // Undefined (maps to itself)
44+
[0x8E, 0x017D, 'Ž'], // LATIN CAPITAL LETTER Z WITH CARON
45+
[0x8F, 0x008F, '\u008F'], // Undefined (maps to itself)
46+
[0x90, 0x0090, '\u0090'], // Undefined (maps to itself)
47+
[0x91, 0x2018, '‘'], // LEFT SINGLE QUOTATION MARK
48+
[0x92, 0x2019, '’'], // RIGHT SINGLE QUOTATION MARK
49+
[0x93, 0x201C, '“'], // LEFT DOUBLE QUOTATION MARK
50+
[0x94, 0x201D, '”'], // RIGHT DOUBLE QUOTATION MARK
51+
[0x95, 0x2022, '•'], // BULLET
52+
[0x96, 0x2013, '–'], // EN DASH
53+
[0x97, 0x2014, '—'], // EM DASH
54+
[0x98, 0x02DC, '˜'], // SMALL TILDE
55+
[0x99, 0x2122, '™'], // TRADE MARK SIGN
56+
[0x9A, 0x0161, 'š'], // LATIN SMALL LETTER S WITH CARON
57+
[0x9B, 0x203A, '›'], // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
58+
[0x9C, 0x0153, 'œ'], // LATIN SMALL LIGATURE OE
59+
[0x9D, 0x009D, '\u009D'], // Undefined (maps to itself)
60+
[0x9E, 0x017E, 'ž'], // LATIN SMALL LETTER Z WITH CARON
61+
[0x9F, 0x0178, 'Ÿ'], // LATIN CAPITAL LETTER Y WITH DIAERESIS
62+
];
63+
64+
const decoder = new TextDecoder('windows-1252');
65+
66+
for (const [byte, expectedCodePoint, expectedChar] of testCases) {
67+
const decoded = decoder.decode(new Uint8Array([byte]));
68+
const actualCodePoint = decoded.charCodeAt(0);
69+
70+
assert.strictEqual(
71+
actualCodePoint,
72+
expectedCodePoint,
73+
`Byte 0x${byte.toString(16).toUpperCase()} should decode to ` +
74+
`U+${expectedCodePoint.toString(16).toUpperCase().padStart(4, '0')} ` +
75+
`but got U+${actualCodePoint.toString(16).toUpperCase().padStart(4, '0')}`
76+
);
77+
78+
assert.strictEqual(
79+
decoded,
80+
expectedChar,
81+
`Byte 0x${byte.toString(16).toUpperCase()} should decode to ` +
82+
`${expectedChar} but got ${decoded}`
83+
);
84+
}
85+
}
86+
87+
// Test that common Windows-1252 encoding aliases work correctly
88+
// Per WHATWG Encoding Standard, many encodings map to windows-1252
89+
{
90+
const aliases = [
91+
'windows-1252',
92+
'cp1252',
93+
'x-cp1252',
94+
'iso-8859-1', // Per WHATWG spec, iso-8859-1 maps to windows-1252
95+
'latin1', // Per WHATWG spec, latin1 maps to windows-1252
96+
'ascii', // Per WHATWG spec, ascii maps to windows-1252
97+
];
98+
const testByte = 0x92; // Right single quotation mark
99+
const expected = '\u2019';
100+
101+
for (const alias of aliases) {
102+
const decoder = new TextDecoder(alias);
103+
const decoded = decoder.decode(new Uint8Array([testByte]));
104+
assert.strictEqual(
105+
decoded,
106+
expected,
107+
`Encoding alias '${alias}' should decode 0x92 to U+2019 (per WHATWG spec)`
108+
);
109+
}
110+
}
111+
112+
// Test a realistic Windows-1252 text sample
113+
{
114+
const decoder = new TextDecoder('windows-1252');
115+
116+
// "It’s a “quote” — with €100"
117+
const bytes = [
118+
0x49, 0x74, 0x92, 0x73, 0x20, 0x61, 0x20, // It’s a
119+
0x93, 0x71, 0x75, 0x6F, 0x74, 0x65, 0x94, 0x20, // “quote”
120+
0x97, 0x20, 0x77, 0x69, 0x74, 0x68, 0x20, // — with
121+
0x80, 0x31, 0x30, 0x30, // €100
122+
];
123+
124+
const expected = 'It\u2019s a \u201Cquote\u201D \u2014 with \u20AC100';
125+
const decoded = decoder.decode(new Uint8Array(bytes));
126+
127+
assert.strictEqual(
128+
decoded,
129+
expected,
130+
'Realistic Windows-1252 text should decode correctly'
131+
);
132+
}

0 commit comments

Comments
 (0)