From 591c43fb2937d73864aa5d75792fe2cc963e562d Mon Sep 17 00:00:00 2001
From: mayrang
Date: Fri, 17 Apr 2026 18:27:47 +0900
Subject: [PATCH 1/3] =?UTF-8?q?fix:=20classify=20Hangul=20Compatibility=20?=
 =?UTF-8?q?Jamo=20(U+3130=E2=80=93U+318F)=20as=20CJK?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

isCJKCodePoint() included Hangul syllables (U+AC00–U+D7AF) but not
Hangul Compatibility Jamo (U+3130–U+318F), the standalone consonants
and vowels (ㄱ ㄴ ㄷ ㅋ ㅠ ...) used constantly in Korean digital text.
Without CJK classification these characters were treated as atomic
segments the line-breaker could not split at grapheme boundaries,
producing line counts 1 too high for common expressions like ㅋㅋ, ㄹㅇ.
Verified against Chrome and Safari.
---
 src/analysis.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/analysis.ts b/src/analysis.ts
index 4abdac4..15c2ca8 100644
--- a/src/analysis.ts
+++ b/src/analysis.ts
@@ -121,6 +121,7 @@ function isCJKCodePoint(codePoint: number): boolean {
     (codePoint >= 0x3000 && codePoint <= 0x303F) ||
     (codePoint >= 0x3040 && codePoint <= 0x309F) ||
     (codePoint >= 0x30A0 && codePoint <= 0x30FF) ||
+    (codePoint >= 0x3130 && codePoint <= 0x318F) || // Hangul Compatibility Jamo (ㄱ-ㅣ)
     (codePoint >= 0xAC00 && codePoint <= 0xD7AF) ||
     (codePoint >= 0xFF00 && codePoint <= 0xFFEF)
   )

From 08b5ffa2fea12a5449510f4d1e3ff451e349f6fe Mon Sep 17 00:00:00 2001
From: mayrang
Date: Fri, 17 Apr 2026 18:27:47 +0900
Subject: [PATCH 2/3] fix: add korean-check oracle for Hangul edge cases and layout modes

---
 package.json            |   1 +
 scripts/korean-check.ts | 374 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 375 insertions(+)
 create mode 100644 scripts/korean-check.ts

diff --git a/package.json b/package.json
index 4e0c932..422bbc4 100644
--- a/package.json
+++ b/package.json
@@ -68,6 +68,7 @@
     "corpus-taxonomy": "bun run scripts/corpus-taxonomy.ts",
     "generate:bidi-data": "bun run scripts/generate-bidi-data.ts",
     "keep-all-check": "bun run scripts/keep-all-check.ts",
+    "korean-check": "bun run scripts/korean-check.ts",
     "package-smoke-test": "bun run scripts/package-smoke-test.ts",
     "prepack": "rm -rf dist && tsc -p tsconfig.build.json",
     "pre-wrap-check": "bun run scripts/pre-wrap-check.ts",
diff --git a/scripts/korean-check.ts b/scripts/korean-check.ts
new file mode 100644
index 0000000..3701131
--- /dev/null
+++ b/scripts/korean-check.ts
@@ -0,0 +1,374 @@
+import { type ChildProcess } from 'node:child_process'
+import {
+  acquireBrowserAutomationLock,
+  createBrowserSession,
+  ensurePageServer,
+  getAvailablePort,
+  loadHashReport,
+  type AutomationBrowserKind,
+  type BrowserKind,
+} from './browser-automation.ts'
+
+type ProbeReport = {
+  status: 'ready' | 'error'
+  requestId?: string
+  browserLineMethod?: 'range' | 'span'
+  width?: number
+  predictedHeight?: number
+  actualHeight?: number
+  diffPx?: number
+  predictedLineCount?: number
+  browserLineCount?: number
+  firstBreakMismatch?: {
+    line: number
+    deltaText: string
+    reasonGuess: string
+    oursText: string
+    browserText: string
+  } | null
+  extractorSensitivity?: string | null
+  message?: string
+}
+
+type OracleCase = {
+  label: string
+  text: string
+  width: number
+  font: string
+  lineHeight: number
+  lang: string
+  dir?: 'ltr' | 'rtl'
+  whiteSpace?: 'normal' | 'pre-wrap'
+  wordBreak?: 'normal' | 'keep-all'
+}
+
+const ORACLE_CASES: OracleCase[] = [
+  // B: Edge cases — normal mode
+  {
+    label: 'B1: 한글 자모 단독 (U+1100)',
+    text: 'ᄀᄂᄃ 테스트 ᄀᄂᄃ 테스트 ᄀᄂᄃ',
+    width: 200,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2: 한글 호환 자모 (U+3130)',
+    text: 'ㄱㄴㄷ 호환 자모 ㄱㄴㄷ 호환 자모 ㄱㄴㄷ',
+    width: 200,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B3: 한글+영어 혼합',
+    text: '안녕 Hello 세계 안녕 Hello 세계',
+    width: 200,
+    font: '20px "Apple SD Gothic Neo"',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B4: 한글+숫자',
+    text: '가격은 10,000원 입니다 배송은 3,500원 입니다',
+    width: 200,
+    font: '20px "Apple SD Gothic Neo"',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B5: 한글+CJK 구두점',
+    text: '안녕하세요。잘 부탁합니다。감사합니다。',
+    width: 200,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B6: NBSP + 한글',
+    text: '서울\u00A0시청역 부산\u00A0역',
+    width: 150,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  // B2-width-variants: same texts at different widths to confirm bug is width-sensitive
+  {
+    label: 'B2c-w160: ㅠㅠ 감정 (160px)',
+    text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
+    width: 160,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2c-w140: ㅠㅠ 감정 (140px)',
+    text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
+    width: 140,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2d-w150: ㄹㅇ 슬랭 (150px)',
+    text: '이거 ㄹㅇ임 ㄹㅇ 아니면 뭐야',
+    width: 150,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2f-w150: ㅇㅋ ㄴㄴ (150px)',
+    text: 'ㅇㅋ 알겠어요 ㄴㄴ 그건 아니고',
+    width: 150,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  // B2-variants: Hangul Compatibility Jamo edge cases (the confirmed bug)
+  {
+    label: 'B2b: ㅋㅋ 슬랭 혼합',
+    text: 'ㅋㅋㅋ 진짜 웃기다 ㅋㅋㅋ 진짜로',
+    width: 200,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2c: ㅠㅠ 감정 표현',
+    text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
+    width: 200,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2d: 자음 단독 문장 중간',
+    text: '이거 ㄹㅇ임 ㄹㅇ 아니면 뭐야',
+    width: 180,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2e: 자음만 연속',
+    text: 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎ',
+    width: 150,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  {
+    label: 'B2f: ㅇㅋ ㄴㄴ 인터넷 슬랭',
+    text: 'ㅇㅋ 알겠어요 ㄴㄴ 그건 아니고',
+    width: 180,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+  },
+  // C: Layout modes
+  {
+    label: 'C1: keep-all + 좁은 너비',
+    text: '한국어 테스트 입니다',
+    width: 80,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+    wordBreak: 'keep-all',
+  },
+  {
+    label: 'C2: keep-all + 한글+영어 혼합',
+    text: '한국어 Korean 혼합 테스트',
+    width: 150,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+    wordBreak: 'keep-all',
+  },
+  {
+    label: 'C3: pre-wrap + 한글 하드 브레이크',
+    text: '가나다\n라마바',
+    width: 300,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+    whiteSpace: 'pre-wrap',
+  },
+  {
+    label: 'C4: pre-wrap + 탭 + 한글',
+    text: '가나\t다라',
+    width: 300,
+    font: '20px serif',
+    lineHeight: 34,
+    lang: 'ko',
+    whiteSpace: 'pre-wrap',
+  },
+]
+
+/** Returns the value of the first `--name=value` CLI argument, or null when the flag is absent. */
+function parseStringFlag(name: string): string | null {
+  const prefix = `--${name}=`
+  const arg = process.argv.find(value => value.startsWith(prefix))
+  return arg === undefined ? null : arg.slice(prefix.length)
+}
+
+/**
+ * Reads `--name=value` as a base-10 integer (`Number.parseInt`).
+ * Returns `fallback` when the flag is absent and throws when the value is not numeric.
+ */
+function parseNumberFlag(name: string, fallback: number): number {
+  const raw = parseStringFlag(name)
+  if (raw === null) return fallback
+  const parsed = Number.parseInt(raw, 10)
+  if (!Number.isFinite(parsed)) throw new Error(`Invalid value for --${name}: ${raw}`)
+  return parsed
+}
+
+/**
+ * Parses a comma-separated browser list; null or blank input means chrome and safari.
+ * Throws on any name other than chrome, safari or firefox.
+ */
+function parseBrowsers(value: string | null): AutomationBrowserKind[] {
+  const raw = (value ?? 'chrome,safari').trim()
+  if (raw.length === 0) return ['chrome', 'safari']
+
+  const browsers = raw
+    .split(',')
+    .map(part => part.trim().toLowerCase())
+    .filter(Boolean)
+  // Input such as "," leaves nothing to run; use the default rather than "passing" with zero browsers.
+  if (browsers.length === 0) return ['chrome', 'safari']
+
+  for (const browser of browsers) {
+    if (browser !== 'chrome' && browser !== 'safari' && browser !== 'firefox') {
+      throw new Error(`Unsupported browser ${browser}`)
+    }
+  }
+
+  return browsers as AutomationBrowserKind[]
+}
+
+/** Builds the `/probe` URL for one case; unset dir/whiteSpace/wordBreak default to ltr/normal/normal. */
+function buildProbeUrl(baseUrl: string, requestId: string, testCase: OracleCase): string {
+  const dir = testCase.dir ?? 'ltr'
+  const whiteSpace = testCase.whiteSpace ?? 'normal'
+  const wordBreak = testCase.wordBreak ?? 'normal'
+  return (
+    `${baseUrl}/probe?text=${encodeURIComponent(testCase.text)}` +
+    `&width=${testCase.width}` +
+    `&font=${encodeURIComponent(testCase.font)}` +
+    `&lineHeight=${testCase.lineHeight}` +
+    `&dir=${encodeURIComponent(dir)}` +
+    `&lang=${encodeURIComponent(testCase.lang)}` +
+    `&whiteSpace=${encodeURIComponent(whiteSpace)}` +
+    `&wordBreak=${encodeURIComponent(wordBreak)}` +
+    `&method=span` +
+    `&requestId=${encodeURIComponent(requestId)}`
+  )
+}
+
+/**
+ * True only for a 'ready' report with zero pixel diff, equal line counts and heights,
+ * and an explicit `firstBreakMismatch: null` (a missing field does not count as exact).
+ */
+function reportIsExact(report: ProbeReport): boolean {
+  return (
+    report.status === 'ready' &&
+    report.diffPx === 0 &&
+    report.predictedLineCount === report.browserLineCount &&
+    report.predictedHeight === report.actualHeight &&
+    report.firstBreakMismatch === null
+  )
+}
+
+/**
+ * Prints one PASS/FAIL line for a case, plus the first mismatching break when the report has one.
+ * `browser` is not used by the current implementation.
+ */
+function printCaseResult(browser: AutomationBrowserKind, testCase: OracleCase, report: ProbeReport): void {
+  if (report.status === 'error') {
+    console.log(` FAIL ${testCase.label}: error: ${report.message ?? 'unknown error'}`)
+    return
+  }
+
+  const pass = reportIsExact(report)
+  const icon = pass ? '✓ PASS' : '✗ FAIL'
+  const lines = `[${report.predictedLineCount} lines]`
+  const detail = pass
+    ? lines
+    : `expected=${report.browserLineCount} got=${report.predictedLineCount} width=${testCase.width}px font=${testCase.font}`
+
+  console.log(` ${icon} ${testCase.label.padEnd(40)} ${detail}`)
+
+  if (!pass && report.firstBreakMismatch != null) {
+    console.log(
+      ` break L${report.firstBreakMismatch.line}: ${report.firstBreakMismatch.reasonGuess} | ` +
+        `ours ${JSON.stringify(report.firstBreakMismatch.oursText)} | ` +
+        `browser ${JSON.stringify(report.firstBreakMismatch.browserText)}`,
+    )
+  }
+}
+
+/**
+ * Runs every oracle case in one browser, printing a line per case and a summary.
+ * Returns true only when every case is an exact match.
+ * Throws for Firefox, which these checks do not support.
+ */
+async function runBrowser(browser: AutomationBrowserKind, port: number): Promise<boolean> {
+  // Reject unsupported browsers before taking the automation lock.
+  if (browser === 'firefox') {
+    throw new Error('Firefox is not supported for korean oracle checks')
+  }
+  const reportBrowser: BrowserKind = browser
+
+  const lock = await acquireBrowserAutomationLock(browser)
+  let session: ReturnType<typeof createBrowserSession> | null = null
+  let serverProcess: ChildProcess | null = null
+  let ok = true
+  let pass = 0
+
+  try {
+    // Created inside the try so the lock is released even if session startup throws.
+    session = createBrowserSession(reportBrowser)
+
+    const pageServer = await ensurePageServer(port, '/probe', process.cwd())
+    serverProcess = pageServer.process
+
+    console.log(`\nKorean Layout Check — ${browser.charAt(0).toUpperCase() + browser.slice(1)}`)
+    console.log('─'.repeat(60))
+
+    for (const testCase of ORACLE_CASES) {
+      const requestId = `${browser}-${Date.now()}-${Math.random().toString(36).slice(2)}`
+      const url = buildProbeUrl(pageServer.baseUrl, requestId, testCase)
+      const report = await loadHashReport(session, url, requestId, reportBrowser, timeoutMs)
+      printCaseResult(browser, testCase, report)
+      if (reportIsExact(report)) {
+        pass++
+      } else {
+        ok = false
+      }
+    }
+
+    console.log(`\nSummary: ${browser} ${pass}/${ORACLE_CASES.length} pass`)
+  } finally {
+    session?.close()
+    serverProcess?.kill()
+    lock.release()
+  }
+
+  return ok
+}
+
+// Flags are parsed at module level; runBrowser reads timeoutMs from here.
+const requestedPort = parseNumberFlag('port', 0)
+const browsers = parseBrowsers(parseStringFlag('browser'))
+const timeoutMs = parseNumberFlag('timeout', 60_000)
+
+const port = await getAvailablePort(requestedPort === 0 ? null : requestedPort)
+let overallOk = true
+for (const browser of browsers) {
+  const browserOk = await runBrowser(browser, port)
+  if (!browserOk) overallOk = false
+}
+
+if (!overallOk) process.exitCode = 1

From 00025d7e81148fe11869e9e86ad84834e7d328dc Mon Sep 17 00:00:00 2001
From: mayrang
Date: Fri, 17 Apr 2026 22:53:07 +0900
Subject: [PATCH 3/3] fix: translate korean-check labels to English

---
 scripts/korean-check.ts | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/scripts/korean-check.ts b/scripts/korean-check.ts
index 3701131..c9e056a 100644
--- a/scripts/korean-check.ts
+++ b/scripts/korean-check.ts
@@ -45,7 +45,7 @@ type OracleCase = {
 const ORACLE_CASES: OracleCase[] = [
   // B: Edge cases — normal mode
   {
-    label: 'B1: 한글 자모 단독 (U+1100)',
+    label: 'B1: Hangul Jamo standalone (U+1100)',
     text: 'ᄀᄂᄃ 테스트 ᄀᄂᄃ 테스트 ᄀᄂᄃ',
     width: 200,
     font: '20px serif',
@@ -53,7 +53,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2: 한글 호환 자모 (U+3130)',
+    label: 'B2: Hangul Compatibility Jamo (U+3130)',
     text: 'ㄱㄴㄷ 호환 자모 ㄱㄴㄷ 호환 자모 ㄱㄴㄷ',
     width: 200,
     font: '20px serif',
@@ -61,7 +61,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B3: 한글+영어 혼합',
+    label: 'B3: Korean+English mixed',
     text: '안녕 Hello 세계 안녕 Hello 세계',
     width: 200,
     font: '20px "Apple SD Gothic Neo"',
@@ -69,7 +69,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B4: 한글+숫자',
+    label: 'B4: Korean+numbers mixed',
     text: '가격은 10,000원 입니다 배송은 3,500원 입니다',
     width: 200,
     font: '20px "Apple SD Gothic Neo"',
@@ -77,7 +77,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B5: 한글+CJK 구두점',
+    label: 'B5: Korean+CJK punctuation',
     text: '안녕하세요。잘 부탁합니다。감사합니다。',
     width: 200,
     font: '20px serif',
@@ -85,7 +85,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B6: NBSP + 한글',
+    label: 'B6: NBSP + Korean',
     text: '서울\u00A0시청역 부산\u00A0역',
     width: 150,
     font: '20px serif',
@@ -94,7 +94,7 @@ const ORACLE_CASES: OracleCase[] = [
   },
   // B2-width-variants: same texts at different widths to confirm bug is width-sensitive
   {
-    label: 'B2c-w160: ㅠㅠ 감정 (160px)',
+    label: 'B2c-w160: ㅠㅠ crying expression (160px)',
     text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
     width: 160,
     font: '20px serif',
@@ -102,7 +102,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2c-w140: ㅠㅠ 감정 (140px)',
+    label: 'B2c-w140: ㅠㅠ crying expression (140px)',
     text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
     width: 140,
     font: '20px serif',
@@ -110,7 +110,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2d-w150: ㄹㅇ 슬랭 (150px)',
+    label: 'B2d-w150: ㄹㅇ literally slang (150px)',
     text: '이거 ㄹㅇ임 ㄹㅇ 아니면 뭐야',
     width: 150,
     font: '20px serif',
@@ -118,7 +118,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2f-w150: ㅇㅋ ㄴㄴ (150px)',
+    label: 'B2f-w150: ㅇㅋ/ㄴㄴ okay/nope slang (150px)',
     text: 'ㅇㅋ 알겠어요 ㄴㄴ 그건 아니고',
     width: 150,
     font: '20px serif',
@@ -127,7 +127,7 @@ const ORACLE_CASES: OracleCase[] = [
   },
   // B2-variants: Hangul Compatibility Jamo edge cases (the confirmed bug)
   {
-    label: 'B2b: ㅋㅋ 슬랭 혼합',
+    label: 'B2b: ㅋㅋ laughter slang mixed',
     text: 'ㅋㅋㅋ 진짜 웃기다 ㅋㅋㅋ 진짜로',
     width: 200,
     font: '20px serif',
@@ -135,7 +135,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2c: ㅠㅠ 감정 표현',
+    label: 'B2c: ㅠㅠ crying expression',
     text: 'ㅠㅠ 너무 슬퍼요 ㅠㅠ 정말로',
     width: 200,
     font: '20px serif',
@@ -143,7 +143,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2d: 자음 단독 문장 중간',
+    label: 'B2d: ㄹㅇ literally slang mid-sentence',
     text: '이거 ㄹㅇ임 ㄹㅇ 아니면 뭐야',
     width: 180,
     font: '20px serif',
@@ -151,7 +151,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2e: 자음만 연속',
+    label: 'B2e: consonants-only run',
     text: 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎ',
     width: 150,
     font: '20px serif',
@@ -159,7 +159,7 @@ const ORACLE_CASES: OracleCase[] = [
     lang: 'ko',
   },
   {
-    label: 'B2f: ㅇㅋ ㄴㄴ 인터넷 슬랭',
+    label: 'B2f: ㅇㅋ/ㄴㄴ okay/nope internet slang',
     text: 'ㅇㅋ 알겠어요 ㄴㄴ 그건 아니고',
     width: 180,
     font: '20px serif',
@@ -168,7 +168,7 @@ const ORACLE_CASES: OracleCase[] = [
   },
   // C: Layout modes
   {
-    label: 'C1: keep-all + 좁은 너비',
+    label: 'C1: keep-all + narrow width',
     text: '한국어 테스트 입니다',
     width: 80,
     font: '20px serif',
@@ -177,7 +177,7 @@ const ORACLE_CASES: OracleCase[] = [
     wordBreak: 'keep-all',
   },
   {
-    label: 'C2: keep-all + 한글+영어 혼합',
+    label: 'C2: keep-all + Korean+English mixed',
     text: '한국어 Korean 혼합 테스트',
     width: 150,
     font: '20px serif',
@@ -186,7 +186,7 @@ const ORACLE_CASES: OracleCase[] = [
     wordBreak: 'keep-all',
   },
   {
-    label: 'C3: pre-wrap + 한글 하드 브레이크',
+    label: 'C3: pre-wrap + Korean hard break',
     text: '가나다\n라마바',
     width: 300,
     font: '20px serif',
@@ -195,7 +195,7 @@ const ORACLE_CASES: OracleCase[] = [
     whiteSpace: 'pre-wrap',
   },
   {
-    label: 'C4: pre-wrap + 탭 + 한글',
+    label: 'C4: pre-wrap + tab + Korean',
     text: '가나\t다라',
     width: 300,
     font: '20px serif',