From c74447eb5dde34183d71607a0c745cf94d3fdccb Mon Sep 17 00:00:00 2001 From: doyoon530 <150874253+doyoon530@users.noreply.github.com> Date: Sat, 11 Apr 2026 20:34:46 +0900 Subject: [PATCH] Improve Korean keep-all mixed-script breaks --- src/analysis.ts | 114 ++++++++++++++++++++++++++++++++++++++++++--- src/layout.test.ts | 68 +++++++++++++++++++++++++++ src/layout.ts | 24 ++++++++-- 3 files changed, 196 insertions(+), 10 deletions(-) diff --git a/src/analysis.ts b/src/analysis.ts index 4abdac4..a4494fe 100644 --- a/src/analysis.ts +++ b/src/analysis.ts @@ -98,11 +98,34 @@ export function setAnalysisLocale(locale?: string): void { const arabicScriptRe = /\p{Script=Arabic}/u const combiningMarkRe = /\p{M}/u const decimalDigitRe = /\p{Nd}/u +// Korean app/product labels often mix Hangul with ASCII letters/digits and +// lightweight token punctuation, but URL/query/key-value separators should +// remain structural boundaries instead of being folded into one token. +const keepAllTextRunSeparators = new Set(['/', '?', '&', '=', ':']) +const koreanKeepAllInnerPunctuation = new Set(['.', '-', '_', '(', ')']) function containsArabicScript(text: string): boolean { return arabicScriptRe.test(text) } +function isHangulCodePoint(codePoint: number): boolean { + return ( + (codePoint >= 0xAC00 && codePoint <= 0xD7AF) || + (codePoint >= 0x1100 && codePoint <= 0x11FF) || + (codePoint >= 0x3130 && codePoint <= 0x318F) || + (codePoint >= 0xA960 && codePoint <= 0xA97F) || + (codePoint >= 0xD7B0 && codePoint <= 0xD7FF) + ) +} + +function isAsciiAlphaNumericCodePoint(codePoint: number): boolean { + return ( + (codePoint >= 0x30 && codePoint <= 0x39) || + (codePoint >= 0x41 && codePoint <= 0x5A) || + (codePoint >= 0x61 && codePoint <= 0x7A) + ) +} + function isCJKCodePoint(codePoint: number): boolean { return ( (codePoint >= 0x4E00 && codePoint <= 0x9FFF) || @@ -146,6 +169,47 @@ export function isCJK(s: string): boolean { return false } +function containsHangulText(text: string): boolean { + for (const ch of text) { + if (isHangulCodePoint(ch.codePointAt(0)!)) return true + } + return false +} + +export function containsKeepAllTextRunSeparator(text: string): boolean { + for (const ch of text) { + if (keepAllTextRunSeparators.has(ch)) return true + } + return false +} + +function containsBlockingKeepAllTextRunEntrySeparator(text: string): boolean { + let offset = 0 + for (const ch of text) { + offset += ch.length + if (!keepAllTextRunSeparators.has(ch)) continue + if ((ch === '?' || ch === ':') && offset === text.length) continue + return true + } + return false +} + +function isKoreanKeepAllCompactText(text: string): boolean { + if (text.length === 0) return false + for (const ch of text) { + const codePoint = ch.codePointAt(0)! + if ( + isHangulCodePoint(codePoint) || + isAsciiAlphaNumericCodePoint(codePoint) || + koreanKeepAllInnerPunctuation.has(ch) + ) { + continue + } + return false + } + return true +} + function endsWithLineStartProhibitedText(text: string): boolean { const last = getLastCodePoint(text) return last !== null && (kinsokuStart.has(last) || leftStickyPunctuation.has(last)) @@ -174,6 +238,23 @@ export function canContinueKeepAllTextRun(previousText: string): boolean { ) } +export function canContinueKeepAllTextRunAcrossBoundary(previousText: string, nextText: string): boolean { + const hasHangulBoundary = containsHangulText(previousText) || containsHangulText(nextText) + if (!hasHangulBoundary) return canContinueKeepAllTextRun(previousText) + + return ( + canContinueKeepAllTextRun(previousText) && + !containsKeepAllTextRunSeparator(previousText) && + !containsBlockingKeepAllTextRunEntrySeparator(nextText) + ) +} + +export function canContinueKeepAllTextRunForKorean(previousText: string, nextText: string): boolean { + if (!canContinueKeepAllTextRunAcrossBoundary(previousText, nextText)) return false + if (!containsHangulText(previousText) && !containsHangulText(nextText)) return false + return isKoreanKeepAllCompactText(previousText) && isKoreanKeepAllCompactText(nextText) +} + export const kinsokuStart = new Set([ '\uFF0C', '\uFF0E', @@ -1194,14 +1275,18 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme let pendingStart = 0 let pendingContainsCJK = false let pendingCanContinue = false + let pendingStartedAfterKeepAllSeparator = false + let previousTextHadKeepAllSeparator = false function flushPendingText(): void { if (pendingTextParts === null) return - texts.push(joinTextParts(pendingTextParts)) + const text = joinTextParts(pendingTextParts) + texts.push(text) isWordLike.push(pendingWordLike) kinds.push('text') starts.push(pendingStart) pendingTextParts = null + previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text) } for (let i = 0; i < segmentation.len; i++) { @@ -1214,12 +1299,25 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme const textContainsCJK = containsCJKText(text) const textCanContinue = canContinueKeepAllTextRun(text) - if (pendingTextParts !== null && pendingContainsCJK && pendingCanContinue) { - pendingTextParts.push(text) - pendingWordLike = pendingWordLike || wordLike - pendingContainsCJK = pendingContainsCJK || textContainsCJK - pendingCanContinue = textCanContinue - continue + if (pendingTextParts !== null) { + const previousText = pendingTextParts[pendingTextParts.length - 1]! + const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, text) + const canUseDefaultCJKKeepAll = + pendingContainsCJK && + pendingCanContinue && + canContinueAcrossBoundary && + (!pendingStartedAfterKeepAllSeparator || textContainsCJK) + const canUseKoreanKeepAll = + !pendingStartedAfterKeepAllSeparator && + canContinueKeepAllTextRunForKorean(previousText, text) + + if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) { + pendingTextParts.push(text) + pendingWordLike = pendingWordLike || wordLike + pendingContainsCJK = pendingContainsCJK || textContainsCJK + pendingCanContinue = textCanContinue + continue + } } flushPendingText() @@ -1228,6 +1326,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme pendingStart = start pendingContainsCJK = textContainsCJK pendingCanContinue = textCanContinue + pendingStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator continue } @@ -1236,6 +1335,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme isWordLike.push(wordLike) kinds.push(kind) starts.push(start) + previousTextHadKeepAllSeparator = false } flushPendingText() diff --git a/src/layout.test.ts b/src/layout.test.ts index a37ebd1..8025bdd 100644 --- a/src/layout.test.ts +++ b/src/layout.test.ts @@ -570,6 +570,74 @@ describe('prepare invariants', () => { expect(prepareWithSegments('foo\u00A0世界', FONT, { wordBreak: 'keep-all' }).segments).toEqual(['foo\u00A0', '世界']) }) + test('keep-all keeps compact Korean mixed-script tokens together', () => { + for (const text of [ + 'AI정보공학과', + 'README카드생성기', + 'api문서v2가이드', + '2026학년도공지', + '한글ABC123혼합문장', + 'GitHubREADME한글가이드', + '공지사항v2업데이트', + ]) { + expect(prepareWithSegments(text, FONT, { wordBreak: 'keep-all' }).segments).toEqual([text]) + } + }) + + test('keep-all does not merge Korean text across path and query separators', () => { + expect(prepareWithSegments('검색어?정렬=최신', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + '검색어?', + '정렬', + '=', + '최신', + ]) + expect(prepareWithSegments('docs/README한글가이드', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + 'docs', + '/', + 'README', + '한글가이드', + ]) + expect(prepareWithSegments('hello:한글테스트', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + 'hello:', + '한글테스트', + ]) + expect(prepareWithSegments('path/to/한글문서', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + 'path', + '/', + 'to', + '/', + '한글문서', + ]) + expect(prepareWithSegments('key=value한글', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + 'key', + '=', + 'value', + '한글', + ]) + expect(prepareWithSegments('한글&영문조합', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + '한글', + '&', + '영문조합', + ]) + }) + + test('keep-all preserves Korean mixed-punctuation token behavior', () => { + for (const text of [ + '공지사항(수정본)', + 'v2.1한글업데이트', + '한글-영문-혼합', + '한글_영문_조합', + ]) { + expect(prepareWithSegments(text, FONT, { wordBreak: 'keep-all' }).segments).toEqual([text]) + } + + expect(prepareWithSegments('AI\u200B정보공학과', FONT, { wordBreak: 'keep-all' }).segments).toEqual([ + 'AI', + '\u200B', + '정보공학과', + ]) + }) + test('adjacent CJK text units stay breakable after visible text, not only after spaces', () => { const prepared = prepareWithSegments('foo 世界 bar', FONT) expect(prepared.segments).toEqual(['foo', ' ', '世', '界', ' ', 'bar']) diff --git a/src/layout.ts b/src/layout.ts index ec8e0e8..633738d 100644 --- a/src/layout.ts +++ b/src/layout.ts @@ -36,7 +36,10 @@ import { computeSegmentLevels } from './bidi.js' import { analyzeText, canContinueKeepAllTextRun, + canContinueKeepAllTextRunAcrossBoundary, + canContinueKeepAllTextRunForKorean, clearAnalysisCaches, + containsKeepAllTextRunSeparator, endsWithClosingQuote, isCJK, isNumericRunSegment, @@ -282,20 +285,34 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] { let currentStart = units[0]!.start let currentContainsCJK = isCJK(units[0]!.text) let currentCanContinue = canContinueKeepAllTextRun(units[0]!.text) + let currentStartedAfterKeepAllSeparator = false + let previousTextHadKeepAllSeparator = false function flushCurrent(): void { + const text = currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join('') merged.push({ - text: currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join(''), + text, start: currentStart, }) + previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text) } for (let i = 1; i < units.length; i++) { const next = units[i]! const nextContainsCJK = isCJK(next.text) const nextCanContinue = canContinueKeepAllTextRun(next.text) - - if (currentContainsCJK && currentCanContinue) { + const previousText = currentTextParts[currentTextParts.length - 1]! + const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, next.text) + const canUseDefaultCJKKeepAll = + currentContainsCJK && + currentCanContinue && + canContinueAcrossBoundary && + (!currentStartedAfterKeepAllSeparator || nextContainsCJK) + const canUseKoreanKeepAll = + !currentStartedAfterKeepAllSeparator && + canContinueKeepAllTextRunForKorean(previousText, next.text) + + if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) { currentTextParts.push(next.text) currentContainsCJK = currentContainsCJK || nextContainsCJK currentCanContinue = nextCanContinue @@ -307,6 +324,7 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] { currentStart = next.start currentContainsCJK = nextContainsCJK currentCanContinue = nextCanContinue + currentStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator } flushCurrent()