Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 107 additions & 7 deletions src/analysis.ts
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,34 @@ export function setAnalysisLocale(locale?: string): void {
// Matches any single code point whose Unicode Script property is Arabic.
const arabicScriptRe = /\p{Script=Arabic}/u
// Matches any single code point in the Unicode Mark (M) general category.
const combiningMarkRe = /\p{M}/u
// Matches any single code point in the Unicode Decimal_Number (Nd) general category.
const decimalDigitRe = /\p{Nd}/u
// Korean app/product labels often mix Hangul with ASCII letters/digits and
// lightweight token punctuation, but URL/query/key-value separators should
// remain structural boundaries instead of being folded into one token.
// Every entry is a single ASCII character.
const keepAllTextRunSeparators = new Set(['/', '?', '&', '=', ':'])
// Punctuation that isKoreanKeepAllCompactText accepts alongside Hangul and
// ASCII letters/digits when deciding whether a text is a compact token.
const koreanKeepAllInnerPunctuation = new Set(['.', '-', '_', '(', ')'])

/**
 * Reports whether `text` holds at least one Arabic-script code point.
 *
 * @param text - The string to scan.
 * @returns `true` when `arabicScriptRe` finds a match anywhere in `text`.
 */
function containsArabicScript(text: string): boolean {
  // The pattern has no `g`/`y` flag, so exec() keeps no state between calls.
  const firstMatch = arabicScriptRe.exec(text)
  return firstMatch !== null
}

/**
 * Reports whether a code point falls in one of the Hangul ranges this module
 * recognises.
 *
 * @param codePoint - A Unicode code point value.
 * @returns `true` for Hangul Jamo, Hangul Compatibility Jamo, Hangul Jamo
 *   Extended-A, the Hangul Syllables block, and Hangul Jamo Extended-B.
 */
function isHangulCodePoint(codePoint: number): boolean {
  // Hangul Syllables block (U+AC00–U+D7AF).
  if (codePoint >= 0xac00 && codePoint <= 0xd7af) return true
  // Hangul Jamo (U+1100–U+11FF).
  if (codePoint >= 0x1100 && codePoint <= 0x11ff) return true
  // Hangul Compatibility Jamo (U+3130–U+318F).
  if (codePoint >= 0x3130 && codePoint <= 0x318f) return true
  // Hangul Jamo Extended-A (U+A960–U+A97F).
  if (codePoint >= 0xa960 && codePoint <= 0xa97f) return true
  // Hangul Jamo Extended-B (U+D7B0–U+D7FF).
  return codePoint >= 0xd7b0 && codePoint <= 0xd7ff
}

/**
 * Reports whether a code point is an ASCII digit or an ASCII letter.
 *
 * @param codePoint - A Unicode code point value.
 * @returns `true` for `0`-`9`, `A`-`Z` and `a`-`z`; `false` for anything else.
 */
function isAsciiAlphaNumericCodePoint(codePoint: number): boolean {
  const isDigit = codePoint >= 0x30 && codePoint <= 0x39
  const isUpperCaseLetter = codePoint >= 0x41 && codePoint <= 0x5a
  const isLowerCaseLetter = codePoint >= 0x61 && codePoint <= 0x7a
  return isDigit || isUpperCaseLetter || isLowerCaseLetter
}

function isCJKCodePoint(codePoint: number): boolean {
return (
(codePoint >= 0x4E00 && codePoint <= 0x9FFF) ||
Expand Down Expand Up @@ -146,6 +169,47 @@ export function isCJK(s: string): boolean {
return false
}

/**
 * Reports whether any code point in `text` is Hangul.
 *
 * @param text - The string to scan.
 * @returns `true` as soon as `isHangulCodePoint` accepts one code point;
 *   `false` for an empty string or when no code point qualifies.
 */
function containsHangulText(text: string): boolean {
  let index = 0
  while (index < text.length) {
    const codePoint = text.codePointAt(index)!
    if (isHangulCodePoint(codePoint)) return true
    // Astral code points occupy two UTF-16 code units; everything else one.
    index += codePoint > 0xffff ? 2 : 1
  }
  return false
}

/**
 * Reports whether `text` contains any character listed in
 * `keepAllTextRunSeparators`.
 *
 * @param text - The string to scan.
 * @returns `true` if at least one separator character is present.
 */
export function containsKeepAllTextRunSeparator(text: string): boolean {
  // Every separator is a single ASCII code unit, so scanning UTF-16 code units
  // finds exactly the same matches as scanning whole code points.
  for (let index = 0; index < text.length; index++) {
    if (keepAllTextRunSeparators.has(text.charAt(index))) return true
  }
  return false
}

/**
 * Reports whether `text` contains a separator that blocks a keep-all run from
 * continuing into it.
 *
 * Any character from `keepAllTextRunSeparators` blocks, with one exception:
 * a `?` or `:` that is the very last character of `text` is ignored.
 *
 * @param text - The text that would be appended to the current run.
 * @returns `true` if a blocking separator is present.
 */
function containsBlockingKeepAllTextRunEntrySeparator(text: string): boolean {
  const lastIndex = text.length - 1
  // Separators are single ASCII code units, so a code-unit scan is sufficient.
  for (let index = 0; index < text.length; index++) {
    const unit = text.charAt(index)
    if (!keepAllTextRunSeparators.has(unit)) continue
    const isTrailingQuestionOrColon = index === lastIndex && (unit === '?' || unit === ':')
    if (!isTrailingQuestionOrColon) return true
  }
  return false
}

/**
 * Reports whether `text` is non-empty and made up solely of Hangul code
 * points, ASCII letters/digits, and characters from
 * `koreanKeepAllInnerPunctuation`.
 *
 * @param text - The candidate token text.
 * @returns `false` for the empty string or on the first disallowed character;
 *   `true` otherwise.
 */
function isKoreanKeepAllCompactText(text: string): boolean {
  if (text === '') return false
  for (const symbol of text) {
    const value = symbol.codePointAt(0)!
    const isAllowed =
      isHangulCodePoint(value) ||
      isAsciiAlphaNumericCodePoint(value) ||
      koreanKeepAllInnerPunctuation.has(symbol)
    if (!isAllowed) return false
  }
  return true
}

function endsWithLineStartProhibitedText(text: string): boolean {
const last = getLastCodePoint(text)
return last !== null && (kinsokuStart.has(last) || leftStickyPunctuation.has(last))
Expand Down Expand Up @@ -174,6 +238,23 @@ export function canContinueKeepAllTextRun(previousText: string): boolean {
)
}

/**
 * Decides whether a keep-all run may continue from `previousText` into
 * `nextText`.
 *
 * When neither side contains Hangul, the answer is whatever
 * `canContinueKeepAllTextRun(previousText)` returns. When either side contains
 * Hangul, the run additionally requires that `previousText` has no keep-all
 * separator and that `nextText` has no blocking entry separator.
 *
 * @param previousText - The text on the left of the boundary.
 * @param nextText - The text on the right of the boundary.
 * @returns `true` if the two texts may be merged into one run.
 */
export function canContinueKeepAllTextRunAcrossBoundary(previousText: string, nextText: string): boolean {
  const touchesHangul = containsHangulText(previousText) || containsHangulText(nextText)
  const previousAllowsContinuation = canContinueKeepAllTextRun(previousText)
  if (!touchesHangul || !previousAllowsContinuation) return previousAllowsContinuation

  const hitsSeparator =
    containsKeepAllTextRunSeparator(previousText) ||
    containsBlockingKeepAllTextRunEntrySeparator(nextText)
  return !hitsSeparator
}

/**
 * Decides whether `previousText` and `nextText` may be merged under the
 * Korean compact-token rule.
 *
 * All of the following must hold: the generic boundary check passes, at least
 * one side contains Hangul, and both sides are compact Korean text according
 * to `isKoreanKeepAllCompactText`.
 *
 * @param previousText - The text on the left of the boundary.
 * @param nextText - The text on the right of the boundary.
 * @returns `true` if the Korean keep-all merge applies.
 */
export function canContinueKeepAllTextRunForKorean(previousText: string, nextText: string): boolean {
  return (
    canContinueKeepAllTextRunAcrossBoundary(previousText, nextText) &&
    (containsHangulText(previousText) || containsHangulText(nextText)) &&
    isKoreanKeepAllCompactText(previousText) &&
    isKoreanKeepAllCompactText(nextText)
  )
}

export const kinsokuStart = new Set([
'\uFF0C',
'\uFF0E',
Expand Down Expand Up @@ -1194,14 +1275,18 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
let pendingStart = 0
let pendingContainsCJK = false
let pendingCanContinue = false
let pendingStartedAfterKeepAllSeparator = false
let previousTextHadKeepAllSeparator = false

function flushPendingText(): void {
if (pendingTextParts === null) return
texts.push(joinTextParts(pendingTextParts))
const text = joinTextParts(pendingTextParts)
texts.push(text)
isWordLike.push(pendingWordLike)
kinds.push('text')
starts.push(pendingStart)
pendingTextParts = null
previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text)
}

for (let i = 0; i < segmentation.len; i++) {
Expand All @@ -1214,12 +1299,25 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
const textContainsCJK = containsCJKText(text)
const textCanContinue = canContinueKeepAllTextRun(text)

if (pendingTextParts !== null && pendingContainsCJK && pendingCanContinue) {
pendingTextParts.push(text)
pendingWordLike = pendingWordLike || wordLike
pendingContainsCJK = pendingContainsCJK || textContainsCJK
pendingCanContinue = textCanContinue
continue
if (pendingTextParts !== null) {
const previousText = pendingTextParts[pendingTextParts.length - 1]!
const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, text)
const canUseDefaultCJKKeepAll =
pendingContainsCJK &&
pendingCanContinue &&
canContinueAcrossBoundary &&
(!pendingStartedAfterKeepAllSeparator || textContainsCJK)
const canUseKoreanKeepAll =
!pendingStartedAfterKeepAllSeparator &&
canContinueKeepAllTextRunForKorean(previousText, text)

if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) {
pendingTextParts.push(text)
pendingWordLike = pendingWordLike || wordLike
pendingContainsCJK = pendingContainsCJK || textContainsCJK
pendingCanContinue = textCanContinue
continue
}
}

flushPendingText()
Expand All @@ -1228,6 +1326,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
pendingStart = start
pendingContainsCJK = textContainsCJK
pendingCanContinue = textCanContinue
pendingStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator
continue
}

Expand All @@ -1236,6 +1335,7 @@ function mergeKeepAllTextSegments(segmentation: MergedSegmentation): MergedSegme
isWordLike.push(wordLike)
kinds.push(kind)
starts.push(start)
previousTextHadKeepAllSeparator = false
}

flushPendingText()
Expand Down
68 changes: 68 additions & 0 deletions src/layout.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,74 @@ describe('prepare invariants', () => {
expect(prepareWithSegments('foo\u00A0世界', FONT, { wordBreak: 'keep-all' }).segments).toEqual(['foo\u00A0', '世界'])
})

test('keep-all keeps compact Korean mixed-script tokens together', () => {
  // Each label mixes Hangul with ASCII letters and/or digits and is expected
  // to come back as exactly one segment.
  const compactLabels = [
    'AI정보공학과',
    'README카드생성기',
    'api문서v2가이드',
    '2026학년도공지',
    '한글ABC123혼합문장',
    'GitHubREADME한글가이드',
    '공지사항v2업데이트',
  ]

  compactLabels.forEach(label => {
    const { segments } = prepareWithSegments(label, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual([label])
  })
})

test('keep-all does not merge Korean text across path and query separators', () => {
  // [input, expected segments] pairs, checked in order.
  const cases: [string, string[]][] = [
    ['검색어?정렬=최신', ['검색어?', '정렬', '=', '최신']],
    ['docs/README한글가이드', ['docs', '/', 'README', '한글가이드']],
    ['hello:한글테스트', ['hello:', '한글테스트']],
    ['path/to/한글문서', ['path', '/', 'to', '/', '한글문서']],
    ['key=value한글', ['key', '=', 'value', '한글']],
    ['한글&영문조합', ['한글', '&', '영문조합']],
  ]

  for (const [input, expectedSegments] of cases) {
    const { segments } = prepareWithSegments(input, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual(expectedSegments)
  }
})

test('keep-all preserves Korean mixed-punctuation token behavior', () => {
  // Labels whose inner punctuation must not split them.
  const singleSegmentLabels = [
    '공지사항(수정본)',
    'v2.1한글업데이트',
    '한글-영문-혼합',
    '한글_영문_조합',
  ]
  singleSegmentLabels.forEach(label => {
    const { segments } = prepareWithSegments(label, FONT, { wordBreak: 'keep-all' })
    expect(segments).toEqual([label])
  })

  // A zero-width space between the ASCII and Hangul parts stays its own segment.
  const zeroWidthSpaceResult = prepareWithSegments('AI\u200B정보공학과', FONT, { wordBreak: 'keep-all' })
  expect(zeroWidthSpaceResult.segments).toEqual(['AI', '\u200B', '정보공학과'])
})

test('adjacent CJK text units stay breakable after visible text, not only after spaces', () => {
const prepared = prepareWithSegments('foo 世界 bar', FONT)
expect(prepared.segments).toEqual(['foo', ' ', '世', '界', ' ', 'bar'])
Expand Down
24 changes: 21 additions & 3 deletions src/layout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ import { computeSegmentLevels } from './bidi.js'
import {
analyzeText,
canContinueKeepAllTextRun,
canContinueKeepAllTextRunAcrossBoundary,
canContinueKeepAllTextRunForKorean,
clearAnalysisCaches,
containsKeepAllTextRunSeparator,
endsWithClosingQuote,
isCJK,
isNumericRunSegment,
Expand Down Expand Up @@ -282,20 +285,34 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] {
let currentStart = units[0]!.start
let currentContainsCJK = isCJK(units[0]!.text)
let currentCanContinue = canContinueKeepAllTextRun(units[0]!.text)
let currentStartedAfterKeepAllSeparator = false
let previousTextHadKeepAllSeparator = false

function flushCurrent(): void {
const text = currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join('')
merged.push({
text: currentTextParts.length === 1 ? currentTextParts[0]! : currentTextParts.join(''),
text,
start: currentStart,
})
previousTextHadKeepAllSeparator = containsKeepAllTextRunSeparator(text)
}

for (let i = 1; i < units.length; i++) {
const next = units[i]!
const nextContainsCJK = isCJK(next.text)
const nextCanContinue = canContinueKeepAllTextRun(next.text)

if (currentContainsCJK && currentCanContinue) {
const previousText = currentTextParts[currentTextParts.length - 1]!
const canContinueAcrossBoundary = canContinueKeepAllTextRunAcrossBoundary(previousText, next.text)
const canUseDefaultCJKKeepAll =
currentContainsCJK &&
currentCanContinue &&
canContinueAcrossBoundary &&
(!currentStartedAfterKeepAllSeparator || nextContainsCJK)
const canUseKoreanKeepAll =
!currentStartedAfterKeepAllSeparator &&
canContinueKeepAllTextRunForKorean(previousText, next.text)

if (canUseDefaultCJKKeepAll || canUseKoreanKeepAll) {
currentTextParts.push(next.text)
currentContainsCJK = currentContainsCJK || nextContainsCJK
currentCanContinue = nextCanContinue
Expand All @@ -307,6 +324,7 @@ function mergeKeepAllTextUnits(units: MeasuredTextUnit[]): MeasuredTextUnit[] {
currentStart = next.start
currentContainsCJK = nextContainsCJK
currentCanContinue = nextCanContinue
currentStartedAfterKeepAllSeparator = previousTextHadKeepAllSeparator
}

flushCurrent()
Expand Down