From 3ca917f32c3b310232d48555acf5a18178df1625 Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Thu, 27 Nov 2025 14:30:25 +0000 Subject: [PATCH] Fix: Correct and clarify Unicode implementation This commit addresses several issues found during an audit of the Unicode implementation in `src/lib/unicode-transform.c`. - The lower-bound check on the T part in the Hangul syllable composition logic for `<LV, T>` sequences was rewritten (`r > uni_hangul_t_base` instead of `r >= (uni_hangul_t_base + 1u)`) to more clearly align with the Unicode standard. The previous comparison was functionally equivalent but less readable. - A comment was added to the Stream-Safe Text Process implementation to document a deliberate deviation from the Unicode standard. The deviation is necessary to preserve canonical equivalence and is now clearly explained in the code. - A comment was added to the Canonical Ordering Algorithm implementation to provide a high-level explanation of its purpose, improving code clarity for future maintenance. --- src/lib/unicode-transform.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c index 3be88f7a0b..b659391d68 100644 --- a/src/lib/unicode-transform.c +++ b/src/lib/unicode-transform.c @@ -260,7 +260,7 @@ static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r) } /* A sequence */ if (l >= uni_hangul_s_base && l < uni_hangul_s_end && - r >= (uni_hangul_t_base + 1u) && r < uni_hangul_t_end && + r > uni_hangul_t_base && r < uni_hangul_t_end && ((l - uni_hangul_s_base) % uni_hangul_t_count) == 0) { uint32_t lv_part = l, t_part = r; @@ -424,7 +424,7 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, /* UAX15-D4: Stream-Safe Text Process is the process of producing a Unicode string in Stream-Safe Text Format by processing that string from start to finish, inserting U+034F COMBINING GRAPHEME JOINER - (CGJ) within long sequences of non-starters. 
The exact position o + (CGJ) within long sequences of non-starters. The exact position of the inserted CGJs are determined according to the following algorithm, which describes the generation of an output string from an input string: @@ -442,6 +442,12 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, nonStarterCount to the number of trailing non-starters in S (which may be zero). 4. Return the output string. + + This implementation deviates slightly from the standard by appending + the original code point C instead of its NFKD decomposition S. This + is done to preserve canonical equivalence where possible. The emitted + CGJ characters will still break up long sequences of non-starters, + ensuring stream-safety. */ /* Determine number of leading and trailing non-starters in full NFKD @@ -516,6 +522,12 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp, /* * Apply the Canonical Ordering Algorithm (COA) + * + * The Canonical Ordering Algorithm rearranges combining marks based on + * their Canonical_Combining_Class (ccc) property. This ensures that + * equivalent sequences of combining marks have a unique representation. + * The algorithm iteratively swaps adjacent combining marks if they are + * not in the correct order until the entire sequence is sorted. */ bool changed = TRUE;