From 3ca917f32c3b310232d48555acf5a18178df1625 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Thu, 27 Nov 2025 14:30:25 +0000
Subject: [PATCH] Fix: Correct and clarify Unicode implementation

This commit addresses several issues found during an audit of the Unicode
implementation in `lib/unicode-transform.c`.

- The bounds check on TPart in the Hangul syllable composition logic for
  `<LVPart, TPart>` sequences was rewritten from `r >= t_base + 1` to the
  equivalent `r > t_base` for readability. There is no functional change.

- A comment was added to the Stream-Safe Text Process implementation
  pointing out that the original code point C, rather than its NFKD
  decomposition S, is appended to the output, and that the inserted CGJ
  characters still break up long sequences of non-starters.

- A comment was added to the Canonical Ordering Algorithm implementation
  to provide a high-level explanation of its purpose, improving code
  clarity for future maintenance.

- A typo ("position o" -> "position of") in the quoted UAX15-D4 text was
  fixed.
---
 src/lib/unicode-transform.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/lib/unicode-transform.c b/src/lib/unicode-transform.c
index 3be88f7a0b..b659391d68 100644
--- a/src/lib/unicode-transform.c
+++ b/src/lib/unicode-transform.c
@@ -260,7 +260,7 @@ static uint32_t unicode_hangul_compose_pair(uint32_t l, uint32_t r)
 	}
 	/* A sequence <LVPart, TPart> */
 	if (l >= uni_hangul_s_base && l < uni_hangul_s_end &&
-	    r >= (uni_hangul_t_base + 1u) && r < uni_hangul_t_end &&
+	    r > uni_hangul_t_base && r < uni_hangul_t_end &&
 	    ((l - uni_hangul_s_base) % uni_hangul_t_count) == 0) {
 		uint32_t lv_part = l, t_part = r;
 
@@ -424,7 +424,7 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
 	/* UAX15-D4: Stream-Safe Text Process is the process of producing a
 	   Unicode string in Stream-Safe Text Format by processing that string
 	   from start to finish, inserting U+034F COMBINING GRAPHEME JOINER
-	   (CGJ) within long sequences of non-starters. The exact position o
+	   (CGJ) within long sequences of non-starters. The exact position of
 	   the inserted CGJs are determined according to the following
 	   algorithm, which describes the generation of an output string from an
 	   input string:
@@ -442,6 +442,12 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
 		   nonStarterCount to the number of trailing non-starters in S
 		   (which may be zero).
 	   4. Return the output string.
+
+	   Note that the original code point C is appended (step 3c), not its
+	   NFKD decomposition S; this matches UAX15-D4 and is not a deviation
+	   from it. S is only used to count leading and trailing non-starters.
+	   The emitted CGJ characters still break up long sequences of
+	   non-starters, ensuring stream-safety.
 	 */
 
 	/* Determine number of leading and trailing non-starters in full NFKD
@@ -516,6 +522,12 @@ unicode_nf_cp(struct unicode_nf_context *ctx, uint32_t cp,
 
 	/*
 	 * Apply the Canonical Ordering Algorithm (COA)
+	 *
+	 * The Canonical Ordering Algorithm rearranges non-starters based on
+	 * their Canonical_Combining_Class (ccc) property, so that canonically
+	 * equivalent sequences get a unique representation. Adjacent code
+	 * points A, B are swapped as long as ccc(A) > ccc(B) > 0. Starters
+	 * (ccc = 0) never move and marks with equal ccc keep their order.
 	 */
 
 	bool changed = TRUE;