From dd1884c775594f9e4fe90d2994cb2af103170849 Mon Sep 17 00:00:00 2001
From: "anthropic-code-agent[bot]" <242468646+Claude@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:01:32 +0000
Subject: [PATCH 1/2] Initial plan


From d193ab9f8e826b00b13e3acfcd79aa08e4d57ee9 Mon Sep 17 00:00:00 2001
From: "anthropic-code-agent[bot]" <242468646+Claude@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:07:59 +0000
Subject: [PATCH 2/2] Fix byte offset issue: convert character offsets to byte
 offsets

- Modified Find(string, int) to convert character offset to byte offset using Encoding.UTF8.GetByteCount
- Modified Captures(string, int) to convert character offset to byte offset
- Updated XML documentation to clarify that string methods use character offsets, while byte methods use byte offsets
- Added comprehensive tests for multi-byte UTF-8 characters (emojis and Asian characters)
- All 71 tests passing (6 new tests added)

Agent-Logs-Url: https://github.com/crispthinking/IronRe2/sessions/2d26941d-972d-4c97-a3ec-ad34e99c1ff3

Co-authored-by: corpo-iwillspeak <265613520+corpo-iwillspeak@users.noreply.github.com>
---
 src/IronRe2/Regex.cs             |  18 ++---
 test/IronRe2.Tests/RegexTests.cs | 112 +++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 8 deletions(-)
diff --git a/src/IronRe2/Regex.cs b/src/IronRe2/Regex.cs
index 651d217..1733ddc 100644
--- a/src/IronRe2/Regex.cs
+++ b/src/IronRe2/Regex.cs
@@ -183,12 +183,13 @@ public Match Find(string haystack)
     ///     of the match
     /// </summary>
     /// <param name="haystack">The string to search for the pattern</param>
-    /// <param name="offset">The offset to start the search at</param>
+    /// <param name="offset">The offset to start the search at, counted in UTF-16 <c>char</c>s of <paramref name="haystack" /> (not UTF-8 bytes)</param>
     /// <returns>The match data for the match</returns>
     public Match Find(string haystack, int offset)
     {
         var hayBytes = Encoding.UTF8.GetBytes(haystack);
-        return Find(hayBytes, offset);
+        var charsBeforeOffset = haystack.AsSpan(0, offset); // offset counts chars, not bytes
+        return Find(hayBytes, Encoding.UTF8.GetByteCount(charsBeforeOffset));
     }
 
     /// <summary>
@@ -205,8 +206,8 @@ public Match Find(ReadOnlyMemory<byte> hayBytes)
     ///     Find the pattern starting at the given offset and return the extent
     ///     of the match
     /// </summary>
-    /// <param name="hayBytes">The string to search for the pattern</param>
-    /// <param name="offset">The offset to start the search at</param>
+    /// <param name="hayBytes">The bytes to search for the pattern</param>
+    /// <param name="offset">The byte offset to start the search at</param>
     /// <returns>The match data for the match</returns>
     public Match Find(ReadOnlyMemory<byte> hayBytes, int offset)
     {
@@ -283,12 +284,13 @@ public Captures Captures(string haystack)
     ///     </para>
     /// </summary>
     /// <param name="haystack">The string to search for the pattern</param>
-    /// <param name="offset">The offest to start searching from</param>
+    /// <param name="offset">The offset to start searching from, counted in UTF-16 <c>char</c>s of <paramref name="haystack" /> (not UTF-8 bytes)</param>
     /// <returns>The captures data</returns>
     public Captures Captures(string haystack, int offset)
     {
         var hayBytes = Encoding.UTF8.GetBytes(haystack);
-        return Captures(hayBytes, offset);
+        var charsBeforeOffset = haystack.AsSpan(0, offset); // offset counts chars, not bytes
+        return Captures(hayBytes, Encoding.UTF8.GetByteCount(charsBeforeOffset));
     }
 
     /// <summary>
@@ -316,8 +318,8 @@ public Captures Captures(ReadOnlyMemory<byte> haystack)
     ///         of each of the regex's capturing groups.
     ///     </para>
     /// </summary>
-    /// <param name="haystack">The string to search for the pattern</param>
-    /// <param name="offset">The offest to start searching from</param>
+    /// <param name="haystack">The bytes to search for the pattern</param>
+    /// <param name="offset">The byte offset to start searching from</param>
     /// <returns>The captures data</returns>
     public Captures Captures(ReadOnlyMemory<byte> haystack, int offset)
     {
diff --git a/test/IronRe2.Tests/RegexTests.cs b/test/IronRe2.Tests/RegexTests.cs
index a34bd47..e7769f9 100644
--- a/test/IronRe2.Tests/RegexTests.cs
+++ b/test/IronRe2.Tests/RegexTests.cs
@@ -389,6 +389,118 @@ public void CapturesOutOfBounds()
         Assert.Throws<IndexOutOfRangeException>(() => match[-2]);
     }
 
+    [Fact]
+    public void FindWithCharacterOffsetInMultiByteString()
+    {
+        // Find(string, int) takes its offset in UTF-16 chars while Match.Start / Match.End are
+        // UTF-8 byte positions, so the two diverge once the haystack leaves the ASCII range.
+        Regex re = new("world");
+
+        // 🌍 (U+1F30D) lies outside the BMP: it is a surrogate pair (2 chars) in the string
+        // and a 4-byte sequence once encoded as UTF-8.
+        const string haystack = "Hello 🌍 world";
+
+        // Char indices: H(0) e(1) l(2) l(3) o(4) ' '(5) 🌍(6-7) ' '(8) w(9) o(10) r(11) l(12) d(13)
+        // Byte indices: H(0) e(1) l(2) l(3) o(4) ' '(5) 🌍(6-9) ' '(10) w(11) o(12) r(13) l(14) d(15)
+
+        // Start at char index 9, the 'w' itself (index 8 is the space after the emoji)
+        var match = re.Find(haystack, 9);
+
+        Assert.True(match.Matched);
+        // "world" occupies bytes 11..15, so the half-open byte range is [11, 16)
+        Assert.Equal(11, match.Start);
+        Assert.Equal(16, match.End);
+        Assert.Equal("world", match.ExtractedText);
+    }
+
+    [Fact]
+    public void FindWithCharacterOffsetBeforeMultiByteCharacter()
+    {
+        // Offset 6 is where the emoji begins; everything before it is ASCII.
+        const string haystack = "Hello 🌍 world";
+        const int emojiIndex = 6;
+        var emojiPattern = new Regex("🌍");
+
+        var found = emojiPattern.Find(haystack, emojiIndex);
+        Assert.True(found.Matched);
+        Assert.Equal("🌍", found.ExtractedText);
+    }
+
+    [Fact]
+    public void CapturesWithCharacterOffsetInMultiByteString()
+    {
+        // Captures(string, int) takes its offset in UTF-16 chars, like Find(string, int)
+        Regex re = new(@"(\w+)");
+
+        // 🌍 is a surrogate pair: it fills char indices 6-7, so ' ' is at 8 and 'w' at 9
+        const string haystack = "Hello 🌍 world";
+
+        // Start searching at char index 9, the 'w' of "world"
+        var captures = re.Captures(haystack, 9);
+
+        Assert.True(captures.Matched);
+        Assert.Equal("world", captures[0].ExtractedText);
+        Assert.Equal("world", captures[1].ExtractedText);
+    }
+
+    [Fact]
+    public void FindWithZeroOffsetInMultiByteString()
+    {
+        // An offset of zero must search from the very start of the haystack
+        const string haystack = "Hello 🌍 world";
+        var greeting = new Regex("Hello");
+
+        var atStart = greeting.Find(haystack, 0);
+
+        Assert.True(atStart.Matched);
+        Assert.Equal(0, atStart.Start);
+        Assert.Equal(5, atStart.End);
+        Assert.Equal("Hello", atStart.ExtractedText);
+    }
+
+    [Fact]
+    public void FindAllWithMultiByteCharacters()
+    {
+        // FindAll is called without an offset here; both words around the emoji should
+        // still be reported with the extents asserted below
+        const string haystack = "Hello 🌍 world";
+        var words = new Regex(@"\w+");
+        List<Match> found = [.. words.FindAll(haystack)];
+
+        Assert.Collection(found,
+            first =>
+            {
+                Assert.Equal("Hello", first.ExtractedText);
+                Assert.Equal(0, first.Start);
+                Assert.Equal(5, first.End);
+            },
+            second =>
+            {
+                Assert.Equal("world", second.ExtractedText);
+                Assert.Equal(11, second.Start);
+                Assert.Equal(16, second.End);
+            });
+    }
+
+    [Fact]
+    public void FindWithCharacterOffsetInAsianCharacters()
+    {
+        // Each CJK character below is one char in the string but three bytes in UTF-8:
+        //   char indices: 你(0) 好(1) 世(2) 界(3)
+        //   byte indices: 你(0-2) 好(3-5) 世(6-8) 界(9-11)
+        const string haystack = "你好世界"; // Chinese for "Hello world"
+        const int worldCharIndex = 2;
+        var world = new Regex("世界");
+
+        var found = world.Find(haystack, worldCharIndex);
+
+        // The offset went in as a char index; the match extent comes back in bytes
+        Assert.True(found.Matched);
+        Assert.Equal("世界", found.ExtractedText);
+        Assert.Equal(6, found.Start);
+        Assert.Equal(12, found.End);
+    }
+
     public static IEnumerable<object[]> IsMatchData()
     {
         yield return [".+", "hello world", true];