From dd1884c775594f9e4fe90d2994cb2af103170849 Mon Sep 17 00:00:00 2001
From: "anthropic-code-agent[bot]" <242468646+Claude@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:01:32 +0000
Subject: [PATCH 1/2] Initial plan
From d193ab9f8e826b00b13e3acfcd79aa08e4d57ee9 Mon Sep 17 00:00:00 2001
From: "anthropic-code-agent[bot]" <242468646+Claude@users.noreply.github.com>
Date: Wed, 1 Apr 2026 10:07:59 +0000
Subject: [PATCH 2/2] Fix byte offset issue: convert character offsets to byte
offsets
- Modified Find(string, int) to convert character offset to byte offset using Encoding.UTF8.GetByteCount
- Modified Captures(string, int) to convert character offset to byte offset
- Updated XML documentation to clarify that the string overloads take offsets as string indices (UTF-16 chars, so a character outside the BMP counts as two), while the byte overloads take UTF-8 byte offsets
- Added comprehensive tests for multi-byte UTF-8 characters (emojis and Asian characters)
- All 71 tests passing (6 new tests added)
Agent-Logs-Url: https://github.com/crispthinking/IronRe2/sessions/2d26941d-972d-4c97-a3ec-ad34e99c1ff3
Co-authored-by: corpo-iwillspeak <265613520+corpo-iwillspeak@users.noreply.github.com>
---
src/IronRe2/Regex.cs | 18 ++---
test/IronRe2.Tests/RegexTests.cs | 112 +++++++++++++++++++++++++++++++
2 files changed, 122 insertions(+), 8 deletions(-)
diff --git a/src/IronRe2/Regex.cs b/src/IronRe2/Regex.cs
index 651d217..1733ddc 100644
--- a/src/IronRe2/Regex.cs
+++ b/src/IronRe2/Regex.cs
@@ -183,12 +183,13 @@ public Match Find(string haystack)
/// of the match
///
/// The string to search for the pattern
- /// The offset to start the search at
+ /// <param name="offset">The offset to start the search at, as an index into <paramref name="haystack"/> (UTF-16 chars), not a UTF-8 byte offset</param>
/// The match data for the match
public Match Find(string haystack, int offset)
{
var hayBytes = Encoding.UTF8.GetBytes(haystack);
- return Find(hayBytes, offset);
+ var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset));
+ return Find(hayBytes, byteOffset);
}
///
@@ -205,8 +206,8 @@ public Match Find(ReadOnlyMemory hayBytes)
/// Find the pattern starting at the given offset and return the extent
/// of the match
///
- /// The string to search for the pattern
- /// The offset to start the search at
+ /// The bytes to search for the pattern
+ /// The byte offset to start the search at
/// The match data for the match
public Match Find(ReadOnlyMemory hayBytes, int offset)
{
@@ -283,12 +284,13 @@ public Captures Captures(string haystack)
///
///
/// The string to search for the pattern
- /// The offest to start searching from
+ /// <param name="offset">The offset to start searching from, as an index into <paramref name="haystack"/> (UTF-16 chars), not a UTF-8 byte offset</param>
/// The captures data
public Captures Captures(string haystack, int offset)
{
var hayBytes = Encoding.UTF8.GetBytes(haystack);
- return Captures(hayBytes, offset);
+ var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset));
+ return Captures(hayBytes, byteOffset);
}
///
@@ -316,8 +318,8 @@ public Captures Captures(ReadOnlyMemory haystack)
/// of each of the regex's capturing groups.
///
///
- /// The string to search for the pattern
- /// The offest to start searching from
+ /// The bytes to search for the pattern
+ /// The byte offset to start searching from
/// The captures data
public Captures Captures(ReadOnlyMemory haystack, int offset)
{
diff --git a/test/IronRe2.Tests/RegexTests.cs b/test/IronRe2.Tests/RegexTests.cs
index a34bd47..e7769f9 100644
--- a/test/IronRe2.Tests/RegexTests.cs
+++ b/test/IronRe2.Tests/RegexTests.cs
@@ -389,6 +389,118 @@ public void CapturesOutOfBounds()
Assert.Throws(() => match[-2]);
}
+    [Fact]
+    public void FindWithCharacterOffsetInMultiByteString()
+    {
+        // Verifies that Find(string, int) treats the offset as a string index (UTF-16 chars)
+        // and converts it to a UTF-8 byte offset, rather than using it as a byte offset directly
+        Regex re = new("world");
+
+        // U+1F30D is written as an escape so the literal survives re-encoding of this file;
+        // it occupies 4 bytes in UTF-8 and 2 chars (a surrogate pair) in a .NET string
+        const string haystack = "Hello \U0001F30D world world";
+
+        // String index: Hello(0-4) ' '(5) U+1F30D(6-7) ' '(8) world(9-13) ' '(14) world(15-19)
+        // Byte index:   Hello(0-4) ' '(5) U+1F30D(6-9) ' '(10) world(11-15) ' '(16) world(17-21)
+
+        // Index 10 is the 'o' of the first "world" (byte 12); as a byte offset, 10 would still precede it
+        var match = re.Find(haystack, 10);
+
+        Assert.True(match.Matched);
+        // Only the second "world" starts at or after byte 12: bytes 17-22
+        Assert.Equal(17, match.Start);
+        Assert.Equal(22, match.End);
+        Assert.Equal("world", match.ExtractedText);
+    }
+
+    [Fact]
+    public void FindWithCharacterOffsetBeforeMultiByteCharacter()
+    {
+        Regex re = new("\U0001F30D");
+        const string haystack = "Hello \U0001F30D world";
+
+        // String index 6 is where U+1F30D starts; everything before it is ASCII, so that is byte 6 too
+        var match = re.Find(haystack, 6);
+
+        Assert.True(match.Matched);
+        Assert.Equal("\U0001F30D", match.ExtractedText);
+    }
+
+    [Fact]
+    public void CapturesWithCharacterOffsetInMultiByteString()
+    {
+        // Captures(string, int) must convert the string index to a byte offset as well
+        Regex re = new(@"(\w+)");
+
+        // U+1F30D (escaped): 2 chars in the string, 4 bytes once encoded as UTF-8
+        const string haystack = "Hello \U0001F30D world";
+
+        // Index 10 is the 'o' in "world" (byte 12); as a byte offset, 10 would yield "world" instead
+        var captures = re.Captures(haystack, 10);
+
+        Assert.True(captures.Matched);
+        Assert.Equal("orld", captures[0].ExtractedText);
+        Assert.Equal("orld", captures[1].ExtractedText);
+    }
+
+    [Fact]
+    public void FindWithZeroOffsetInMultiByteString()
+    {
+        // An empty prefix encodes to zero bytes, so offset 0 must still start the search at byte 0
+        Regex re = new("Hello");
+        const string haystack = "Hello \U0001F30D world";
+
+        var match = re.Find(haystack, 0);
+
+        Assert.True(match.Matched);
+        Assert.Equal("Hello", match.ExtractedText);
+        Assert.Equal(0, match.Start);
+        Assert.Equal(5, match.End);
+    }
+
+    [Fact]
+    public void FindAllWithMultiByteCharacters()
+    {
+        // FindAll is called without an offset; the positions asserted below are UTF-8 byte offsets
+        Regex re = new(@"\w+");
+        const string haystack = "Hello \U0001F30D world";
+
+        List<Match> matches = [.. re.FindAll(haystack)];
+
+        Assert.Collection(matches,
+            m =>
+            {
+                Assert.Equal(0, m.Start);
+                Assert.Equal(5, m.End);
+                Assert.Equal("Hello", m.ExtractedText);
+            },
+            m =>
+            {
+                Assert.Equal(11, m.Start);
+                Assert.Equal(16, m.End);
+                Assert.Equal("world", m.ExtractedText);
+            });
+    }
+
+    [Fact]
+    public void FindWithCharacterOffsetInAsianCharacters()
+    {
+        // CJK characters: 3 bytes each in UTF-8, one char each in a .NET string
+        Regex re = new("\u4E16\u754C");
+        const string haystack = "\u4F60\u597D\u4E16\u754C\u4E16\u754C"; // "Hello world" in Chinese, with "world" repeated
+
+        // String index: U+4F60(0) U+597D(1) U+4E16(2) U+754C(3) U+4E16(4) U+754C(5)
+        // Byte index:   U+4F60(0-2) U+597D(3-5) U+4E16(6-8) U+754C(9-11) U+4E16(12-14) U+754C(15-17)
+
+        // Index 3 is byte 9, past the start of the first occurrence; as a byte offset, 3 would precede it
+        var match = re.Find(haystack, 3);
+
+        Assert.True(match.Matched);
+        Assert.Equal(12, match.Start); // Byte offset
+        Assert.Equal(18, match.End); // Byte offset
+        Assert.Equal("\u4E16\u754C", match.ExtractedText);
+    }
+
public static IEnumerable