diff --git a/src/IronRe2/Regex.cs b/src/IronRe2/Regex.cs
index 651d217..1733ddc 100644
--- a/src/IronRe2/Regex.cs
+++ b/src/IronRe2/Regex.cs
@@ -183,12 +183,19 @@ public Match Find(string haystack)
/// of the match
///
/// The string to search for the pattern
- /// The offset to start the search at
+ /// The index into the string (UTF-16 code units, not UTF-8 bytes) to start the search at
/// The match data for the match
public Match Find(string haystack, int offset)
{
var hayBytes = Encoding.UTF8.GetBytes(haystack);
- return Find(hayBytes, offset);
+ // An offset between the two halves of a surrogate pair is not a character boundary: the
+ // lone high surrogate cannot be encoded on its own, so the computed byte offset would fall
+ // inside the pair's 4-byte UTF-8 sequence. Step over the low surrogate instead.
+ var splitsPair = offset > 0 && offset < haystack.Length
+     && char.IsSurrogatePair(haystack[offset - 1], haystack[offset]);
+ // Translate the string index into the equivalent position in the UTF-8 buffer.
+ var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, splitsPair ? offset + 1 : offset));
+ return Find(hayBytes, byteOffset);
}
///
@@ -205,8 +206,8 @@ public Match Find(ReadOnlyMemory hayBytes)
/// Find the pattern starting at the given offset and return the extent
/// of the match
///
- /// The string to search for the pattern
- /// The offset to start the search at
+ /// The bytes to search for the pattern
+ /// The byte offset to start the search at
/// The match data for the match
public Match Find(ReadOnlyMemory hayBytes, int offset)
{
@@ -283,12 +284,19 @@ public Captures Captures(string haystack)
///
///
/// The string to search for the pattern
- /// The offest to start searching from
+ /// The index into the string (UTF-16 code units, not UTF-8 bytes) to start searching from
/// The captures data
public Captures Captures(string haystack, int offset)
{
var hayBytes = Encoding.UTF8.GetBytes(haystack);
- return Captures(hayBytes, offset);
+ // An offset between the two halves of a surrogate pair is not a character boundary: the
+ // lone high surrogate cannot be encoded on its own, so the computed byte offset would fall
+ // inside the pair's 4-byte UTF-8 sequence. Step over the low surrogate instead.
+ var splitsPair = offset > 0 && offset < haystack.Length
+     && char.IsSurrogatePair(haystack[offset - 1], haystack[offset]);
+ // Translate the string index into the equivalent position in the UTF-8 buffer.
+ var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, splitsPair ? offset + 1 : offset));
+ return Captures(hayBytes, byteOffset);
}
///
@@ -316,8 +318,8 @@ public Captures Captures(ReadOnlyMemory haystack)
/// of each of the regex's capturing groups.
///
///
- /// The string to search for the pattern
- /// The offest to start searching from
+ /// The bytes to search for the pattern
+ /// The byte offset to start searching from
/// The captures data
public Captures Captures(ReadOnlyMemory haystack, int offset)
{
diff --git a/test/IronRe2.Tests/RegexTests.cs b/test/IronRe2.Tests/RegexTests.cs
index a34bd47..e7769f9 100644
--- a/test/IronRe2.Tests/RegexTests.cs
+++ b/test/IronRe2.Tests/RegexTests.cs
@@ -389,6 +389,118 @@ public void CapturesOutOfBounds()
Assert.Throws(() => match[-2]);
}
+ [Fact]
+ public void FindWithCharacterOffsetInMultiByteString()
+ {
+     // Find(string, int) takes its offset as a string index (UTF-16 code units); this checks
+     // that it is not used as a UTF-8 byte offset when multi-byte characters precede it.
+     Regex re = new("world");
+
+     // The emoji is a single code point stored as two UTF-16 chars and four UTF-8 bytes.
+     // The second "world" makes the outcome depend on where the search really starts.
+     const string haystack = "Hello 😀 world world";
+
+     // String index: H(0) e(1) l(2) l(3) o(4) ' '(5) 😀(6-7) ' '(8) w(9) o(10) ... d(13) ' '(14) w(15) ... d(19)
+     // Byte index:   H(0) e(1) l(2) l(3) o(4) ' '(5) 😀(6-9) ' '(10) w(11) o(12) ... d(15) ' '(16) w(17) ... d(21)
+
+     // String index 10 is just past the first 'w' (byte 12), so only the second "world" can match.
+     var match = re.Find(haystack, 10);
+
+     Assert.True(match.Matched);
+     // Reading 10 as a byte offset would instead find the first "world" at bytes 11-16.
+     Assert.Equal(17, match.Start);
+     Assert.Equal(22, match.End);
+     Assert.Equal("world", match.ExtractedText);
+ }
+
+ [Fact]
+ public void FindWithCharacterOffsetBeforeMultiByteCharacter()
+ {
+     Regex re = new("😀");
+     const string haystack = "Hello 😀 world";
+
+     // Only ASCII precedes string index 6 (the start of the emoji), so it is also byte offset 6.
+     var match = re.Find(haystack, 6);
+
+     Assert.True(match.Matched);
+     Assert.Equal("😀", match.ExtractedText);
+ }
+
+ [Fact]
+ public void CapturesWithCharacterOffsetInMultiByteString()
+ {
+     // Captures(string, int) must also treat its offset as a string index, not a byte offset.
+     Regex re = new(@"(\w+)");
+
+     // The emoji occupies string indices 6-7 but bytes 6-9, shifting everything after it by two.
+     const string haystack = "Hello 😀 world world";
+
+     // String index 14 is the space after the first "world"; byte 14 would be its 'l' and give "ld".
+     var captures = re.Captures(haystack, 14);
+
+     Assert.True(captures.Matched);
+     Assert.Equal("world", captures[0].ExtractedText);
+     Assert.Equal("world", captures[1].ExtractedText);
+ }
+
+ [Fact]
+ public void FindWithZeroOffsetInMultiByteString()
+ {
+     // Offset 0 is the same position as a string index and as a byte offset.
+     Regex re = new("Hello");
+     const string haystack = "Hello 😀 world";
+
+     var match = re.Find(haystack, 0);
+
+     Assert.True(match.Matched);
+     Assert.Equal("Hello", match.ExtractedText);
+     Assert.Equal(0, match.Start);
+     Assert.Equal(5, match.End);
+ }
+
+ [Fact]
+ public void FindAllWithMultiByteCharacters()
+ {
+     // FindAll takes no offset, but the positions it reports are still UTF-8 byte offsets.
+     Regex re = new(@"\w+");
+     const string haystack = "Hello 😀 world";
+
+     List<Match> matches = [.. re.FindAll(haystack)];
+
+     Assert.Collection(matches,
+         m =>
+         {
+             Assert.Equal(0, m.Start);
+             Assert.Equal(5, m.End);
+             Assert.Equal("Hello", m.ExtractedText);
+         },
+         m =>
+         {
+             Assert.Equal(11, m.Start);
+             Assert.Equal(16, m.End);
+             Assert.Equal("world", m.ExtractedText);
+         });
+ }
+
+ [Fact]
+ public void FindWithCharacterOffsetInAsianCharacters()
+ {
+     // Each of these CJK characters is one UTF-16 char but three UTF-8 bytes.
+     Regex re = new("世界");
+     const string haystack = "你好世界,你好世界"; // "Hello world, hello world" in Chinese
+
+     // String index: 你(0) 好(1) 世(2) 界(3) ,(4) 你(5) 好(6) 世(7) 界(8)
+     // Byte index:   你(0-2) 好(3-5) 世(6-8) 界(9-11) ,(12) 你(13-15) 好(16-18) 世(19-21) 界(22-24)
+
+     // String index 3 is byte 9, past the start of the first 世界; byte offset 3 would still find it.
+     var match = re.Find(haystack, 3);
+
+     Assert.True(match.Matched);
+     Assert.Equal(19, match.Start); // Byte offset
+     Assert.Equal(25, match.End); // Byte offset
+     Assert.Equal("世界", match.ExtractedText);
+ }
+
public static IEnumerable