diff --git a/src/IronRe2/Regex.cs b/src/IronRe2/Regex.cs index 651d217..1733ddc 100644 --- a/src/IronRe2/Regex.cs +++ b/src/IronRe2/Regex.cs @@ -183,12 +183,13 @@ public Match Find(string haystack) /// of the match /// /// The string to search for the pattern - /// The offset to start the search at + /// The character (UTF-16 code unit) offset to start the search at (not a byte offset) /// The match data for the match public Match Find(string haystack, int offset) { var hayBytes = Encoding.UTF8.GetBytes(haystack); - return Find(hayBytes, offset); + var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset)); + return Find(hayBytes, byteOffset); } /// @@ -205,8 +206,8 @@ public Match Find(ReadOnlyMemory hayBytes) /// Find the pattern starting at the given offset and return the extent /// of the match /// - /// The string to search for the pattern - /// The offset to start the search at + /// The bytes to search for the pattern + /// The byte offset to start the search at /// The match data for the match public Match Find(ReadOnlyMemory hayBytes, int offset) { @@ -283,12 +284,13 @@ public Captures Captures(string haystack) /// /// /// The string to search for the pattern - /// The offest to start searching from + /// The character (UTF-16 code unit) offset to start searching from (not a byte offset) /// The captures data public Captures Captures(string haystack, int offset) { var hayBytes = Encoding.UTF8.GetBytes(haystack); - return Captures(hayBytes, offset); + var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset)); + return Captures(hayBytes, byteOffset); } /// @@ -316,8 +318,8 @@ public Captures Captures(ReadOnlyMemory haystack) /// of each of the regex's capturing groups. 
/// /// - /// The string to search for the pattern - /// The offest to start searching from + /// The bytes to search for the pattern + /// The byte offset to start searching from /// The captures data public Captures Captures(ReadOnlyMemory haystack, int offset) { diff --git a/test/IronRe2.Tests/RegexTests.cs b/test/IronRe2.Tests/RegexTests.cs index a34bd47..e7769f9 100644 --- a/test/IronRe2.Tests/RegexTests.cs +++ b/test/IronRe2.Tests/RegexTests.cs @@ -389,6 +389,118 @@ public void CapturesOutOfBounds() Assert.Throws(() => match[-2]); } + [Fact] + public void FindWithCharacterOffsetInMultiByteString() + { + // This test verifies that the offset parameter in Find(string, int) is interpreted as + // a character offset, not a byte offset, when dealing with multi-byte UTF-8 characters + Regex re = new("world"); + + // String with multi-byte UTF-8 characters (emoji) + // "Hello 🌍 world" where 🌍 is a 4-byte UTF-8 character + const string haystack = "Hello 🌍 world"; + + // Character positions (UTF-16 code units): H(0) e(1) l(2) l(3) o(4) (5) 🌍(6-7) (8) w(9) o(10) r(11) l(12) d(13) + // Byte positions: H(0) e(1) l(2) l(3) o(4) (5) 🌍(6-9) (10) w(11) o(12) r(13) l(14) d(15) + + // Find starting at character offset 8 (the space just before 'w'; the emoji is a surrogate pair at chars 6-7) + var match = re.Find(haystack, 8); + + Assert.True(match.Matched); + // The match should be at byte positions 11-16 + Assert.Equal(11, match.Start); + Assert.Equal(16, match.End); + Assert.Equal("world", match.ExtractedText); + } + + [Fact] + public void FindWithCharacterOffsetBeforeMultiByteCharacter() + { + Regex re = new("🌍"); + const string haystack = "Hello 🌍 world"; + + // Find starting at character offset 6 (the emoji itself) + var match = re.Find(haystack, 6); + + Assert.True(match.Matched); + Assert.Equal("🌍", match.ExtractedText); + } + + [Fact] + public void CapturesWithCharacterOffsetInMultiByteString() + { + // Test that Captures(string, int) also respects character offsets + Regex re = new(@"(\w+)"); + + // String with multi-byte character + const 
string haystack = "Hello 🌍 world"; + + // Start searching at character offset 8 (the space just before 'w'; the emoji is a surrogate pair at chars 6-7) + var captures = re.Captures(haystack, 8); + + Assert.True(captures.Matched); + Assert.Equal("world", captures[0].ExtractedText); + Assert.Equal("world", captures[1].ExtractedText); + } + + [Fact] + public void FindWithZeroOffsetInMultiByteString() + { + // Verify that offset 0 still works correctly + Regex re = new("Hello"); + const string haystack = "Hello 🌍 world"; + + var match = re.Find(haystack, 0); + + Assert.True(match.Matched); + Assert.Equal("Hello", match.ExtractedText); + Assert.Equal(0, match.Start); + Assert.Equal(5, match.End); + } + + [Fact] + public void FindAllWithMultiByteCharacters() + { + // Verify that FindAll still works correctly with multi-byte characters + Regex re = new(@"\w+"); + const string haystack = "Hello 🌍 world"; + + List matches = [.. re.FindAll(haystack)]; + + Assert.Collection(matches, + m => + { + Assert.Equal(0, m.Start); + Assert.Equal(5, m.End); + Assert.Equal("Hello", m.ExtractedText); + }, + m => + { + Assert.Equal(11, m.Start); + Assert.Equal(16, m.End); + Assert.Equal("world", m.ExtractedText); + }); + } + + [Fact] + public void FindWithCharacterOffsetInAsianCharacters() + { + // Test with Asian multi-byte characters (3-byte UTF-8) + Regex re = new("世界"); + const string haystack = "你好世界"; // "Hello world" in Chinese + + // Character positions: 你(0) 好(1) 世(2) 界(3) + // Byte positions: 你(0-2) 好(3-5) 世(6-8) 界(9-11) + + // Find starting at character offset 2 + var match = re.Find(haystack, 2); + + Assert.True(match.Matched); + Assert.Equal(6, match.Start); // Byte offset + Assert.Equal(12, match.End); // Byte offset + Assert.Equal("世界", match.ExtractedText); + } + public static IEnumerable IsMatchData() { yield return [".+", "hello world", true];