Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 10 additions & 8 deletions src/IronRe2/Regex.cs
Original file line number Diff line number Diff line change
Expand Up @@ -183,12 +183,13 @@ public Match Find(string haystack)
/// of the match
/// </summary>
/// <param name="haystack">The string to search for the pattern</param>
/// <param name="offset">The offset to start the search at</param>
/// <param name="offset">The character offset into <paramref name="haystack"/> to start the search at, counted in UTF-16 code units (a string index), not UTF-8 bytes</param>
/// <returns>The match data for the match</returns>
public Match Find(string haystack, int offset)
{
    // The byte-based overload searches the UTF-8 encoding of the haystack.
    var hayBytes = Encoding.UTF8.GetBytes(haystack);

    // `offset` is an index into the string, while the byte-based overload expects
    // an index into `hayBytes`. The two only coincide for single-byte (ASCII)
    // prefixes, so translate it by measuring the UTF-8 length of the text that
    // precedes `offset`. Passing `offset` through unchanged would start the
    // search too early whenever that prefix contains a multi-byte character.
    var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset));

    return Find(hayBytes, byteOffset);
}

/// <summary>
Expand All @@ -205,8 +206,8 @@ public Match Find(ReadOnlyMemory<byte> hayBytes)
/// Find the pattern starting at the given offset and return the extent
/// of the match
/// </summary>
/// <param name="hayBytes">The string to search for the pattern</param>
/// <param name="offset">The offset to start the search at</param>
/// <param name="hayBytes">The bytes to search for the pattern</param>
/// <param name="offset">The byte offset to start the search at</param>
/// <returns>The match data for the match</returns>
public Match Find(ReadOnlyMemory<byte> hayBytes, int offset)
{
Expand Down Expand Up @@ -283,12 +284,13 @@ public Captures Captures(string haystack)
/// </para>
/// </summary>
/// <param name="haystack">The string to search for the pattern</param>
/// <param name="offset">The offest to start searching from</param>
/// <param name="offset">The character offset into <paramref name="haystack"/> to start searching from, counted in UTF-16 code units (a string index), not UTF-8 bytes</param>
/// <returns>The captures data</returns>
public Captures Captures(string haystack, int offset)
{
    // The byte-based overload searches the UTF-8 encoding of the haystack.
    var hayBytes = Encoding.UTF8.GetBytes(haystack);

    // `offset` is an index into the string, while the byte-based overload expects
    // an index into `hayBytes`. Translate it by measuring the UTF-8 length of the
    // text that precedes `offset`; forwarding it unchanged would begin the search
    // too early whenever that prefix contains a multi-byte character.
    var byteOffset = Encoding.UTF8.GetByteCount(haystack.AsSpan(0, offset));

    return Captures(hayBytes, byteOffset);
}

/// <summary>
Expand Down Expand Up @@ -316,8 +318,8 @@ public Captures Captures(ReadOnlyMemory<byte> haystack)
/// of each of the regex's capturing groups.
/// </para>
/// </summary>
/// <param name="haystack">The string to search for the pattern</param>
/// <param name="offset">The offest to start searching from</param>
/// <param name="haystack">The bytes to search for the pattern</param>
/// <param name="offset">The byte offset to start searching from</param>
/// <returns>The captures data</returns>
public Captures Captures(ReadOnlyMemory<byte> haystack, int offset)
{
Expand Down
112 changes: 112 additions & 0 deletions test/IronRe2.Tests/RegexTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,118 @@ public void CapturesOutOfBounds()
Assert.Throws<IndexOutOfRangeException>(() => match[-2]);
}

[Fact]
public void FindWithCharacterOffsetInMultiByteString()
{
    // Verifies that the offset passed to Find(string, int) is interpreted as a character
    // (UTF-16 code unit) offset and not as a UTF-8 byte offset. The haystack contains
    // "world" twice so that the two interpretations select different occurrences; with a
    // single occurrence either interpretation would find the same match.
    Regex re = new("world");

    // 🌍 (U+1F30D) lies outside the BMP: it occupies two chars (a surrogate pair) in a
    // .NET string and four bytes in UTF-8.
    const string haystack = "Hello 🌍 world world";

    // Char positions: H(0) e(1) l(2) l(3) o(4) ' '(5) 🌍(6-7) ' '(8) w(9) o(10) r(11) l(12) d(13) ' '(14) w(15) o(16) r(17) l(18) d(19)
    // Byte positions: H(0) e(1) l(2) l(3) o(4) ' '(5) 🌍(6-9) ' '(10) w(11) o(12) r(13) l(14) d(15) ' '(16) w(17) o(18) r(19) l(20) d(21)

    // Char offset 10 is the 'o' of the first "world" (byte 12), so only the second
    // occurrence lies ahead of the start position. If 10 were taken as a byte offset it
    // would be the space before the first "world" and the match would start at byte 11.
    var match = re.Find(haystack, 10);

    Assert.True(match.Matched);
    // The second "world" spans bytes 17-22 (end exclusive)
    Assert.Equal(17, match.Start);
    Assert.Equal(22, match.End);
    Assert.Equal("world", match.ExtractedText);
}

[Fact]
public void FindWithCharacterOffsetBeforeMultiByteCharacter()
{
    const string haystack = "Hello 🌍 world";
    Regex emojiPattern = new("🌍");

    // String index 6 is where the emoji begins, so the search starts exactly on it.
    var found = emojiPattern.Find(haystack, 6);

    Assert.True(found.Matched);
    Assert.Equal("🌍", found.ExtractedText);
}

[Fact]
public void CapturesWithCharacterOffsetInMultiByteString()
{
    // Captures(string, int) must treat its offset as a character offset as well.
    Regex re = new(@"(\w+)");

    // 🌍 (U+1F30D) is a surrogate pair, i.e. two chars in a .NET string:
    // H(0) e(1) l(2) l(3) o(4) ' '(5) 🌍(6-7) ' '(8) w(9) o(10) r(11) l(12) d(13)
    const string haystack = "Hello 🌍 world";

    // Start searching at character offset 9, the 'w' of "world"
    var captures = re.Captures(haystack, 9);

    Assert.True(captures.Matched);
    // Group 0 is the whole match; group 1 is the single capturing group, which
    // wraps the entire pattern and therefore carries the same text.
    Assert.Equal("world", captures[0].ExtractedText);
    Assert.Equal("world", captures[1].ExtractedText);
}

[Fact]
public void FindWithZeroOffsetInMultiByteString()
{
    // An offset of zero must behave the same regardless of later multi-byte content.
    const string haystack = "Hello 🌍 world";
    Regex greeting = new("Hello");

    var found = greeting.Find(haystack, 0);

    Assert.True(found.Matched);
    Assert.Equal("Hello", found.ExtractedText);
    // "Hello" is pure ASCII at the very start: bytes 0-5 (end exclusive)
    Assert.Equal(0, found.Start);
    Assert.Equal(5, found.End);
}

[Fact]
public void FindAllWithMultiByteCharacters()
{
    // FindAll takes no offset; this checks the positions it reports on either side of a
    // multi-byte character.
    const string haystack = "Hello 🌍 world";
    Regex wordPattern = new(@"\w+");

    List<Match> found = [.. wordPattern.FindAll(haystack)];

    // Exactly two matches are expected, in order of appearance.
    Assert.Collection(found, ExpectHello, ExpectWorld);

    // "Hello" precedes the emoji: bytes 0-5
    static void ExpectHello(Match m)
    {
        Assert.Equal(0, m.Start);
        Assert.Equal(5, m.End);
        Assert.Equal("Hello", m.ExtractedText);
    }

    // "world" follows the 4-byte emoji and a space: bytes 11-16
    static void ExpectWorld(Match m)
    {
        Assert.Equal(11, m.Start);
        Assert.Equal(16, m.End);
        Assert.Equal("world", m.ExtractedText);
    }
}

[Fact]
public void FindWithCharacterOffsetInAsianCharacters()
{
    // Each of these CJK characters is a single char in the string but three bytes in UTF-8.
    //   chars: 你(0)   好(1)   世(2)   界(3)
    //   bytes: 你(0-2) 好(3-5) 世(6-8) 界(9-11)
    const string haystack = "你好世界"; // "Hello world" in Chinese
    Regex worldPattern = new("世界");

    // Begin at character offset 2, i.e. on 世
    var found = worldPattern.Find(haystack, 2);

    Assert.True(found.Matched);
    Assert.Equal("世界", found.ExtractedText);
    // Start and End are reported as byte offsets (end exclusive)
    Assert.Equal(6, found.Start);
    Assert.Equal(12, found.End);
}

public static IEnumerable<object[]> IsMatchData()
{
yield return [".+", "hello world", true];
Expand Down