#!/bin/bash
#
# Script/test_utf8_bug.sh
#
# UTF-8 Bug Verification Script
# Tests Claude Code with Korean and other multi-byte UTF-8 text.
#
# The script creates a timestamped scratch directory under the current
# directory, fills it with fixture files containing multi-byte text, prints
# basic size information for each one, runs a plain `grep` sanity check when
# the `claude` binary is on PATH, and finally prints manual test instructions.
# It never deletes anything; the cleanup command is only printed.

# Abort on the first failing command (commands used as `if` conditions are exempt).
set -e

echo "=== Claude Code UTF-8 Bug Test Script ==="
echo ""

# Create test directory (timestamp suffix keeps repeated runs apart)
TEST_DIR="./utf8_test_$(date +%s)"
mkdir -p "$TEST_DIR"
cd "$TEST_DIR"

echo "Created test directory: $TEST_DIR"
echo ""

# Test Case 1: Create file with Korean text
echo "Test 1: Creating file with Korean text (original bug scenario)"
cat > korean_test.txt << 'EOF'
뉴스레터를 공유해주세요
안녕하세요 Claude Code
한글 테스트입니다
EOF

echo "✓ Created korean_test.txt"
cat korean_test.txt
echo ""

# Test Case 2: Create HTML file with Korean content
# NOTE(review): the markup below is a reconstruction. The reviewed copy of this
# heredoc contained only the two Korean text lines surrounded by blank lines
# (the tags appear to have been stripped), which would not produce an HTML
# file at all. Confirm the skeleton against the intended fixture.
echo "Test 2: Creating HTML file with Korean content"
cat > korean_newsletter.html << 'EOF'
<!DOCTYPE html>
<html lang="ko">
<head>
<meta charset="UTF-8">
</head>
<body>
<h1>안녕하세요! Claude Code 사용자 여러분</h1>
<p>이 페이지는 UTF-8 인코딩 테스트용입니다.</p>
</body>
</html>
EOF

echo "✓ Created korean_newsletter.html"
echo ""

# Test Case 3: Create mixed language file
echo "Test 3: Creating mixed language file"
cat > mixed_languages.txt << 'EOF'
English: Hello World
Korean: 안녕하세요
Japanese: こんにちは世界
Chinese: 你好世界
Emoji: Hello 👋 World 🌍
EOF

echo "✓ Created mixed_languages.txt"
cat mixed_languages.txt
echo ""

# Test Case 4: Create file with various CJK characters
echo "Test 4: Creating file with various CJK characters"
cat > cjk_test.txt << 'EOF'
Korean Characters:
가나다라마바사아자차카타파하
ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎ

Japanese Characters:
あいうえお
かきくけこ
アイウエオ
カキクケコ

Chinese Characters:
一二三四五六七八九十
甲乙丙丁戊己庚辛壬癸
EOF

echo "✓ Created cjk_test.txt"
echo ""

# Test Case 5: Create long Korean text
echo "Test 5: Creating file with long Korean text"
cat > long_korean.txt << 'EOF'
대한민국의 수도는 서울특별시입니다. 서울은 한강을 중심으로 발달한 도시로, 약 천만 명의 인구가 거주하고 있습니다.
서울의 역사는 매우 깊어 조선시대부터 수도의 역할을 해왔으며, 현재는 정치, 경제, 문화의 중심지입니다.
대표적인 관광지로는 경복궁, 남산타워, 명동, 강남 등이 있습니다.
한국의 전통 음식으로는 김치, 불고기, 비빔밥, 삼겹살 등이 유명합니다.
K-POP과 한국 드라마는 전 세계적으로 큰 인기를 얻고 있으며, 한류 문화의 중심지 역할을 하고 있습니다.
EOF

echo "✓ Created long_korean.txt"
echo ""

# Test Case 6: Create file with exact bug scenario
echo "Test 6: Creating file with exact bug scenario (33 byte boundary)"
# The string "뉴스레터를 공유해주세요" is 34 bytes of UTF-8 (11 three-byte
# syllables plus one space), so byte index 33 lands inside the final '요'.
# The heredoc adds a trailing newline, so `wc -c` below reports one byte more.
cat > exact_bug.txt << 'EOF'
뉴스레터를 공유해주세요
EOF

echo "✓ Created exact_bug.txt"
echo "Byte length check:"
wc -c exact_bug.txt
echo ""

# Verification function
#
# Prints a short report for one fixture file: whether it exists, whether
# `file` labels it as UTF-8, and its byte, character and line counts.
#
# Arguments:
#   $1 - path of the file to inspect
# Always returns success; problems are reported as text, not exit codes.
verify_file() {
    local file="$1"
    # Declared local so the counters do not leak into the global scope.
    local byte_count char_count line_count

    echo "Verifying: $file"

    # Guard clause: nothing else can be checked for a missing file.
    if [ ! -f "$file" ]; then
        echo "  ✗ File not found"
        echo ""
        return 0
    fi
    echo "  ✓ File exists"

    # Verify UTF-8 encoding as reported by the `file` utility
    if file "$file" | grep -q "UTF-8"; then
        echo "  ✓ UTF-8 encoding verified"
    else
        echo "  ⚠ Warning: File may not be UTF-8"
    fi

    # Check byte, character and line counts.
    # `wc -m` counts characters according to the current locale, so the
    # character figure only differs from the byte figure under a UTF-8 locale.
    byte_count=$(wc -c < "$file")
    char_count=$(wc -m < "$file")
    line_count=$(wc -l < "$file")
    echo "  - Bytes: $byte_count"
    echo "  - Chars: $char_count"
    echo "  - Lines: $line_count"
    echo ""
}

# Verify all test files
echo "=== Verifying Test Files ==="
verify_file "korean_test.txt"
verify_file "korean_newsletter.html"
verify_file "mixed_languages.txt"
verify_file "cjk_test.txt"
verify_file "long_korean.txt"
verify_file "exact_bug.txt"

# Test with Claude Code (if available)
echo "=== Testing with Claude Code ==="
if command -v claude &> /dev/null; then
    echo "Claude Code is installed. Testing search functionality..."

    # This is only a plain `grep` sanity check over the fixtures; the script
    # does not invoke `claude` itself. Reproducing the original panic requires
    # the manual steps printed further down.
    echo ""
    echo "Test: Searching for Korean text in files"
    echo "Command: grep -r '뉴스레터' ."
    echo ""

    # Used as an `if` condition so a non-match does not trip `set -e`.
    if grep -r '뉴스레터' . 2>&1; then
        echo "✓ grep command succeeded"
    else
        echo "✗ grep command failed (may indicate UTF-8 handling issue)"
    fi

    echo ""
    echo "To test with Claude Code, run:"
    echo "  cd $TEST_DIR"
    echo "  claude"
    echo "  Then try searching for: 뉴스레터를 공유해주세요"
else
    echo "⚠ Claude Code not found in PATH"
    echo "Install Claude Code to run full tests"
fi

echo ""
echo "=== Manual Test Instructions ==="
cat << 'INSTRUCTIONS'

To manually verify the bug fix:

1. Navigate to the test directory:
   cd [test directory path shown above]

2. Start Claude Code:
   claude

3. Try these commands in Claude Code:
   - "Search for 뉴스레터 in this directory"
   - "Show me the contents of korean_test.txt"
   - "Find all Korean text in these files"
   - "Read exact_bug.txt"

4. Expected behavior:
   ✓ Should NOT crash with "byte index is not a char boundary" error
   ✓ Should correctly display Korean text
   ✓ Should handle search results with multi-byte characters

5. If you encounter the panic:
   - Note the exact error message
   - Check which file/operation triggered it
   - Report using: /bug command in Claude Code

INSTRUCTIONS

echo ""
echo "=== Cleanup ==="
echo "To remove test files, run:"
echo "  cd .. && rm -rf $TEST_DIR"
echo ""

echo "Test setup complete!"
// examples/utf8_safe_slicing.rs
//
// Safe UTF-8 String Slicing Examples for Claude Code
// This file demonstrates proper UTF-8 string handling in Rust

/// Returns the longest prefix of `s` that is at most `max_bytes` bytes long
/// and ends on a UTF-8 character boundary.
///
/// # Arguments
/// * `s` - The string to slice
/// * `max_bytes` - Maximum byte length (adjusted down to a valid char boundary)
///
/// # Returns
/// A borrowed prefix of `s`. When `max_bytes >= s.len()` the whole string is
/// returned; when `max_bytes` falls inside a multi-byte character, the prefix
/// stops before that character (possibly yielding the empty string).
///
/// This never panics, unlike `&s[..max_bytes]` on a non-boundary index.
pub fn safe_slice(s: &str, max_bytes: usize) -> &str {
    if max_bytes >= s.len() {
        return s;
    }

    // Walk backwards from `max_bytes` to the first index that starts a
    // character. Index 0 is always a boundary, so the search cannot fail;
    // `unwrap_or(0)` merely avoids a panic path.
    let boundary = (0..=max_bytes)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0);

    &s[..boundary]
}

/// Safely slice a string at a byte boundary (stable API version).
///
/// Behaves exactly like [`safe_slice`] and is kept as a separate name for
/// callers that already use it. The computation is a hand-rolled
/// "floor to char boundary", so it delegates instead of duplicating the loop.
pub fn safe_slice_modern(s: &str, max_bytes: usize) -> &str {
    safe_slice(s, max_bytes)
}

/// Truncate a string to a maximum number of characters (not bytes).
///
/// # Arguments
/// * `s` - The string to truncate
/// * `max_chars` - Maximum number of Unicode scalar values (`char`s) to keep
///
/// # Returns
/// A new `String` containing at most `max_chars` characters from the start
/// of `s`.
pub fn truncate_chars(s: &str, max_chars: usize) -> String {
    s.chars().take(max_chars).collect()
}

/// Safely get a substring by character indices.
///
/// # Arguments
/// * `s` - The source string
/// * `start_char` - Starting character index (inclusive)
/// * `end_char` - Ending character index (exclusive)
///
/// # Returns
/// A new `String` containing the specified character range. Indices past the
/// end of the string are clamped by the iterator, and a range whose end is not
/// greater than its start yields an empty string (the length is computed with
/// a saturating subtraction, so it cannot underflow).
pub fn substring_by_chars(s: &str, start_char: usize, end_char: usize) -> String {
    let wanted = end_char.saturating_sub(start_char);
    s.chars().skip(start_char).take(wanted).collect()
}

/// Check if a string can be safely sliced at a given byte index.
///
/// Returns `true` when `index` is 0, equals `s.len()`, or is the first byte of
/// a character. `str::is_char_boundary` already reports `false` for indices
/// beyond the end of the string, so no separate length check is needed.
pub fn can_slice_at(s: &str, index: usize) -> bool {
    s.is_char_boundary(index)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_korean_text_original_bug() {
        // This is the exact string from the bug report (34 bytes of UTF-8).
        let text = "뉴스레터를 공유해주세요";

        // This would panic: &text[0..33]
        // Instead, use safe slicing:
        let result = safe_slice(text, 33);

        // Byte 33 is inside the final '요' (bytes 31..34), so the slice must
        // stop right before it.
        assert!(!result.is_empty());
        assert_eq!(result, "뉴스레터를 공유해주세");
        assert_eq!(result.len(), 31);
        println!("Safe slice result: {}", result);
    }

    #[test]
    fn test_safe_slice_korean() {
        let text = "안녕하세요";

        // Each Korean char will occupy 3 bytes
        assert_eq!(safe_slice(text, 3), "안");
        assert_eq!(safe_slice(text, 4), "안"); // Adjusts down to byte 3
        assert_eq!(safe_slice(text, 6), "안녕");
        assert_eq!(safe_slice(text, 100), "안녕하세요");
    }

    #[test]
    fn test_safe_slice_modern_matches_safe_slice() {
        let text = "Hello 안녕 👋";

        // Both entry points must agree for every cut position, including
        // positions past the end of the string.
        for cut in 0..=text.len() + 2 {
            assert_eq!(safe_slice_modern(text, cut), safe_slice(text, cut));
        }
    }

    #[test]
    fn test_truncate_chars_korean() {
        let text = "뉴스레터를 공유해주세요";

        assert_eq!(truncate_chars(text, 5), "뉴스레터를");
        assert_eq!(truncate_chars(text, 7), "뉴스레터를 공"); // Space counts as char
        assert_eq!(truncate_chars(text, 8), "뉴스레터를 공유");
        assert_eq!(truncate_chars(text, 12), "뉴스레터를 공유해주세요");
        assert_eq!(truncate_chars(text, 100), "뉴스레터를 공유해주세요");
    }

    #[test]
    fn test_mixed_ascii_korean() {
        let text = "Hello 안녕하세요 World";

        // Character-based truncation
        assert_eq!(truncate_chars(text, 6), "Hello ");
        assert_eq!(truncate_chars(text, 10), "Hello 안녕하세");
        assert_eq!(truncate_chars(text, 16), "Hello 안녕하세요 Worl");

        // Byte-based safe slicing
        assert_eq!(safe_slice(text, 6), "Hello ");
        // "Hello " is 6 bytes, "안" is 3 more = 9 bytes total
        assert_eq!(safe_slice(text, 9), "Hello 안");
    }

    #[test]
    fn test_japanese_text() {
        let text = "こんにちは世界";

        assert_eq!(truncate_chars(text, 5), "こんにちは");
        assert_eq!(truncate_chars(text, 7), "こんにちは世界");

        // Each Japanese char is 3 bytes
        assert_eq!(safe_slice(text, 15), "こんにちは");
    }

    #[test]
    fn test_chinese_text() {
        let text = "你好世界";

        assert_eq!(truncate_chars(text, 2), "你好");
        assert_eq!(truncate_chars(text, 4), "你好世界");

        // Each Chinese char is 3 bytes
        assert_eq!(safe_slice(text, 6), "你好");
    }

    #[test]
    fn test_emoji() {
        let text = "Hello 👋 World 🌍";

        // Emoji can be 4 bytes: '👋' occupies bytes 6..10, so a cut at byte 9
        // must fall back to byte 6.
        let result = safe_slice(text, 9);
        assert_eq!(result, "Hello ");
        println!("Emoji slice: {}", result);

        let char_result = truncate_chars(text, 7);
        assert_eq!(char_result, "Hello 👋");
    }

    #[test]
    fn test_substring_by_chars() {
        let text = "뉴스레터를 공유해주세요";

        assert_eq!(substring_by_chars(text, 0, 5), "뉴스레터를");
        assert_eq!(substring_by_chars(text, 5, 7), " 공");
        assert_eq!(substring_by_chars(text, 7, 11), "유해주세");

        // Reversed or out-of-range requests degrade to an empty string.
        assert_eq!(substring_by_chars(text, 7, 5), "");
        assert_eq!(substring_by_chars(text, 50, 60), "");
    }

    #[test]
    fn test_can_slice_at() {
        let text = "안녕";

        assert!(can_slice_at(text, 0));
        assert!(can_slice_at(text, 3)); // After first char
        assert!(!can_slice_at(text, 1)); // Middle of first char
        assert!(!can_slice_at(text, 2)); // Middle of first char
        assert!(can_slice_at(text, 6)); // End of string
        assert!(!can_slice_at(text, 7)); // Past the end
    }

    #[test]
    fn test_edge_cases() {
        // Empty string
        assert_eq!(safe_slice("", 10), "");
        assert_eq!(truncate_chars("", 10), "");

        // Single char
        let text = "안";
        assert_eq!(safe_slice(text, 1), "");
        assert_eq!(safe_slice(text, 3), "안");
        assert_eq!(truncate_chars(text, 0), "");
        assert_eq!(truncate_chars(text, 1), "안");

        // ASCII
        let ascii = "Hello";
        assert_eq!(safe_slice(ascii, 3), "Hel");
        assert_eq!(truncate_chars(ascii, 3), "Hel");
    }

    #[test]
    fn test_html_with_korean() {
        // Similar to the bug report.
        // NOTE(review): the reviewed copy of this literal had no markup and was
        // split by a raw line break (tags apparently stripped); the `<p>`
        // wrapper is a reconstruction - confirm against the intended input.
        // The assertions below hold for any input longer than 33 bytes.
        let html = "<p>뉴스레터를 공유해주세요</p>";

        // Should not panic
        let result = safe_slice(html, 33);
        assert!(!result.is_empty());
        assert!(result.len() <= 33);
        assert!(html.starts_with(result));

        let char_result = truncate_chars(html, 15);
        assert_eq!(char_result.chars().count(), 15);
        println!("HTML truncated: {}", char_result);
    }

    #[test]
    fn test_performance_large_string() {
        let korean_text = "안녕하세요".repeat(1000);

        // Should handle large strings efficiently
        let result = truncate_chars(&korean_text, 100);
        assert_eq!(result.chars().count(), 100);

        let byte_result = safe_slice(&korean_text, 300);
        assert!(byte_result.len() <= 300);
    }
}

fn main() {
    // Demonstration of the bug and fix
    println!("=== UTF-8 Safe Slicing Demo ===\n");

    let text = "뉴스레터를 공유해주세요";
    println!("Original text: {}", text);
    println!("Byte length: {}", text.len());
    println!("Char count: {}", text.chars().count());

    println!("\n--- Attempting to slice at byte 33 (would panic) ---");
    println!("Safe slice at byte 33: {}", safe_slice(text, 33));

    println!("\n--- Character-based truncation ---");
    println!("First 11 chars: {}", truncate_chars(text, 11));
    println!("First 5 chars: {}", truncate_chars(text, 5));

    println!("\n--- Mixed content example ---");
    let mixed = "Hello 안녕하세요 World";
    println!("Mixed text: {}", mixed);
    println!("First 10 chars: {}", truncate_chars(mixed, 10));
    println!("Safe slice at 20 bytes: {}", safe_slice(mixed, 20));

    println!("\n--- Multiple languages ---");
    let multilang = "English 한국어 日本語 中文 🌍";
    println!("Multilang: {}", multilang);
    println!("First 15 chars: {}", truncate_chars(multilang, 15));
}