# ===========================================================================
# src/string_utils.py
# ===========================================================================
"""String manipulation utilities for common text transformations.

This module provides functions for converting between different text cases
commonly used in programming and documentation.

Note:
    Tokenization for the case converters is ASCII-only: every character
    outside ``[a-zA-Z0-9]`` (including accented letters such as ``é`` or
    ``ü``) is treated as a word delimiter and dropped from the result.
"""

import re
from typing import List


# Precompile regex patterns for better performance
# "XMLHttp" -> "XML" + "Http": a run of capitals followed by a capitalized word.
_CONSECUTIVE_CAPS = re.compile(r'([A-Z]+)([A-Z][a-z])')
# "fooBar" / "foo1Bar": a lowercase letter or digit followed by a capital.
_CAMEL_BOUNDARY = re.compile(r'([a-z0-9])([A-Z])')
# Anything that is not an ASCII letter or digit acts as a delimiter.
_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]+')
_WHITESPACE = re.compile(r'\s+')


def _tokenize_string(text: str) -> List[str]:
    """Tokenize a string into words based on case transitions and delimiters.

    This shared tokenizer ensures consistent behavior across all case
    conversion functions. Digits stay attached to the preceding letters
    (``"test123Case"`` -> ``['test123', 'Case']``).

    Args:
        text: Input string to tokenize

    Returns:
        List of word tokens with their original letter case; empty list when
        the input contains no ASCII letters or digits

    Examples:
        >>> _tokenize_string("HelloWorld")
        ['Hello', 'World']
        >>> _tokenize_string("XMLHttpRequest")
        ['XML', 'Http', 'Request']
        >>> _tokenize_string("snake_case_example")
        ['snake', 'case', 'example']
    """
    if not text:
        return []

    # First, replace non-alphanumeric with spaces
    text = _NON_ALNUM.sub(' ', text)

    # Handle consecutive capitals (e.g., XMLHttp -> XML Http). This must run
    # before the camel-boundary pass so acronyms are split off as one token.
    text = _CONSECUTIVE_CAPS.sub(r'\1 \2', text)

    # Insert space before capitals preceded by lowercase/digit
    text = _CAMEL_BOUNDARY.sub(r'\1 \2', text)

    # str.split() with no argument already discards empty strings, so no
    # extra filtering is needed.
    return text.split()


def to_snake_case(text: str) -> str:
    """Convert a string to snake_case.

    Args:
        text: Input string to convert

    Returns:
        String converted to snake_case ("" when there are no tokens)

    Examples:
        >>> to_snake_case("HelloWorld")
        'hello_world'
        >>> to_snake_case("XMLHttpRequest")
        'xml_http_request'
        >>> to_snake_case("IOError")
        'io_error'
        >>> to_snake_case("convert-to-snake")
        'convert_to_snake'
        >>> to_snake_case("multiple   spaces")
        'multiple_spaces'
        >>> to_snake_case("__already__snake__")
        'already_snake'
    """
    # Joining an empty token list yields "", so no explicit guard is needed.
    return '_'.join(token.lower() for token in _tokenize_string(text))


def to_camel_case(text: str) -> str:
    """Convert a string to camelCase.

    Args:
        text: Input string to convert

    Returns:
        String converted to camelCase ("" when there are no tokens)

    Examples:
        >>> to_camel_case("hello_world")
        'helloWorld'
        >>> to_camel_case("some-variable-name")
        'someVariableName'
        >>> to_camel_case("Convert to camel")
        'convertToCamel'
        >>> to_camel_case("HelloWorld")
        'helloWorld'
        >>> to_camel_case("XMLHttpRequest")
        'xmlHttpRequest'
        >>> to_camel_case("__multiple__delimiters__")
        'multipleDelimiters'
    """
    tokens = _tokenize_string(text)
    if not tokens:
        # Guard required here: tokens[0] below would raise on an empty list.
        return ""

    # First token lowercase, rest capitalized
    first, *rest = tokens
    return first.lower() + ''.join(token.capitalize() for token in rest)


def to_pascal_case(text: str) -> str:
    """Convert a string to PascalCase.

    Acronyms are normalized like any other word (``"XML"`` -> ``"Xml"``).

    Args:
        text: Input string to convert

    Returns:
        String converted to PascalCase ("" when there are no tokens)

    Examples:
        >>> to_pascal_case("hello_world")
        'HelloWorld'
        >>> to_pascal_case("some-variable-name")
        'SomeVariableName'
        >>> to_pascal_case("convert to pascal")
        'ConvertToPascal'
        >>> to_pascal_case("helloWorld")
        'HelloWorld'
        >>> to_pascal_case("XMLHttpRequest")
        'XmlHttpRequest'
        >>> to_pascal_case("__multiple__delimiters__")
        'MultipleDelimiters'
    """
    return ''.join(token.capitalize() for token in _tokenize_string(text))


def to_kebab_case(text: str) -> str:
    """Convert a string to kebab-case.

    Args:
        text: Input string to convert

    Returns:
        String converted to kebab-case ("" when there are no tokens)

    Examples:
        >>> to_kebab_case("HelloWorld")
        'hello-world'
        >>> to_kebab_case("XMLHttpRequest")
        'xml-http-request'
        >>> to_kebab_case("some_variable_name")
        'some-variable-name'
        >>> to_kebab_case("multiple   spaces")
        'multiple-spaces'
        >>> to_kebab_case("--already--kebab--")
        'already-kebab'
    """
    return '-'.join(token.lower() for token in _tokenize_string(text))


def truncate_text(text: str, max_length: int, suffix: str = "...") -> str:
    """Truncate text to a maximum length with optional suffix.

    If the text needs truncation and max_length is too small to accommodate
    the suffix meaningfully, returns an empty string.

    Args:
        text: Text to truncate
        max_length: Maximum length including suffix (must be non-negative)
        suffix: String to append when truncating (default: "...")

    Returns:
        Truncated text with suffix if needed, empty string if max_length
        is too small for meaningful truncation

    Raises:
        ValueError: If max_length is negative

    Examples:
        >>> truncate_text("This is a long text", 10)
        'This is...'
        >>> truncate_text("Short", 10)
        'Short'
        >>> truncate_text("Exactly ten", 11)
        'Exactly ten'
        >>> truncate_text("Tiny", 3)
        ''
        >>> truncate_text("Too long", -1)
        Traceback (most recent call last):
            ...
        ValueError: max_length must be non-negative, got -1
    """
    if max_length < 0:
        raise ValueError(f"max_length must be non-negative, got {max_length}")

    if len(text) <= max_length:
        return text

    # If max_length is too small for meaningful truncation, return empty
    if max_length < len(suffix) + 1:  # Need at least 1 char + suffix
        return ""

    return text[:max_length - len(suffix)] + suffix


def word_count(text: str) -> int:
    """Count the number of words in a text.

    Words are defined as sequences of characters separated by whitespace.

    Args:
        text: Text to count words in

    Returns:
        Number of words (0 for empty or whitespace-only text)

    Examples:
        >>> word_count("Hello world")
        2
        >>> word_count("  Multiple   spaces  ")
        2
        >>> word_count("")
        0
        >>> word_count("One-word")  # Hyphenated counts as one
        1
        >>> word_count("Line\\nbreak\\tand\\ttabs")
        4
    """
    # "".split() is [], so the empty string needs no special case.
    return len(text.split())


def is_mixed_case(text: str) -> bool:
    """Check if a string contains both uppercase and lowercase letters.

    Args:
        text: String to check

    Returns:
        True if string has both cases, False otherwise

    Examples:
        >>> is_mixed_case("HelloWorld")
        True
        >>> is_mixed_case("ALLCAPS")
        False
        >>> is_mixed_case("lowercase")
        False
        >>> is_mixed_case("noLetters123")
        True
        >>> is_mixed_case("12345")
        False
    """
    has_upper = any(c.isupper() for c in text)
    has_lower = any(c.islower() for c in text)
    return has_upper and has_lower


def remove_extra_whitespace(text: str) -> str:
    """Remove extra whitespace from a string.

    Collapses every run of whitespace (spaces, tabs, newlines) into a single
    space and trims leading/trailing whitespace.

    Args:
        text: Text to clean

    Returns:
        Text with normalized whitespace

    Examples:
        >>> remove_extra_whitespace("  Hello    world  ")
        'Hello world'
        >>> remove_extra_whitespace("Multiple\\n\\nlines")
        'Multiple lines'
        >>> remove_extra_whitespace("\\t\\tTabs\\t\\t")
        'Tabs'
    """
    if not text:
        return ""

    # Replace all whitespace sequences with single space
    text = _WHITESPACE.sub(' ', text)
    # Strip leading/trailing whitespace
    return text.strip()


# ===========================================================================
# tests/test_string_utils.py
# ===========================================================================
# Unit tests for string utilities module.
#
# NOTE: the functions under test are defined above in this file. When this
# section lives in its own module, import them (including the private
# ``_tokenize_string`` helper, which is tested directly) from
# ``src.string_utils``.

import timeit

import pytest


def _fastest_run(func, *args, repeat=5):
    """Return the best wall-clock time in seconds of *repeat* single calls.

    Taking the minimum discards scheduler and GC noise, so the timing
    assertions below do not flake on a busy machine.
    """
    return min(timeit.repeat(lambda: func(*args), number=1, repeat=repeat))


class TestCaseConversions:
    """Test suite for case conversion functions."""

    @pytest.mark.parametrize(
        "input_text,expected",
        [
            ("HelloWorld", "hello_world"),
            ("someVariableName", "some_variable_name"),
            ("convert-to-snake", "convert_to_snake"),
            ("already_snake_case", "already_snake_case"),
            ("Mixed-Style_Example", "mixed_style_example"),
            ("XMLHttpRequest", "xml_http_request"),
            ("IOError", "io_error"),
            ("", ""),
            ("a", "a"),
            ("ABC", "abc"),
            ("123Numbers", "123_numbers"),
            ("with spaces here", "with_spaces_here"),
            ("__multiple__underscores__", "multiple_underscores"),
            ("helloWorld", "hello_world"),  # Handles camelCase input
            ("CamelCASEMixed", "camel_case_mixed"),
        ],
    )
    def test_to_snake_case(self, input_text, expected):
        """Test conversion to snake_case."""
        assert to_snake_case(input_text) == expected

    @pytest.mark.parametrize(
        "input_text,expected",
        [
            ("hello_world", "helloWorld"),
            ("some-variable-name", "someVariableName"),
            ("Convert to camel", "convertToCamel"),
            ("already_camelCase", "alreadyCamelCase"),  # Preserves existing case transitions
            ("mixed-Style_Example", "mixedStyleExample"),
            ("", ""),
            ("a", "a"),
            ("first", "first"),
            ("UPPERCASE", "uppercase"),
            ("123_numbers", "123Numbers"),
            ("__multiple__delimiters__", "multipleDelimiters"),
            ("!!!only!!!special!!!", "onlySpecial"),
            ("HelloWorld", "helloWorld"),  # Handles PascalCase input
            ("XMLHttpRequest", "xmlHttpRequest"),  # Consistent tokenization
        ],
    )
    def test_to_camel_case(self, input_text, expected):
        """Test conversion to camelCase."""
        assert to_camel_case(input_text) == expected

    @pytest.mark.parametrize(
        "input_text,expected",
        [
            ("hello_world", "HelloWorld"),
            ("some-variable-name", "SomeVariableName"),
            ("convert to pascal", "ConvertToPascal"),
            ("already_PascalCase", "AlreadyPascalCase"),  # Preserves case transitions
            ("mixed-Style_Example", "MixedStyleExample"),
            ("", ""),
            ("a", "A"),
            ("first", "First"),
            ("UPPERCASE", "Uppercase"),
            ("123_numbers", "123Numbers"),
            ("__multiple__delimiters__", "MultipleDelimiters"),
            ("helloWorld", "HelloWorld"),  # Handles camelCase input
            ("XMLHttpRequest", "XmlHttpRequest"),  # Consistent tokenization
        ],
    )
    def test_to_pascal_case(self, input_text, expected):
        """Test conversion to PascalCase."""
        assert to_pascal_case(input_text) == expected

    @pytest.mark.parametrize(
        "input_text,expected",
        [
            ("HelloWorld", "hello-world"),
            ("some_variable_name", "some-variable-name"),
            ("Convert To Kebab", "convert-to-kebab"),
            ("already-kebab-case", "already-kebab-case"),
            ("Mixed_Style Example", "mixed-style-example"),
            ("XMLHttpRequest", "xml-http-request"),
            ("", ""),
            ("a", "a"),
            ("ABC", "abc"),
            ("123Numbers", "123-numbers"),
            ("--multiple--hyphens--", "multiple-hyphens"),
            ("HTTPSConnection", "https-connection"),
        ],
    )
    def test_to_kebab_case(self, input_text, expected):
        """Test conversion to kebab-case."""
        assert to_kebab_case(input_text) == expected


class TestTextUtilities:
    """Test suite for text utility functions."""

    @pytest.mark.parametrize(
        "text,max_length,suffix,expected",
        [
            ("This is a long text", 10, "...", "This is..."),
            ("Short", 10, "...", "Short"),
            ("Exactly ten", 11, "...", "Exactly ten"),
            ("Truncate this text", 15, "...", "Truncate thi..."),
            ("Custom suffix", 10, "→", "Custom su→"),
            ("No suffix needed", 20, "...", "No suffix needed"),
            ("", 5, "...", ""),
            ("Very long text that needs truncation", 20, "...", "Very long text th..."),
            ("Edge", 4, "...", "Edge"),  # Text same as max_length
            ("Tiny", 3, "...", ""),  # Too small for meaningful truncation
            ("Test", 2, "...", ""),  # Too small
            ("Test", 4, "!!!", "Test"),  # Exact length, no truncation
        ],
    )
    def test_truncate_text(self, text, max_length, suffix, expected):
        """Test text truncation with various parameters."""
        assert truncate_text(text, max_length, suffix) == expected

    def test_truncate_text_default_suffix(self):
        """Test truncate_text with default suffix."""
        assert truncate_text("This is a long text", 10) == "This is..."
        assert truncate_text("Short", 10) == "Short"

    @pytest.mark.parametrize(
        "text,expected",
        [
            ("Hello world", 2),
            ("  Multiple   spaces  ", 2),
            ("", 0),
            ("One", 1),
            ("This is a test sentence.", 5),
            ("   ", 0),
            ("Word", 1),
            ("Multiple\nlines\nwith\nwords", 4),
            ("\tTabs\tand\tspaces\t", 3),
            ("Line\nbreak\tand\ttabs", 4),
            ("123 456 789", 3),
            ("one-word", 1),  # Hyphenated as single word
            ("email@example.com", 1),  # Email as single word
        ],
    )
    def test_word_count(self, text, expected):
        """Test word counting in various texts."""
        assert word_count(text) == expected


class TestAdditionalUtilities:
    """Test suite for additional utility functions."""

    @pytest.mark.parametrize(
        "text,expected",
        [
            ("HelloWorld", True),
            ("ALLCAPS", False),
            ("lowercase", False),
            ("MixedCase", True),
            ("noLetters123", True),  # 'L' is uppercase, the rest lowercase
            ("", False),
            ("123", False),
            ("123ABC", False),  # No lowercase
            ("123abc", False),  # No uppercase
            ("aB", True),
            ("!@#$%", False),  # No letters
        ],
    )
    def test_is_mixed_case(self, text, expected):
        """Test mixed case detection."""
        assert is_mixed_case(text) == expected

    @pytest.mark.parametrize(
        "text,expected",
        [
            ("  Hello    world  ", "Hello world"),
            ("Multiple\n\nlines", "Multiple lines"),
            ("\t\tTabs\t\t", "Tabs"),
            ("Normal text", "Normal text"),
            ("", ""),
            ("   ", ""),
            ("One  Two   Three", "One Two Three"),
            ("Line\nbreak\tand\ttab", "Line break and tab"),
            ("  \n\t  Mixed  \n  whitespace  \t  ", "Mixed whitespace"),
        ],
    )
    def test_remove_extra_whitespace(self, text, expected):
        """Test whitespace normalization."""
        assert remove_extra_whitespace(text) == expected


class TestTokenizer:
    """Test suite for the shared tokenizer."""

    def test_tokenizer_basic(self):
        """Test basic tokenization."""
        assert _tokenize_string("HelloWorld") == ["Hello", "World"]
        assert _tokenize_string("snake_case") == ["snake", "case"]
        assert _tokenize_string("kebab-case") == ["kebab", "case"]
        assert _tokenize_string("XMLHttpRequest") == ["XML", "Http", "Request"]
        assert _tokenize_string("IOError") == ["IO", "Error"]
        assert _tokenize_string("") == []

    def test_tokenizer_consistency(self):
        """Test that all conversion functions use consistent tokenization."""
        test_string = "XMLHttpRequest"

        # All should tokenize the same way
        assert to_snake_case(test_string) == "xml_http_request"
        assert to_camel_case(test_string) == "xmlHttpRequest"
        assert to_pascal_case(test_string) == "XmlHttpRequest"
        assert to_kebab_case(test_string) == "xml-http-request"


class TestEdgeCases:
    """Test suite for edge cases and special scenarios."""

    def test_empty_string_conversions(self):
        """Test all conversions with empty string."""
        assert to_snake_case("") == ""
        assert to_camel_case("") == ""
        assert to_pascal_case("") == ""
        assert to_kebab_case("") == ""

    def test_single_character_conversions(self):
        """Test all conversions with single character."""
        assert to_snake_case("A") == "a"
        assert to_camel_case("A") == "a"
        assert to_pascal_case("A") == "A"
        assert to_kebab_case("A") == "a"

    def test_numbers_in_conversions(self):
        """Test conversions with numbers."""
        assert to_snake_case("test123Case") == "test123_case"
        assert to_camel_case("test_123_case") == "test123Case"
        assert to_pascal_case("test_123_case") == "Test123Case"
        assert to_kebab_case("test123Case") == "test123-case"

    def test_special_characters_only(self):
        """Test conversions with only special characters."""
        assert to_snake_case("!!!@@@###") == ""
        assert to_camel_case("!!!@@@###") == ""
        assert to_pascal_case("!!!@@@###") == ""
        assert to_kebab_case("!!!@@@###") == ""
        assert to_snake_case("___") == ""
        assert to_kebab_case("---") == ""

    def test_consecutive_delimiters(self):
        """Test conversions with consecutive delimiters."""
        assert to_snake_case("hello___world") == "hello_world"
        assert to_camel_case("hello___world") == "helloWorld"
        assert to_pascal_case("hello---world") == "HelloWorld"
        assert to_kebab_case("hello   world") == "hello-world"
        assert to_snake_case("test____case____example") == "test_case_example"
        assert to_kebab_case("test----case----example") == "test-case-example"

    def test_truncate_edge_cases(self):
        """Test truncation edge cases."""
        # Suffix longer than max_length
        assert truncate_text("Test", 2, "...") == ""
        assert truncate_text("Test", 0, "...") == ""

        # Text exactly at max_length
        assert truncate_text("12345", 5, "...") == "12345"

        # Empty text
        assert truncate_text("", 10, "...") == ""

        # Very long suffix
        assert truncate_text("Short", 10, "VERYLONGSUFFIX") == "Short"
        assert truncate_text("This needs truncation", 10, "LONG") == "This nLONG"

        # Negative max_length should raise ValueError
        with pytest.raises(ValueError, match="max_length must be non-negative"):
            truncate_text("Test", -1)

    def test_unicode_handling(self):
        """Test handling of Unicode characters."""
        # Note: the tokenizer is ASCII-only, so accented characters act as
        # delimiters and are dropped. Pin the exact output so any future
        # Unicode support is a deliberate, visible change.
        assert to_snake_case("Café") == "caf"
        assert to_snake_case("München") == "m_nchen"
        assert word_count("Café au lait") == 3
        assert is_mixed_case("Café")


class TestPerformance:
    """Test suite for performance characteristics."""

    def test_performance_with_long_strings(self):
        """Test that functions handle long strings efficiently."""
        long_text = "CamelCase" * 100  # 900 characters

        # Should complete quickly (under 10ms)
        assert "camel_case" in to_snake_case(long_text)
        assert _fastest_run(to_snake_case, long_text) < 0.01

        # Test other conversions
        assert _fastest_run(to_kebab_case, long_text) < 0.01

    def test_performance_with_many_delimiters(self):
        """Test performance with many consecutive delimiters."""
        text_with_delimiters = "_" * 100 + "test" + "_" * 100 + "case"

        assert to_snake_case(text_with_delimiters) == "test_case"
        assert _fastest_run(to_snake_case, text_with_delimiters) < 0.01

    def test_regex_compilation_benefit(self):
        """Test that repeated conversions stay fast with precompiled regexes."""
        test_cases = ["CamelCase", "snake_case", "kebab-case"] * 100

        def convert_all():
            for text in test_cases:
                to_snake_case(text)
                to_kebab_case(text)

        # Should complete 600 conversions in under 100ms
        assert _fastest_run(convert_all, repeat=3) < 0.1