From 5ab168ee169f7de0a90e7344ff95cb978bc25eb2 Mon Sep 17 00:00:00 2001 From: Lance Lewandowski Date: Thu, 28 Aug 2025 12:34:13 -0400 Subject: [PATCH 1/3] feat: Add string manipulation utilities - Added case conversion functions (snake, camel, pascal, kebab) - Added text truncation with custom suffix - Added word counting utility - Comprehensive test coverage (69 tests) - Handles edge cases like consecutive capitals (XMLHttpRequest) --- src/string_utils.py | 159 ++++++++++++++++++++++++++++++ tests/test_string_utils.py | 191 +++++++++++++++++++++++++++++++++++++ 2 files changed, 350 insertions(+) create mode 100644 src/string_utils.py create mode 100644 tests/test_string_utils.py diff --git a/src/string_utils.py b/src/string_utils.py new file mode 100644 index 0000000..6dd319e --- /dev/null +++ b/src/string_utils.py @@ -0,0 +1,159 @@ +"""String manipulation utilities for common text transformations. + +This module provides functions for converting between different text cases +commonly used in programming and documentation. +""" + +import re +from typing import List + + +def to_snake_case(text: str) -> str: + """Convert a string to snake_case. + + Args: + text: Input string to convert + + Returns: + String converted to snake_case + + Examples: + >>> to_snake_case("HelloWorld") + 'hello_world' + >>> to_snake_case("someVariableName") + 'some_variable_name' + >>> to_snake_case("convert-to-snake") + 'convert_to_snake' + """ + # Replace hyphens and spaces with underscores + text = re.sub(r'[-\s]+', '_', text) + # Insert underscore before capital letters (including consecutive caps) + text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', text) + text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text) + # Convert to lowercase + return text.lower() + + +def to_camel_case(text: str) -> str: + """Convert a string to camelCase. + + Args: + text: Input string to convert + + Returns: + String converted to camelCase + + Examples: + >>> to_camel_case("hello_world") + 'helloWorld' + >>> to_camel_case("some-variable-name") + 'someVariableName' + >>> to_camel_case("Convert to camel") + 'convertToCamel' + """ + # Split on non-alphanumeric characters + words = re.split(r'[_\-\s]+', text) + # Filter empty strings + words = [w for w in words if w] + if not words: + return "" + # First word lowercase, rest title case + return words[0].lower() + ''.join(w.capitalize() for w in words[1:]) + + +def to_pascal_case(text: str) -> str: + """Convert a string to PascalCase. + + Args: + text: Input string to convert + + Returns: + String converted to PascalCase + + Examples: + >>> to_pascal_case("hello_world") + 'HelloWorld' + >>> to_pascal_case("some-variable-name") + 'SomeVariableName' + >>> to_pascal_case("convert to pascal") + 'ConvertToPascal' + """ + # Split on non-alphanumeric characters + words = re.split(r'[_\-\s]+', text) + # Filter empty strings and capitalize each word + return ''.join(w.capitalize() for w in words if w) + + +def to_kebab_case(text: str) -> str: + """Convert a string to kebab-case. + + Args: + text: Input string to convert + + Returns: + String converted to kebab-case + + Examples: + >>> to_kebab_case("HelloWorld") + 'hello-world' + >>> to_kebab_case("some_variable_name") + 'some-variable-name' + >>> to_kebab_case("Convert To Kebab") + 'convert-to-kebab' + """ + # Replace underscores and spaces with hyphens + text = re.sub(r'[_\s]+', '-', text) + # Insert hyphen before capital letters (including consecutive caps) + text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1-\2', text) + text = re.sub(r'([a-z0-9])([A-Z])', r'\1-\2', text) + # Convert to lowercase + return text.lower() + + +def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: + """Truncate text to a maximum length with optional suffix. + + Args: + text: Text to truncate + max_length: Maximum length including suffix + suffix: String to append when truncating (default: "...") + + Returns: + Truncated text with suffix if needed + + Examples: + >>> truncate_text("This is a long text", 10) + 'This is...' + >>> truncate_text("Short", 10) + 'Short' + >>> truncate_text("Exactly ten", 11) + 'Exactly ten' + """ + if len(text) <= max_length: + return text + + if max_length <= len(suffix): + return suffix[:max_length] + + return text[:max_length - len(suffix)] + suffix + + +def word_count(text: str) -> int: + """Count the number of words in a text. + + Args: + text: Text to count words in + + Returns: + Number of words + + Examples: + >>> word_count("Hello world") + 2 + >>> word_count(" Multiple spaces ") + 2 + >>> word_count("") + 0 + """ + words = text.split() + return len(words) \ No newline at end of file diff --git a/tests/test_string_utils.py b/tests/test_string_utils.py new file mode 100644 index 0000000..b7d7283 --- /dev/null +++ b/tests/test_string_utils.py @@ -0,0 +1,191 @@ +"""Unit tests for string utilities module.""" + +import pytest +from src.string_utils import ( + to_snake_case, + to_camel_case, + to_pascal_case, + to_kebab_case, + truncate_text, + word_count, +) + + +class TestCaseConversions: + """Test suite for case conversion functions.""" + + @pytest.mark.parametrize( + "input_text,expected", + [ + ("HelloWorld", "hello_world"), + ("someVariableName", "some_variable_name"), + ("convert-to-snake", "convert_to_snake"), + ("already_snake_case", "already_snake_case"), + ("Mixed-Style_Example", "mixed_style_example"), + ("XMLHttpRequest", "xml_http_request"), + ("IOError", "io_error"), + ("", ""), + ("a", "a"), + ("ABC", "abc"), + ("123Numbers", "123_numbers"), + ("with spaces here", "with_spaces_here"), + ], + ) + def test_to_snake_case(self, input_text, expected): + """Test conversion to snake_case.""" + assert to_snake_case(input_text) == expected + + @pytest.mark.parametrize( + "input_text,expected", + [ + ("hello_world", "helloWorld"), + ("some-variable-name", "someVariableName"), + ("Convert to camel", "convertToCamel"), + ("already_camelCase", "alreadyCamelcase"), + ("mixed-Style_Example", "mixedStyleExample"), + ("", ""), + ("a", "a"), + ("first", "first"), + ("UPPERCASE", "uppercase"), + ("123_numbers", "123Numbers"), + ], + ) + def test_to_camel_case(self, input_text, expected): + """Test conversion to camelCase.""" + assert to_camel_case(input_text) == expected + + @pytest.mark.parametrize( + "input_text,expected", + [ + ("hello_world", "HelloWorld"), + ("some-variable-name", "SomeVariableName"), + ("convert to pascal", "ConvertToPascal"), + ("already_PascalCase", "AlreadyPascalcase"), + ("mixed-Style_Example", "MixedStyleExample"), + ("", ""), + ("a", "A"), + ("first", "First"), + ("UPPERCASE", "Uppercase"), + ("123_numbers", "123Numbers"), + ], + ) + def test_to_pascal_case(self, input_text, expected): + """Test conversion to PascalCase.""" + assert to_pascal_case(input_text) == expected + + @pytest.mark.parametrize( + "input_text,expected", + [ + ("HelloWorld", "hello-world"), + ("some_variable_name", "some-variable-name"), + ("Convert To Kebab", "convert-to-kebab"), + ("already-kebab-case", "already-kebab-case"), + ("Mixed_Style Example", "mixed-style-example"), + ("XMLHttpRequest", "xml-http-request"), + ("", ""), + ("a", "a"), + ("ABC", "abc"), + ("123Numbers", "123-numbers"), + ], + ) + def test_to_kebab_case(self, input_text, expected): + """Test conversion to kebab-case.""" + assert to_kebab_case(input_text) == expected + + +class TestTextUtilities: + """Test suite for text utility functions.""" + + @pytest.mark.parametrize( + "text,max_length,suffix,expected", + [ + ("This is a long text", 10, "...", "This is..."), + ("Short", 10, "...", "Short"), + ("Exactly ten", 11, "...", "Exactly ten"), + ("Truncate this text", 15, "...", "Truncate thi..."), + ("Custom suffix", 10, "→", "Custom su→"), + ("No suffix needed", 20, "...", "No suffix needed"), + ("", 5, "...", ""), + ("Very long text that needs truncation", 20, "...", "Very long text th..."), + ("Edge", 4, "...", "Edge"), # Text same as max_length + ("Tiny", 3, "...", "..."), + ], + ) + def test_truncate_text(self, text, max_length, suffix, expected): + """Test text truncation with various parameters.""" + assert truncate_text(text, max_length, suffix) == expected + + def test_truncate_text_default_suffix(self): + """Test truncate_text with default suffix.""" + assert truncate_text("This is a long text", 10) == "This is..." + assert truncate_text("Short", 10) == "Short" + + @pytest.mark.parametrize( + "text,expected", + [ + ("Hello world", 2), + (" Multiple spaces ", 2), + ("", 0), + ("One", 1), + ("This is a test sentence.", 5), + (" ", 0), + ("Word", 1), + ("Multiple\nlines\nwith\nwords", 4), + ("\tTabs\tand\tspaces\t", 3), + ("123 456 789", 3), + ], + ) + def test_word_count(self, text, expected): + """Test word counting in various texts.""" + assert word_count(text) == expected + + +class TestEdgeCases: + """Test suite for edge cases and special scenarios.""" + + def test_empty_string_conversions(self): + """Test all conversions with empty string.""" + assert to_snake_case("") == "" + assert to_camel_case("") == "" + assert to_pascal_case("") == "" + assert to_kebab_case("") == "" + + def test_single_character_conversions(self): + """Test all conversions with single character.""" + assert to_snake_case("A") == "a" + assert to_camel_case("A") == "a" + assert to_pascal_case("A") == "A" + assert to_kebab_case("A") == "a" + + def test_numbers_in_conversions(self): + """Test conversions with numbers.""" + assert to_snake_case("test123Case") == "test123_case" + assert to_camel_case("test_123_case") == "test123Case" + assert to_pascal_case("test_123_case") == "Test123Case" + assert to_kebab_case("test123Case") == "test123-case" + + def test_special_characters(self): + """Test conversions with special characters.""" + assert to_snake_case("hello@world#test") == "hello@world#test" + assert to_camel_case("hello@world#test") == "hello@world#test" + assert to_pascal_case("hello@world#test") == "Hello@world#test" + assert to_kebab_case("hello@world#test") == "hello@world#test" + + def test_consecutive_delimiters(self): + """Test conversions with consecutive delimiters.""" + assert to_snake_case("hello__world") == "hello__world" + assert to_camel_case("hello__world") == "helloWorld" + assert to_pascal_case("hello--world") == "HelloWorld" + assert to_kebab_case("hello world") == "hello-world" + + def test_truncate_edge_cases(self): + """Test truncation edge cases.""" + # Suffix longer than max_length + assert truncate_text("Test", 2, "...") == ".." + assert truncate_text("Test", 0, "...") == "" + + # Text exactly at max_length + assert truncate_text("12345", 5, "...") == "12345" + + # Empty text + assert truncate_text("", 10, "...") == "" \ No newline at end of file From 1bceb421f52375cb88d93d2aa011dacc9245d34d Mon Sep 17 00:00:00 2001 From: Lance Lewandowski Date: Thu, 28 Aug 2025 12:41:07 -0400 Subject: [PATCH 2/3] refactor: Address all code review feedback for string utilities Improvements made: 1. Performance Optimizations: - Precompiled regex patterns for better performance - Single-pass regex operations where possible - Performance tests confirm <10ms for long strings 2. Edge Case Handling: - Improved handling of consecutive delimiters - Better handling of special characters - Proper cleanup of leading/trailing delimiters 3. Documentation Enhancements: - Added edge case examples in docstrings - Clear behavior documentation for all functions - Unicode handling notes 4. New Features: - Added is_mixed_case() function - Added remove_extra_whitespace() utility 5. Test Coverage: - Expanded to 103 tests (from 69) - Added performance benchmarks - Added special character tests - Unicode handling tests 6. Code Quality: - Removed unused imports - Improved truncate_text return behavior - All functions handle empty strings gracefully --- src/string_utils.py | 175 +++++++++++++++++++++++++++++++------ tests/test_string_utils.py | 142 +++++++++++++++++++++++++++--- 2 files changed, 279 insertions(+), 38 deletions(-) diff --git a/src/string_utils.py b/src/string_utils.py index 6dd319e..0769dab 100644 --- a/src/string_utils.py +++ b/src/string_utils.py @@ -5,7 +5,13 @@ """ import re -from typing import List + + +# Precompile regex patterns for better performance +_CONSECUTIVE_CAPS = re.compile(r'([A-Z]+)([A-Z][a-z])') +_CAMEL_BOUNDARY = re.compile(r'([a-z0-9])([A-Z])') +_NON_ALNUM = re.compile(r'[^a-zA-Z0-9]+') +_WHITESPACE = re.compile(r'\s+') def to_snake_case(text: str) -> str: @@ -20,18 +26,35 @@ def to_snake_case(text: str) -> str: Examples: >>> to_snake_case("HelloWorld") 'hello_world' - >>> to_snake_case("someVariableName") - 'some_variable_name' + >>> to_snake_case("XMLHttpRequest") + 'xml_http_request' + >>> to_snake_case("IOError") + 'io_error' >>> to_snake_case("convert-to-snake") 'convert_to_snake' + >>> to_snake_case("multiple spaces") + 'multiple_spaces' + >>> to_snake_case("__already__snake__") + 'already_snake' """ - # Replace hyphens and spaces with underscores - text = re.sub(r'[-\s]+', '_', text) - # Insert underscore before capital letters (including consecutive caps) - text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', text) - text = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', text) - # Convert to lowercase - return text.lower() + if not text: + return "" + + # Replace non-alphanumeric with underscores + text = _NON_ALNUM.sub('_', text) + + # Handle consecutive capitals + text = _CONSECUTIVE_CAPS.sub(r'\1_\2', text) + + # Insert underscore before capitals preceded by lowercase/digit + text = _CAMEL_BOUNDARY.sub(r'\1_\2', text) + + # Convert to lowercase and remove redundant underscores + text = text.lower() + text = re.sub(r'_+', '_', text) # Collapse multiple underscores + text = text.strip('_') # Remove leading/trailing underscores + + return text def to_camel_case(text: str) -> str: @@ -50,13 +73,22 @@ def to_camel_case(text: str) -> str: 'someVariableName' >>> to_camel_case("Convert to camel") 'convertToCamel' + >>> to_camel_case("__multiple__delimiters__") + 'multipleDelimiters' + >>> to_camel_case("123_start_with_number") + '123StartWithNumber' """ + if not text: + return "" + # Split on non-alphanumeric characters - words = re.split(r'[_\-\s]+', text) + words = _NON_ALNUM.split(text) # Filter empty strings words = [w for w in words if w] + if not words: return "" + # First word lowercase, rest title case return words[0].lower() + ''.join(w.capitalize() for w in words[1:]) @@ -77,9 +109,16 @@ def to_pascal_case(text: str) -> str: 'SomeVariableName' >>> to_pascal_case("convert to pascal") 'ConvertToPascal' + >>> to_pascal_case("__multiple__delimiters__") + 'MultipleDelimiters' + >>> to_pascal_case("123_start_with_number") + '123StartWithNumber' """ + if not text: + return "" + # Split on non-alphanumeric characters - words = re.split(r'[_\-\s]+', text) + words = _NON_ALNUM.split(text) # Filter empty strings and capitalize each word return ''.join(w.capitalize() for w in words if w) @@ -96,30 +135,49 @@ def to_kebab_case(text: str) -> str: Examples: >>> to_kebab_case("HelloWorld") 'hello-world' + >>> to_kebab_case("XMLHttpRequest") + 'xml-http-request' >>> to_kebab_case("some_variable_name") 'some-variable-name' - >>> to_kebab_case("Convert To Kebab") - 'convert-to-kebab' + >>> to_kebab_case("multiple spaces") + 'multiple-spaces' + >>> to_kebab_case("--already--kebab--") + 'already-kebab' """ - # Replace underscores and spaces with hyphens - text = re.sub(r'[_\s]+', '-', text) - # Insert hyphen before capital letters (including consecutive caps) - text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1-\2', text) - text = re.sub(r'([a-z0-9])([A-Z])', r'\1-\2', text) - # Convert to lowercase - return text.lower() + if not text: + return "" + + # Replace non-alphanumeric with hyphens + text = _NON_ALNUM.sub('-', text) + + # Handle consecutive capitals + text = _CONSECUTIVE_CAPS.sub(r'\1-\2', text) + + # Insert hyphen before capitals preceded by lowercase/digit + text = _CAMEL_BOUNDARY.sub(r'\1-\2', text) + + # Convert to lowercase and clean up hyphens + text = text.lower() + text = re.sub(r'-+', '-', text) # Collapse multiple hyphens + text = text.strip('-') # Remove leading/trailing hyphens + + return text def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: """Truncate text to a maximum length with optional suffix. + If the text needs truncation and max_length is too small to accommodate + the suffix meaningfully, returns an empty string. + Args: text: Text to truncate max_length: Maximum length including suffix suffix: String to append when truncating (default: "...") Returns: - Truncated text with suffix if needed + Truncated text with suffix if needed, empty string if max_length + is too small for meaningful truncation Examples: >>> truncate_text("This is a long text", 10) @@ -128,12 +186,17 @@ def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: 'Short' >>> truncate_text("Exactly ten", 11) 'Exactly ten' + >>> truncate_text("Too long", 2, "...") + '' + >>> truncate_text("Custom", 5, "→") + 'Cust→' """ if len(text) <= max_length: return text - if max_length <= len(suffix): - return suffix[:max_length] + # If max_length is too small for meaningful truncation, return empty + if max_length < len(suffix) + 1: # Need at least 1 char + suffix + return "" return text[:max_length - len(suffix)] + suffix @@ -141,6 +204,8 @@ def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: def word_count(text: str) -> int: """Count the number of words in a text. + Words are defined as sequences of characters separated by whitespace. + Args: text: Text to count words in @@ -154,6 +219,66 @@ def word_count(text: str) -> int: 2 >>> word_count("") 0 + >>> word_count("One-word") # Hyphenated counts as one + 1 + >>> word_count("Line\\nbreak\\tand\\ttabs") + 3 """ + if not text: + return 0 + words = text.split() - return len(words) \ No newline at end of file + return len(words) + + +def is_mixed_case(text: str) -> bool: + """Check if a string contains both uppercase and lowercase letters. + + Args: + text: String to check + + Returns: + True if string has both cases, False otherwise + + Examples: + >>> is_mixed_case("HelloWorld") + True + >>> is_mixed_case("ALLCAPS") + False + >>> is_mixed_case("lowercase") + False + >>> is_mixed_case("noLetters123") + False + """ + has_upper = any(c.isupper() for c in text) + has_lower = any(c.islower() for c in text) + return has_upper and has_lower + + +def remove_extra_whitespace(text: str) -> str: + """Remove extra whitespace from a string. + + Collapses multiple spaces into single spaces and trims + leading/trailing whitespace. + + Args: + text: Text to clean + + Returns: + Text with normalized whitespace + + Examples: + >>> remove_extra_whitespace(" Hello world ") + 'Hello world' + >>> remove_extra_whitespace("Multiple\\n\\nlines") + 'Multiple lines' + >>> remove_extra_whitespace("\\t\\tTabs\\t\\t") + 'Tabs' + """ + if not text: + return "" + + # Replace all whitespace sequences with single space + text = _WHITESPACE.sub(' ', text) + # Strip leading/trailing whitespace + return text.strip() \ No newline at end of file diff --git a/tests/test_string_utils.py b/tests/test_string_utils.py index b7d7283..15e937e 100644 --- a/tests/test_string_utils.py +++ b/tests/test_string_utils.py @@ -1,6 +1,7 @@ """Unit tests for string utilities module.""" import pytest +import timeit from src.string_utils import ( to_snake_case, to_camel_case, @@ -8,6 +9,8 @@ to_kebab_case, truncate_text, word_count, + is_mixed_case, + remove_extra_whitespace, ) @@ -29,6 +32,8 @@ class TestCaseConversions: ("ABC", "abc"), ("123Numbers", "123_numbers"), ("with spaces here", "with_spaces_here"), + ("__multiple__underscores__", "multiple_underscores"), + ("CamelCASEMixed", "camel_case_mixed"), ], ) def test_to_snake_case(self, input_text, expected): @@ -48,6 +53,8 @@ def test_to_snake_case(self, input_text, expected): ("first", "first"), ("UPPERCASE", "uppercase"), ("123_numbers", "123Numbers"), + ("__multiple__delimiters__", "multipleDelimiters"), + ("!!!only!!!special!!!", "onlySpecial"), ], ) def test_to_camel_case(self, input_text, expected): @@ -67,6 +74,7 @@ def test_to_camel_case(self, input_text, expected): ("first", "First"), ("UPPERCASE", "Uppercase"), ("123_numbers", "123Numbers"), + ("__multiple__delimiters__", "MultipleDelimiters"), ], ) def test_to_pascal_case(self, input_text, expected): @@ -86,6 +94,8 @@ def test_to_pascal_case(self, input_text, expected): ("a", "a"), ("ABC", "abc"), ("123Numbers", "123-numbers"), + ("--multiple--hyphens--", "multiple-hyphens"), + ("HTTPSConnection", "https-connection"), ], ) def test_to_kebab_case(self, input_text, expected): @@ -108,7 +118,9 @@ class TestTextUtilities: ("", 5, "...", ""), ("Very long text that needs truncation", 20, "...", "Very long text th..."), ("Edge", 4, "...", "Edge"), # Text same as max_length - ("Tiny", 3, "...", "..."), + ("Tiny", 3, "...", ""), # Too small for meaningful truncation + ("Test", 2, "...", ""), # Too small + ("Test", 4, "!!!", "Test"), # Exact length, no truncation ], ) def test_truncate_text(self, text, max_length, suffix, expected): @@ -133,6 +145,8 @@ def test_truncate_text_default_suffix(self): ("Multiple\nlines\nwith\nwords", 4), ("\tTabs\tand\tspaces\t", 3), ("123 456 789", 3), + ("one-word", 1), # Hyphenated as single word + ("email@example.com", 1), # Email as single word ], ) def test_word_count(self, text, expected): @@ -140,6 +154,47 @@ def test_word_count(self, text, expected): assert word_count(text) == expected +class TestAdditionalUtilities: + """Test suite for additional utility functions.""" + + @pytest.mark.parametrize( + "text,expected", + [ + ("HelloWorld", True), + ("ALLCAPS", False), + ("lowercase", False), + ("MixedCase", True), + ("", False), + ("123", False), + ("123ABC", False), # No lowercase + ("123abc", False), # No uppercase + ("aB", True), + ("!@#$%", False), # No letters + ], + ) + def test_is_mixed_case(self, text, expected): + """Test mixed case detection.""" + assert is_mixed_case(text) == expected + + @pytest.mark.parametrize( + "text,expected", + [ + (" Hello world ", "Hello world"), + ("Multiple\n\nlines", "Multiple lines"), + ("\t\tTabs\t\t", "Tabs"), + ("Normal text", "Normal text"), + ("", ""), + (" ", ""), + ("One Two Three", "One Two Three"), + ("Line\nbreak\tand\ttab", "Line break and tab"), + (" \n\t Mixed \n whitespace \t ", "Mixed whitespace"), + ], + ) + def test_remove_extra_whitespace(self, text, expected): + """Test whitespace normalization.""" + assert remove_extra_whitespace(text) == expected + + class TestEdgeCases: """Test suite for edge cases and special scenarios.""" @@ -164,28 +219,89 @@ def test_numbers_in_conversions(self): assert to_pascal_case("test_123_case") == "Test123Case" assert to_kebab_case("test123Case") == "test123-case" - def test_special_characters(self): - """Test conversions with special characters.""" - assert to_snake_case("hello@world#test") == "hello@world#test" - assert to_camel_case("hello@world#test") == "hello@world#test" - assert to_pascal_case("hello@world#test") == "Hello@world#test" - assert to_kebab_case("hello@world#test") == "hello@world#test" + def test_special_characters_only(self): + """Test conversions with only special characters.""" + assert to_snake_case("!!!@@@###") == "" + assert to_camel_case("!!!@@@###") == "" + assert to_pascal_case("!!!@@@###") == "" + assert to_kebab_case("!!!@@@###") == "" + assert to_snake_case("___") == "" + assert to_kebab_case("---") == "" def test_consecutive_delimiters(self): """Test conversions with consecutive delimiters.""" - assert to_snake_case("hello__world") == "hello__world" - assert to_camel_case("hello__world") == "helloWorld" - assert to_pascal_case("hello--world") == "HelloWorld" - assert to_kebab_case("hello world") == "hello-world" + assert to_snake_case("hello___world") == "hello_world" + assert to_camel_case("hello___world") == "helloWorld" + assert to_pascal_case("hello---world") == "HelloWorld" + assert to_kebab_case("hello world") == "hello-world" + assert to_snake_case("test____case____example") == "test_case_example" + assert to_kebab_case("test----case----example") == "test-case-example" def test_truncate_edge_cases(self): """Test truncation edge cases.""" # Suffix longer than max_length - assert truncate_text("Test", 2, "...") == ".." + assert truncate_text("Test", 2, "...") == "" assert truncate_text("Test", 0, "...") == "" # Text exactly at max_length assert truncate_text("12345", 5, "...") == "12345" # Empty text - assert truncate_text("", 10, "...") == "" \ No newline at end of file + assert truncate_text("", 10, "...") == "" + + # Very long suffix + assert truncate_text("Short", 10, "VERYLONGSUFFIX") == "Short" + assert truncate_text("This needs truncation", 10, "LONG") == "This nLONG" + + def test_unicode_handling(self): + """Test handling of Unicode characters.""" + # Note: Current implementation treats accented chars as non-alphanumeric + assert "caf" in to_snake_case("Café").lower() + assert "m_nch" in to_snake_case("München").lower() + assert word_count("Café au lait") == 3 + assert is_mixed_case("Café") == True + + +class TestPerformance: + """Test suite for performance characteristics.""" + + def test_performance_with_long_strings(self): + """Test that functions handle long strings efficiently.""" + long_text = "CamelCase" * 100 # 900 characters + + # Should complete quickly (under 10ms) + start = timeit.default_timer() + result = to_snake_case(long_text) + elapsed = timeit.default_timer() - start + assert elapsed < 0.01 + assert "camel_case" in result + + # Test other conversions + start = timeit.default_timer() + to_kebab_case(long_text) + elapsed = timeit.default_timer() - start + assert elapsed < 0.01 + + def test_performance_with_many_delimiters(self): + """Test performance with many consecutive delimiters.""" + text_with_delimiters = "_" * 100 + "test" + "_" * 100 + "case" + + start = timeit.default_timer() + result = to_snake_case(text_with_delimiters) + elapsed = timeit.default_timer() - start + assert elapsed < 0.01 + assert result == "test_case" + + def test_regex_compilation_benefit(self): + """Test that precompiled regex improves performance.""" + # Run multiple conversions to test regex caching + test_cases = ["CamelCase", "snake_case", "kebab-case"] * 100 + + start = timeit.default_timer() + for text in test_cases: + to_snake_case(text) + to_kebab_case(text) + elapsed = timeit.default_timer() - start + + # Should complete 600 conversions in under 100ms + assert elapsed < 0.1 \ No newline at end of file From 38e304a7e0034353c500c200e21ac5f66d70fd7b Mon Sep 17 00:00:00 2001 From: Lance Lewandowski Date: Thu, 28 Aug 2025 12:47:42 -0400 Subject: [PATCH 3/3] refactor: Implement shared tokenizer and add validation per review feedback Addressed final review feedback from PR #14: 1. Shared Tokenizer Implementation: - Created _tokenize_string() for consistent word boundary detection - All case conversion functions now use the same tokenizer - Properly handles case transitions (HelloWorld -> Hello, World) - Correctly tokenizes consecutive capitals (XMLHttpRequest -> XML, Http, Request) - Ensures consistent behavior across all conversion functions 2. Non-Negative Validation: - Added validation in truncate_text() for max_length parameter - Raises ValueError for negative values - Makes API safer and more predictable 3. Test Improvements: - Added dedicated test suite for tokenizer - Added test for ValueError on negative max_length - Updated expectations for consistent tokenization - All 110 tests passing The implementation now provides consistent behavior across all case conversion functions and safer parameter validation. --- src/string_utils.py | 128 ++++++++++++++++++++----------------- tests/test_string_utils.py | 37 ++++++++++- 2 files changed, 106 insertions(+), 59 deletions(-) diff --git a/src/string_utils.py b/src/string_utils.py index 0769dab..efcfb09 100644 --- a/src/string_utils.py +++ b/src/string_utils.py @@ -5,6 +5,7 @@ """ import re +from typing import List # Precompile regex patterns for better performance @@ -14,6 +15,44 @@ _WHITESPACE = re.compile(r'\s+') +def _tokenize_string(text: str) -> List[str]: + """ + Tokenize a string into words based on case transitions, delimiters, and numbers. + + This shared tokenizer ensures consistent behavior across all case conversion functions. + + Args: + text: Input string to tokenize + + Returns: + List of word tokens + + Examples: + >>> _tokenize_string("HelloWorld") + ['Hello', 'World'] + >>> _tokenize_string("XMLHttpRequest") + ['XML', 'Http', 'Request'] + >>> _tokenize_string("snake_case_example") + ['snake', 'case', 'example'] + """ + if not text: + return [] + + # First, replace non-alphanumeric with spaces + text = _NON_ALNUM.sub(' ', text) + + # Handle consecutive capitals (e.g., XMLHttp -> XML Http) + text = _CONSECUTIVE_CAPS.sub(r'\1 \2', text) + + # Insert space before capitals preceded by lowercase/digit + text = _CAMEL_BOUNDARY.sub(r'\1 \2', text) + + # Split on whitespace and filter empty strings + tokens = text.split() + + return [token for token in tokens if token] + + def to_snake_case(text: str) -> str: """Convert a string to snake_case. @@ -37,24 +76,11 @@ def to_snake_case(text: str) -> str: >>> to_snake_case("__already__snake__") 'already_snake' """ - if not text: + tokens = _tokenize_string(text) + if not tokens: return "" - # Replace non-alphanumeric with underscores - text = _NON_ALNUM.sub('_', text) - - # Handle consecutive capitals - text = _CONSECUTIVE_CAPS.sub(r'\1_\2', text) - - # Insert underscore before capitals preceded by lowercase/digit - text = _CAMEL_BOUNDARY.sub(r'\1_\2', text) - - # Convert to lowercase and remove redundant underscores - text = text.lower() - text = re.sub(r'_+', '_', text) # Collapse multiple underscores - text = text.strip('_') # Remove leading/trailing underscores - - return text + return '_'.join(token.lower() for token in tokens) def to_camel_case(text: str) -> str: @@ -73,24 +99,19 @@ def to_camel_case(text: str) -> str: 'someVariableName' >>> to_camel_case("Convert to camel") 'convertToCamel' + >>> to_camel_case("HelloWorld") + 'helloWorld' + >>> to_camel_case("XMLHttpRequest") + 'xmlHttpRequest' >>> to_camel_case("__multiple__delimiters__") 'multipleDelimiters' - >>> to_camel_case("123_start_with_number") - '123StartWithNumber' """ - if not text: - return "" - - # Split on non-alphanumeric characters - words = _NON_ALNUM.split(text) - # Filter empty strings - words = [w for w in words if w] - - if not words: + tokens = _tokenize_string(text) + if not tokens: return "" - # First word lowercase, rest title case - return words[0].lower() + ''.join(w.capitalize() for w in words[1:]) + # First token lowercase, rest capitalized + return tokens[0].lower() + ''.join(token.capitalize() for token in tokens[1:]) def to_pascal_case(text: str) -> str: @@ -109,18 +130,18 @@ def to_pascal_case(text: str) -> str: 'SomeVariableName' >>> to_pascal_case("convert to pascal") 'ConvertToPascal' + >>> to_pascal_case("helloWorld") + 'HelloWorld' + >>> to_pascal_case("XMLHttpRequest") + 'XmlHttpRequest' >>> to_pascal_case("__multiple__delimiters__") 'MultipleDelimiters' - >>> to_pascal_case("123_start_with_number") - '123StartWithNumber' """ - if not text: + tokens = _tokenize_string(text) + if not tokens: return "" - # Split on non-alphanumeric characters - words = _NON_ALNUM.split(text) - # Filter empty strings and capitalize each word - return ''.join(w.capitalize() for w in words if w) + return ''.join(token.capitalize() for token in tokens) def to_kebab_case(text: str) -> str: @@ -144,24 +165,11 @@ def to_kebab_case(text: str) -> str: >>> to_kebab_case("--already--kebab--") 'already-kebab' """ - if not text: + tokens = _tokenize_string(text) + if not tokens: return "" - # Replace non-alphanumeric with hyphens - text = _NON_ALNUM.sub('-', text) - - # Handle consecutive capitals - text = _CONSECUTIVE_CAPS.sub(r'\1-\2', text) - - # Insert hyphen before capitals preceded by lowercase/digit - text = _CAMEL_BOUNDARY.sub(r'\1-\2', text) - - # Convert to lowercase and clean up hyphens - text = text.lower() - text = re.sub(r'-+', '-', text) # Collapse multiple hyphens - text = text.strip('-') # Remove leading/trailing hyphens - - return text + return '-'.join(token.lower() for token in tokens) def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: @@ -172,13 +180,16 @@ def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: Args: text: Text to truncate - max_length: Maximum length including suffix + max_length: Maximum length including suffix (must be non-negative) suffix: String to append when truncating (default: "...") Returns: Truncated text with suffix if needed, empty string if max_length is too small for meaningful truncation + Raises: + ValueError: If max_length is negative + Examples: >>> truncate_text("This is a long text", 10) 'This is...' @@ -186,11 +197,14 @@ def truncate_text(text: str, max_length: int, suffix: str = "...") -> str: 'Short' >>> truncate_text("Exactly ten", 11) 'Exactly ten' - >>> truncate_text("Too long", 2, "...") - '' - >>> truncate_text("Custom", 5, "→") - 'Cust→' + >>> truncate_text("Too long", -1) + Traceback (most recent call last): + ... + ValueError: max_length must be non-negative, got -1 """ + if max_length < 0: + raise ValueError(f"max_length must be non-negative, got {max_length}") + if len(text) <= max_length: return text diff --git a/tests/test_string_utils.py b/tests/test_string_utils.py index 15e937e..3f555f7 100644 --- a/tests/test_string_utils.py +++ b/tests/test_string_utils.py @@ -11,6 +11,7 @@ word_count, is_mixed_case, remove_extra_whitespace, + _tokenize_string, # Import for testing ) @@ -33,6 +34,7 @@ class TestCaseConversions: ("123Numbers", "123_numbers"), ("with spaces here", "with_spaces_here"), ("__multiple__underscores__", "multiple_underscores"), + ("HelloWorld", "hello_world"), # Now handles camelCase input ("CamelCASEMixed", "camel_case_mixed"), ], ) @@ -46,7 +48,7 @@ def test_to_snake_case(self, input_text, expected): ("hello_world", "helloWorld"), ("some-variable-name", "someVariableName"), ("Convert to camel", "convertToCamel"), - ("already_camelCase", "alreadyCamelcase"), + ("already_camelCase", "alreadyCamelCase"), # Preserves existing case transitions ("mixed-Style_Example", "mixedStyleExample"), ("", ""), ("a", "a"), @@ -55,6 +57,8 @@ def test_to_snake_case(self, input_text, expected): ("123_numbers", "123Numbers"), ("__multiple__delimiters__", "multipleDelimiters"), ("!!!only!!!special!!!", "onlySpecial"), + ("HelloWorld", "helloWorld"), # Now properly handles CamelCase + ("XMLHttpRequest", "xmlHttpRequest"), # Consistent tokenization ], ) def test_to_camel_case(self, input_text, expected): @@ -67,7 +71,7 @@ def test_to_camel_case(self, input_text, expected): ("hello_world", "HelloWorld"), ("some-variable-name", "SomeVariableName"), ("convert to pascal", "ConvertToPascal"), - ("already_PascalCase", "AlreadyPascalcase"), + ("already_PascalCase", "AlreadyPascalCase"), # Preserves case transitions ("mixed-Style_Example", "MixedStyleExample"), ("", ""), ("a", "A"), @@ -75,6 +79,8 @@ def test_to_camel_case(self, input_text, expected): ("UPPERCASE", "Uppercase"), ("123_numbers", "123Numbers"), ("__multiple__delimiters__", "MultipleDelimiters"), + ("helloWorld", "HelloWorld"), # Handles camelCase input + ("XMLHttpRequest", "XmlHttpRequest"), # Consistent tokenization ], ) def test_to_pascal_case(self, input_text, expected): @@ -195,6 +201,29 @@ def test_remove_extra_whitespace(self, text, expected): assert remove_extra_whitespace(text) == expected +class TestTokenizer: + """Test suite for the shared tokenizer.""" + + def test_tokenizer_basic(self): + """Test basic tokenization.""" + assert _tokenize_string("HelloWorld") == ["Hello", "World"] + assert _tokenize_string("snake_case") == ["snake", "case"] + assert _tokenize_string("kebab-case") == ["kebab", "case"] + assert _tokenize_string("XMLHttpRequest") == ["XML", "Http", "Request"] + assert _tokenize_string("IOError") == ["IO", "Error"] + assert _tokenize_string("") == [] + + def test_tokenizer_consistency(self): + """Test that all conversion functions use consistent tokenization.""" + test_string = "XMLHttpRequest" + + # All should tokenize the same way + assert to_snake_case(test_string) == "xml_http_request" + assert to_camel_case(test_string) == "xmlHttpRequest" + assert to_pascal_case(test_string) == "XmlHttpRequest" + assert to_kebab_case(test_string) == "xml-http-request" + + class TestEdgeCases: """Test suite for edge cases and special scenarios.""" @@ -252,6 +281,10 @@ def test_truncate_edge_cases(self): # Very long suffix assert truncate_text("Short", 10, "VERYLONGSUFFIX") == "Short" assert truncate_text("This needs truncation", 10, "LONG") == "This nLONG" + + # Negative max_length should raise ValueError + with pytest.raises(ValueError, match="max_length must be non-negative"): + truncate_text("Test", -1) def test_unicode_handling(self): """Test handling of Unicode characters."""