diff --git a/README.md b/README.md
index 0528d3cc..57a01552 100644
--- a/README.md
+++ b/README.md
@@ -44,10 +44,14 @@ Or using the run script:
 python trailpack/ui/run_streamlit.py
 ```
 
-The UI provides a 3-step workflow with smooth transitions:
+The UI provides a step-by-step workflow with smooth transitions:
 
 1. **Upload File & Select Language**: Upload an Excel file and select language for PyST mapping
 2. **Select Sheet**: Choose which sheet to process with data preview
 3. **Map Columns**: Map each column to PyST concepts with automatic suggestions and dataframe preview
+4. **General Details**: Provide package metadata including:
+   - Basic information (name, title, description, version)
+   - **Resource name** with automatic sanitization and validation
+   - Licenses, contributors, and data sources
 
 The view object is stored internally for further processing.
diff --git a/tests/test_schema_validation.py b/tests/test_schema_validation.py
index bbc41ef3..201bb99f 100644
--- a/tests/test_schema_validation.py
+++ b/tests/test_schema_validation.py
@@ -407,3 +407,63 @@ def test_inconsistencies_export_to_csv(validator, sample_schema, tmp_path):
     assert any(row['value'] == '456' for row in rows)
     assert all(row['actual_type'] == 'int' for row in rows)
     assert all(row['expected_type'] == 'str' for row in rows)
+
+
+def test_sanitize_resource_name(validator):
+    """Test resource name sanitization."""
+    # Test with invalid characters
+    assert validator.sanitize_resource_name("My Resource!") == "my_resource"
+    assert validator.sanitize_resource_name("Test@123#ABC") == "test123abc"
+    assert validator.sanitize_resource_name("DATA FILE") == "data_file"
+
+    # Test with special characters
+    assert validator.sanitize_resource_name("20_mw+") == "20_mw"
+    assert validator.sanitize_resource_name("test@#$%") == "test"
+
+    # Test with dots
+    assert validator.sanitize_resource_name(".test.name.") == "test.name"
+
+    # Test with valid name
+    assert validator.sanitize_resource_name("valid-name_123") == "valid-name_123"
+
+    # Test empty string
+    assert validator.sanitize_resource_name("") == "resource"
+    assert validator.sanitize_resource_name("@#$%") == "resource"
+
+
+def test_validate_and_sanitize_resource_name(validator):
+    """Test resource name validation with sanitization."""
+    # Valid name
+    is_valid, name, suggestion = validator.validate_and_sanitize_resource_name("valid-name")
+    assert is_valid is True
+    assert name == "valid-name"
+    assert suggestion is None
+
+    # Invalid name - get suggestion
+    is_valid, name, suggestion = validator.validate_and_sanitize_resource_name("Invalid Name!")
+    assert is_valid is False
+    assert name == "Invalid Name!"  # Original preserved when not auto_fix
+    assert suggestion == "invalid_name"
+
+    # Invalid name - auto fix
+    is_valid, name, suggestion = validator.validate_and_sanitize_resource_name("Invalid Name!", auto_fix=True)
+    assert is_valid is False
+    assert name == "invalid_name"  # Sanitized when auto_fix
+    assert suggestion is None
+
+
+def test_validate_resource_suggests_sanitized_name(validator):
+    """Test that resource validation suggests sanitized names."""
+    resource = {
+        "name": "My Resource!",
+        "path": "data.csv",
+        "format": "csv"
+    }
+
+    result = validator.validate_resource(resource)
+
+    # Should have a warning with suggested name
+    warnings_str = " ".join(str(w) for w in result.warnings)
+    assert "My Resource!" in warnings_str
+    assert "my_resource" in warnings_str
+    assert "suggested" in warnings_str.lower()
diff --git a/trailpack/ui/streamlit_app.py b/trailpack/ui/streamlit_app.py
index 40ba4af9..248a32e4 100644
--- a/trailpack/ui/streamlit_app.py
+++ b/trailpack/ui/streamlit_app.py
@@ -38,6 +38,7 @@ from trailpack.pyst.api.requests.suggest import SUPPORTED_LANGUAGES
 from trailpack.pyst.api.client import get_suggest_client
 
 from trailpack.packing.datapackage_schema import DataPackageSchema, COMMON_LICENSES
+from trailpack.validation import StandardValidator
 from trailpack.config import (
     build_mapping_config,
     build_metadata_config,
@@ -157,6 +158,14 @@ def iri_to_web_url(iri: str, language: str = "en") -> str:
     st.session_state.view_object = {}
 if "general_details" not in st.session_state:
     st.session_state.general_details = {}
+if "resource_name" not in st.session_state:
+    st.session_state.resource_name = None
+if "resource_name_confirmed" not in st.session_state:
+    st.session_state.resource_name_confirmed = False
+if "resource_name_accepted" not in st.session_state:
+    st.session_state.resource_name_accepted = False
+if "resource_name_editing" not in st.session_state:
+    st.session_state.resource_name_editing = False
 
 
 def render_sidebar_header():
@@ -450,7 +459,7 @@ def generate_view_object() -> Dict[str, Any]:
     col1, col2, col3 = st.columns([1, 1, 1])
 
     with col3:
-        # Enable Next button if file exists (either uploaded or in session state)
+        # Enable Next button if file exists
         has_file = uploaded_file is not None or st.session_state.file_name is not None
 
         if has_file:
@@ -478,7 +487,7 @@ def generate_view_object() -> Dict[str, Any]:
 
                 navigate_to(2)
         else:
-            st.button("Next ➡️", type="primary", disabled=True, use_container_width=True)
+            st.button("Next ➡️", type="primary", disabled=True, use_container_width=True, help="Please upload a file first")
 
 
 # Page 2: Sheet Selection
@@ -892,6 +901,180 @@ def generate_view_object() -> Dict[str, 
Any]:
     elif version == "":
         st.session_state.general_details.pop("version", None)
 
+    st.markdown("---")
+    st.markdown("### 📝 Resource Name Configuration")
+    st.markdown("""
+The resource name identifies your data file in the package. It must follow specific naming rules:
+- Only **lowercase letters** (a-z)
+- **Numbers** (0-9)
+- **Hyphens** (-), **underscores** (_), and **dots** (.)
+- No spaces or special characters
+
+**Example:** `solar-panel-data`, `emissions_2024`, `my.dataset.v1`
+    """)
+
+    # Initialize validator
+    validator = StandardValidator()
+
+    # Get original filename + sheet name and check validity
+    if st.session_state.file_name and st.session_state.selected_sheet:
+        # Combine filename and sheet name
+        file_stem = Path(st.session_state.file_name).stem
+        sheet_name = st.session_state.selected_sheet.replace(' ', '_')
+        original_name = f"{file_stem}_{sheet_name}"
+        is_valid_original, _, suggested_name = validator.validate_and_sanitize_resource_name(original_name)
+
+        # Only show the error/suggestion if not yet accepted
+        if not st.session_state.resource_name_accepted:
+            # Show source file and sheet info
+            st.info(f"📄 **Source:** `{file_stem}` (file) + `{st.session_state.selected_sheet}` (sheet)")
+
+            # Show original combined name with validation status
+            if is_valid_original:
+                st.success(f"✅ **Combined name is valid:** `{original_name}`")
+                # If valid and not set, use it
+                if not st.session_state.resource_name:
+                    st.session_state.resource_name = original_name
+                    st.session_state.resource_name_accepted = True
+            else:
+                # Show the problem prominently
+                st.error(f"❌ **Combined name has issues:** `{original_name}`")
+
+                st.warning("""
+**Issues found:**
+- Uppercase letters → converted to lowercase
+- Spaces → replaced with underscores
+- Special characters → removed
+                """)
+
+                # Check if we're in edit mode or display mode
+                if not st.session_state.resource_name_editing:
+                    # Display mode: show suggestion with Accept/Edit buttons
+                    st.markdown(f"**Suggested sanitized name:** `{suggested_name}`")
+
+                    col1, col2, col3 = st.columns([1, 1, 2])
+                    with col1:
+                        if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_suggestion"):
+                            st.session_state.resource_name = suggested_name
+                            st.session_state.resource_name_confirmed = True
+                            st.session_state.resource_name_accepted = True
+                            st.session_state.resource_name_editing = False
+                            st.session_state.general_details["resource_name"] = suggested_name
+                            st.rerun()
+                    with col2:
+                        if st.button("✏️ Edit", use_container_width=True, key="btn_edit_suggestion"):
+                            st.session_state.resource_name = suggested_name
+                            st.session_state.resource_name_editing = True
+                            st.rerun()
+                else:
+                    # Edit mode: show text input with validation
+                    resource_name_edit = st.text_input(
+                        "Edit Resource Name",
+                        value=st.session_state.resource_name or suggested_name,
+                        placeholder="my-data-resource",
+                        help="Edit the resource name. Must contain only lowercase letters, numbers, hyphens, underscores, and dots.",
+                        key="resource_name_edit_suggestion"
+                    )
+
+                    if resource_name_edit:
+                        is_valid_edit, _, suggestion_edit = validator.validate_and_sanitize_resource_name(resource_name_edit)
+
+                        if is_valid_edit:
+                            st.success(f"✅ **`{resource_name_edit}`** is valid!")
+                        else:
+                            st.error(f"❌ **`{resource_name_edit}`** contains invalid characters.")
+                            st.markdown(f"**Suggested fix:** `{suggestion_edit}`")
+
+                        # Show buttons for editing
+                        col1, col2, col3 = st.columns([1, 1, 2])
+                        with col1:
+                            if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_edit", disabled=not is_valid_edit):
+                                st.session_state.resource_name = resource_name_edit
+                                st.session_state.resource_name_confirmed = True
+                                st.session_state.resource_name_accepted = True
+                                st.session_state.resource_name_editing = False
+                                st.session_state.general_details["resource_name"] = resource_name_edit
+                                st.rerun()
+                        with col2:
+                            if st.button("↩️ Cancel", use_container_width=True, key="btn_cancel_edit"):
+                                st.session_state.resource_name_editing = False
+                                st.rerun()
+
+    # Show resource name input (either already accepted or for manual editing)
+    # Only show input section if name has been accepted or is being manually entered
+    if st.session_state.resource_name_accepted or st.session_state.resource_name:
+        st.markdown("---")
+
+        # If accepted, show as info with option to edit
+        if st.session_state.resource_name_accepted and st.session_state.resource_name_confirmed:
+            st.success(f"✅ **Resource name:** `{st.session_state.resource_name}`")
+            if st.button("✏️ Edit Resource Name", key="btn_edit_resource_name"):
+                st.session_state.resource_name_accepted = False
+                st.session_state.resource_name_editing = False  # Reset editing flag
+                st.rerun()
+        else:
+            # Resource name input with real-time validation
+            resource_name_input = st.text_input(
+                "Resource Name *",
+                value=st.session_state.resource_name or "",
+                placeholder="my-data-resource",
+                help="Enter or edit the resource name. Must follow the naming rules above. (Required)",
+                key="resource_name_input_meta"
+            )
+
+            # Validate the entered/edited name in real-time
+            if resource_name_input:
+                is_valid_input, _, suggestion = validator.validate_and_sanitize_resource_name(resource_name_input)
+
+                if is_valid_input:
+                    st.success(f"✅ **`{resource_name_input}`** is a valid resource name!")
+                    # Show accept button for valid name
+                    col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 2])
+                    with col_btn1:
+                        if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_manual"):
+                            st.session_state.resource_name = resource_name_input
+                            st.session_state.resource_name_confirmed = True
+                            st.session_state.resource_name_accepted = True
+                            st.session_state.general_details["resource_name"] = resource_name_input
+                            st.rerun()
+                    with col_btn2:
+                        if st.button("🔄 Reset", help="Reset to sanitized filename + sheet", use_container_width=True, key="btn_reset"):
+                            if st.session_state.file_name and st.session_state.selected_sheet:
+                                file_stem = Path(st.session_state.file_name).stem
+                                sheet_name = st.session_state.selected_sheet.replace(' ', '_')
+                                original_name = f"{file_stem}_{sheet_name}"
+                                st.session_state.resource_name = validator.sanitize_resource_name(original_name)
+                                st.session_state.resource_name_accepted = False
+                                st.rerun()
+                else:
+                    st.error(f"❌ **`{resource_name_input}`** contains invalid characters.")
+                    st.markdown(f"**Suggested fix:** `{suggestion}`")
+
+                    # Show buttons for invalid name
+                    col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 2])
+                    with col_btn1:
+                        if st.button("✅ Use Suggestion", use_container_width=True, type="primary", key="btn_use_suggestion"):
+                            st.session_state.resource_name = suggestion
+                            st.session_state.resource_name_accepted = True
+                            st.session_state.resource_name_confirmed = True
+                            st.session_state.general_details["resource_name"] = suggestion
+                            st.rerun()
+                    with col_btn2:
+                        if st.button("🔄 Reset", help="Reset to sanitized filename + sheet", use_container_width=True, key="btn_reset_invalid"):
+                            if st.session_state.file_name and st.session_state.selected_sheet:
+                                file_stem = Path(st.session_state.file_name).stem
+                                sheet_name = st.session_state.selected_sheet.replace(' ', '_')
+                                original_name = f"{file_stem}_{sheet_name}"
+                                st.session_state.resource_name = validator.sanitize_resource_name(original_name)
+                                st.session_state.resource_name_accepted = False
+                                st.rerun()
+
+                    st.session_state.resource_name_confirmed = False
+                    st.session_state.general_details.pop("resource_name", None)
+            else:
+                st.session_state.resource_name_confirmed = False
+                st.session_state.general_details.pop("resource_name", None)
+
     st.markdown("### Additional Information")
 
     # Profile (optional)
diff --git a/trailpack/validation/README.md b/trailpack/validation/README.md
index c87d3fdb..de6ddefd 100644
--- a/trailpack/validation/README.md
+++ b/trailpack/validation/README.md
@@ -126,7 +126,49 @@ field = {
 result = validator.validate_field_definition(field)
 ```
 
-### 6. Schema-Based Data Validation
+### 6. Resource Name Sanitization
+
+Resource names must match the pattern `^[a-z0-9\-_.]+$`. 
The validator can automatically sanitize invalid names:
+
+```python
+from trailpack.validation import StandardValidator
+
+validator = StandardValidator("1.0.0")
+
+# Check and get suggestion for invalid name
+is_valid, original, suggestion = validator.validate_and_sanitize_resource_name("My Resource!")
+print(f"Valid: {is_valid}")  # False
+print(f"Suggestion: {suggestion}")  # "my_resource"
+
+# Auto-sanitize names
+is_valid, sanitized, _ = validator.validate_and_sanitize_resource_name("My Resource!", auto_fix=True)
+print(f"Sanitized: {sanitized}")  # "my_resource"
+
+# Or use the sanitize method directly
+clean_name = validator.sanitize_resource_name("Test@123#ABC")
+print(clean_name)  # "test123abc"
+```
+
+**When validating resources**, the validator automatically suggests sanitized names:
+
+```python
+resource = {
+    "name": "My Data File!",  # Invalid: uppercase and special chars
+    "path": "data.csv",
+    "format": "csv"
+}
+
+result = validator.validate_resource(resource)
+# Warning: Resource name 'My Data File!' contains invalid characters.
+# Suggested name: 'my_data_file'
+```
+
+**In the UI**: When resource names are auto-inferred (e.g., from filenames), the Streamlit app uses the validator and will:
+1. Detect invalid names
+2. Show the suggested sanitized name
+3. Ask for user confirmation before applying
+
+### 7. Schema-Based Data Validation
 
 The validator can check that DataFrame values match their field type definitions:
 
diff --git a/trailpack/validation/standard_validator.py b/trailpack/validation/standard_validator.py
index 517f8565..085aee92 100644
--- a/trailpack/validation/standard_validator.py
+++ b/trailpack/validation/standard_validator.py
@@ -196,7 +196,7 @@ class StandardValidator:
 
     The StandardValidator checks data packages for:
     - Metadata completeness (required and recommended fields)
-    - Resource definitions (proper schema, formats)
+    - Resource definitions (proper schema, formats, name sanitization)
     - Field definitions (types, units, constraints)
     - Data quality (missing values, duplicates, type consistency)
     - Schema matching (column types match field definitions)
@@ -206,6 +206,12 @@ class StandardValidator:
     - Counts/IDs: Use dimensionless unit (http://qudt.org/vocab/unit/NUM)
     - Percentages: Use percent or dimensionless unit
 
+    **Resource Name Sanitization:**
+    Resource names must match ^[a-z0-9\\-_.]+$. The validator automatically:
+    - Detects invalid resource names
+    - Suggests sanitized alternatives
+    - Can auto-sanitize names with sanitize_resource_name()
+
     **Automatic Inconsistency Export:**
     When type inconsistencies are detected during validation (e.g., mixed types
     in a column), each inconsistent value is tracked and automatically exported to
@@ -223,6 +229,10 @@ class StandardValidator:
         >>> # Validate with schema (auto-exports inconsistencies.csv if errors found)
         >>> result = validator.validate_data_quality(df, schema=schema)
         >>> print(result)  # Shows errors and exports CSV automatically
+
+        >>> # Sanitize resource names
+        >>> clean_name = validator.sanitize_resource_name("My File!")
+        >>> print(clean_name)  # "my_file"
     """
 
     def __init__(self, version: str = "1.0.0"):
@@ -362,6 +372,8 @@ def validate_resource(self, resource: Dict[str, Any]) -> ValidationResult:
         """
         Validate a resource (data file) definition.
 
+        Automatically checks and suggests sanitized names for invalid resource names.
+
         Args:
             resource: Resource dictionary from metadata
 
@@ -388,6 +400,19 @@ def validate_resource(self, resource: Dict[str, Any]) -> ValidationResult:
                     field_def
                 )
                 result.errors.extend(field_result.errors)
+
+                # Special handling for resource name - suggest sanitized version
+                if field_name == "name":
+                    is_valid, _, suggestion = self.validate_and_sanitize_resource_name(
+                        resource[field_name],
+                        auto_fix=False
+                    )
+                    if not is_valid and suggestion:
+                        result.add_warning(
+                            f"Resource name '{resource[field_name]}' contains invalid characters. "
+                            f"Suggested name: '{suggestion}'",
+                            "name"
+                        )
 
         # 2. Check format preference
         if "format" in resource:
@@ -822,6 +847,106 @@ def _validate_field_value(
 
         return result
 
+    def sanitize_resource_name(self, name: str) -> str:
+        r"""
+        Sanitize resource name to match the required pattern ^[a-z0-9\-_.]+$.
+
+        The resource name must only contain:
+        - Lowercase letters (a-z)
+        - Numbers (0-9)
+        - Hyphens (-)
+        - Underscores (_)
+        - Dots (.)
+
+        The name is lowercased, spaces become underscores, every other
+        disallowed character is dropped, and leading/trailing dots are
+        stripped. If nothing is left, the placeholder "resource" is returned.
+
+        Args:
+            name: Raw name to sanitize. None is treated as an empty name and
+                other non-string values are converted with str().
+
+        Returns:
+            Sanitized name matching the required pattern
+
+        Example:
+            >>> validator = StandardValidator()
+            >>> validator.sanitize_resource_name("My Resource Name!")
+            'my_resource_name'
+            >>> validator.sanitize_resource_name("Test@123")
+            'test123'
+        """
+        # Metadata loaded from JSON/YAML can carry a non-string name; coerce
+        # it so sanitization never raises on malformed input.
+        name = "" if name is None else str(name)
+
+        # Convert to lowercase
+        name = name.lower()
+
+        # Replace spaces with underscores
+        name = name.replace(' ', '_')
+
+        # Remove or replace invalid characters
+        # Keep only lowercase letters, numbers, hyphens, underscores, and dots
+        name = re.sub(r'[^a-z0-9\-_.]', '', name)
+
+        # Ensure name doesn't start or end with dots
+        name = name.strip('.')
+
+        # Ensure name is not empty after sanitization
+        if not name:
+            name = "resource"
+
+        return name
+
+    def validate_and_sanitize_resource_name(
+        self,
+        name: str,
+        auto_fix: bool = False
+    ) -> Tuple[bool, str, Optional[str]]:
+        """
+        Validate a resource name and optionally sanitize it.
+
+        A name is valid only if the whole string consists of lowercase
+        letters, digits, hyphens, underscores and dots; a trailing newline
+        or a non-string value makes it invalid.
+
+        Args:
+            name: Resource name to validate
+            auto_fix: If True, return sanitized name; if False, just validate
+
+        Returns:
+            Tuple of (is_valid, original_or_sanitized_name, suggestion)
+            - is_valid: Whether the original name is valid
+            - original_or_sanitized_name: Original name if valid/not auto_fix, sanitized if auto_fix
+            - suggestion: Sanitized name if original is invalid and auto_fix is False, None otherwise
+
+        Example:
+            >>> validator = StandardValidator()
+            >>> is_valid, name, suggestion = validator.validate_and_sanitize_resource_name("Invalid Name!")
+            >>> print(f"Valid: {is_valid}, Suggestion: {suggestion}")
+            Valid: False, Suggestion: invalid_name
+
+            >>> is_valid, name, _ = validator.validate_and_sanitize_resource_name("valid-name")
+            >>> print(f"Valid: {is_valid}, Name: {name}")
+            Valid: True, Name: valid-name
+        """
+        # fullmatch() rather than match() with "^...$": "$" also matches just
+        # before a trailing newline, which would let "name\n" pass as valid.
+        is_valid = (
+            isinstance(name, str)
+            and re.fullmatch(r"[a-z0-9\-_.]+", name) is not None
+        )
+
+        if is_valid:
+            return True, name, None
+
+        sanitized = self.sanitize_resource_name(name)
+
+        if auto_fix:
+            return False, sanitized, None
+        return False, name, sanitized
+
     def _determine_level(self, result: ValidationResult) -> str:
         """
         Determine validation level based on errors and warnings.