Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,10 +44,14 @@ Or using the run script:
python trailpack/ui/run_streamlit.py
```

The UI provides a 3-step workflow with smooth transitions:
The UI provides a step-by-step workflow with smooth transitions:
1. **Upload File & Select Language**: Upload an Excel file and select the language for PyST mapping
2. **Select Sheet**: Choose which sheet to process with data preview
3. **Map Columns**: Map each column to PyST concepts with automatic suggestions and dataframe preview
4. **General Details**: Provide package metadata including:
- Basic information (name, title, description, version)
- **Resource name** with automatic sanitization and validation
- Licenses, contributors, and data sources

The view object is stored internally for further processing.

Expand Down
60 changes: 60 additions & 0 deletions tests/test_schema_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,3 +407,63 @@ def test_inconsistencies_export_to_csv(validator, sample_schema, tmp_path):
assert any(row['value'] == '456' for row in rows)
assert all(row['actual_type'] == 'int' for row in rows)
assert all(row['expected_type'] == 'str' for row in rows)


def test_sanitize_resource_name(validator):
    """Check that sanitize_resource_name maps raw names to the expected clean names."""
    expectations = [
        # Invalid characters: spaces, uppercase letters, punctuation
        ("My Resource!", "my_resource"),
        ("Test@123#ABC", "test123abc"),
        ("DATA FILE", "data_file"),
        # Special characters
        ("20_mw+", "20_mw"),
        ("test@#$%", "test"),
        # Dots at the start and end of the name
        (".test.name.", "test.name"),
        # A name that is already valid
        ("valid-name_123", "valid-name_123"),
        # Empty input, or input with no usable characters
        ("", "resource"),
        ("@#$%", "resource"),
    ]

    for raw_name, expected in expectations:
        assert validator.sanitize_resource_name(raw_name) == expected


def test_validate_and_sanitize_resource_name(validator):
    """Exercise validate_and_sanitize_resource_name with valid, invalid and auto-fixed names."""
    check = validator.validate_and_sanitize_resource_name

    # A valid name is returned unchanged and needs no suggestion
    ok, returned, hint = check("valid-name")
    assert ok is True
    assert returned == "valid-name"
    assert hint is None

    # An invalid name without auto_fix keeps the original and offers a suggestion
    ok, returned, hint = check("Invalid Name!")
    assert ok is False
    assert returned == "Invalid Name!"
    assert hint == "invalid_name"

    # An invalid name with auto_fix returns the sanitized name and no suggestion
    ok, returned, hint = check("Invalid Name!", auto_fix=True)
    assert ok is False
    assert returned == "invalid_name"
    assert hint is None


def test_validate_resource_suggests_sanitized_name(validator):
    """Check that validating a badly named resource warns with a sanitized suggestion."""
    bad_resource = dict(name="My Resource!", path="data.csv", format="csv")

    outcome = validator.validate_resource(bad_resource)

    # Flatten every warning into one string so substrings can be searched
    combined = " ".join(str(item) for item in outcome.warnings)

    # The warning text must mention both the offending name and its replacement
    for fragment in ("My Resource!", "my_resource"):
        assert fragment in combined
    assert "Suggested name" in combined or "suggested" in combined.lower()
187 changes: 185 additions & 2 deletions trailpack/ui/streamlit_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
from trailpack.pyst.api.requests.suggest import SUPPORTED_LANGUAGES
from trailpack.pyst.api.client import get_suggest_client
from trailpack.packing.datapackage_schema import DataPackageSchema, COMMON_LICENSES
from trailpack.validation import StandardValidator
from trailpack.config import (
build_mapping_config,
build_metadata_config,
Expand Down Expand Up @@ -157,6 +158,14 @@ def iri_to_web_url(iri: str, language: str = "en") -> str:
st.session_state.view_object = {}
if "general_details" not in st.session_state:
st.session_state.general_details = {}
if "resource_name" not in st.session_state:
st.session_state.resource_name = None
if "resource_name_confirmed" not in st.session_state:
st.session_state.resource_name_confirmed = False
if "resource_name_accepted" not in st.session_state:
st.session_state.resource_name_accepted = False
if "resource_name_editing" not in st.session_state:
st.session_state.resource_name_editing = False


def render_sidebar_header():
Expand Down Expand Up @@ -450,7 +459,7 @@ def generate_view_object() -> Dict[str, Any]:
col1, col2, col3 = st.columns([1, 1, 1])

with col3:
# Enable Next button if file exists (either uploaded or in session state)
# Enable Next button if file exists
has_file = uploaded_file is not None or st.session_state.file_name is not None

if has_file:
Expand Down Expand Up @@ -478,7 +487,7 @@ def generate_view_object() -> Dict[str, Any]:

navigate_to(2)
else:
st.button("Next ➡️", type="primary", disabled=True, use_container_width=True)
st.button("Next ➡️", type="primary", disabled=True, use_container_width=True, help="Please upload a file first")


# Page 2: Sheet Selection
Expand Down Expand Up @@ -892,6 +901,180 @@ def generate_view_object() -> Dict[str, Any]:
elif version == "":
st.session_state.general_details.pop("version", None)

st.markdown("---")
st.markdown("### 📝 Resource Name Configuration")
st.markdown("""
The resource name identifies your data file in the package. It must follow specific naming rules:
- Only **lowercase letters** (a-z)
- **Numbers** (0-9)
- **Hyphens** (-), **underscores** (_), and **dots** (.)
- No spaces or special characters

**Example:** `solar-panel-data`, `emissions_2024`, `my.dataset.v1`
""")

# Initialize validator
validator = StandardValidator()

# Get original filename + sheet name and check validity
if st.session_state.file_name and st.session_state.selected_sheet:
# Combine filename and sheet name
file_stem = Path(st.session_state.file_name).stem
sheet_name = st.session_state.selected_sheet.replace(' ', '_')
original_name = f"{file_stem}_{sheet_name}"
is_valid_original, _, suggested_name = validator.validate_and_sanitize_resource_name(original_name)

# Only show the error/suggestion if not yet accepted
if not st.session_state.resource_name_accepted:
# Show source file and sheet info
st.info(f"📄 **Source:** `{file_stem}` (file) + `{st.session_state.selected_sheet}` (sheet)")

# Show original combined name with validation status
if is_valid_original:
st.success(f"✅ **Combined name is valid:** `{original_name}`")
# If valid and not set, use it
if not st.session_state.resource_name:
st.session_state.resource_name = original_name
st.session_state.resource_name_accepted = True
else:
# Show the problem prominently
st.error(f"❌ **Combined name has issues:** `{original_name}`")

st.warning("""
**Issues found:**
- Uppercase letters → converted to lowercase
- Spaces → replaced with underscores
- Special characters → removed
""")

# Check if we're in edit mode or display mode
if not st.session_state.resource_name_editing:
# Display mode: show suggestion with Accept/Edit buttons
st.markdown(f"**Suggested sanitized name:** `{suggested_name}`")

col1, col2, col3 = st.columns([1, 1, 2])
with col1:
if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_suggestion"):
st.session_state.resource_name = suggested_name
st.session_state.resource_name_confirmed = True
st.session_state.resource_name_accepted = True
st.session_state.resource_name_editing = False
st.session_state.general_details["resource_name"] = suggested_name
st.rerun()
with col2:
if st.button("✏️ Edit", use_container_width=True, key="btn_edit_suggestion"):
st.session_state.resource_name = suggested_name
st.session_state.resource_name_editing = True
st.rerun()
else:
# Edit mode: show text input with validation
resource_name_edit = st.text_input(
"Edit Resource Name",
value=st.session_state.resource_name or suggested_name,
placeholder="my-data-resource",
help="Edit the resource name. Must contain only lowercase letters, numbers, hyphens, underscores, and dots.",
key="resource_name_edit_suggestion"
)

if resource_name_edit:
is_valid_edit, _, suggestion_edit = validator.validate_and_sanitize_resource_name(resource_name_edit)

if is_valid_edit:
st.success(f"✅ **`{resource_name_edit}`** is valid!")
else:
st.error(f"❌ **`{resource_name_edit}`** contains invalid characters.")
st.markdown(f"**Suggested fix:** `{suggestion_edit}`")

# Show buttons for editing
col1, col2, col3 = st.columns([1, 1, 2])
with col1:
if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_edit", disabled=not is_valid_edit):
st.session_state.resource_name = resource_name_edit
st.session_state.resource_name_confirmed = True
st.session_state.resource_name_accepted = True
st.session_state.resource_name_editing = False
st.session_state.general_details["resource_name"] = resource_name_edit
st.rerun()
with col2:
if st.button("↩️ Cancel", use_container_width=True, key="btn_cancel_edit"):
st.session_state.resource_name_editing = False
st.rerun()

# Show resource name input (either already accepted or for manual editing)
# Only show input section if name has been accepted or is being manually entered
if st.session_state.resource_name_accepted or st.session_state.resource_name:
st.markdown("---")

# If accepted, show as info with option to edit
if st.session_state.resource_name_accepted and st.session_state.resource_name_confirmed:
st.success(f"✅ **Resource name:** `{st.session_state.resource_name}`")
if st.button("✏️ Edit Resource Name", key="btn_edit_resource_name"):
st.session_state.resource_name_accepted = False
st.session_state.resource_name_editing = False # Reset editing flag
st.rerun()
else:
# Resource name input with real-time validation
resource_name_input = st.text_input(
"Resource Name *",
value=st.session_state.resource_name or "",
placeholder="my-data-resource",
help="Enter or edit the resource name. Must follow the naming rules above. (Required)",
key="resource_name_input_meta"
)

# Validate the entered/edited name in real-time
if resource_name_input:
is_valid_input, _, suggestion = validator.validate_and_sanitize_resource_name(resource_name_input)

if is_valid_input:
st.success(f"✅ **`{resource_name_input}`** is a valid resource name!")
# Show accept button for valid name
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 2])
with col_btn1:
if st.button("✅ Accept", use_container_width=True, type="primary", key="btn_accept_manual"):
st.session_state.resource_name = resource_name_input
st.session_state.resource_name_confirmed = True
st.session_state.resource_name_accepted = True
st.session_state.general_details["resource_name"] = resource_name_input
st.rerun()
with col_btn2:
if st.button("🔄 Reset", help="Reset to sanitized filename + sheet", use_container_width=True, key="btn_reset"):
if st.session_state.file_name and st.session_state.selected_sheet:
file_stem = Path(st.session_state.file_name).stem
sheet_name = st.session_state.selected_sheet.replace(' ', '_')
original_name = f"{file_stem}_{sheet_name}"
st.session_state.resource_name = validator.sanitize_resource_name(original_name)
st.session_state.resource_name_accepted = False
st.rerun()
else:
st.error(f"❌ **`{resource_name_input}`** contains invalid characters.")
st.markdown(f"**Suggested fix:** `{suggestion}`")

# Show buttons for invalid name
col_btn1, col_btn2, col_btn3 = st.columns([1, 1, 2])
with col_btn1:
if st.button("✅ Use Suggestion", use_container_width=True, type="primary", key="btn_use_suggestion"):
st.session_state.resource_name = suggestion
st.session_state.resource_name_accepted = True
st.session_state.resource_name_confirmed = True
st.session_state.general_details["resource_name"] = suggestion
st.rerun()
with col_btn2:
if st.button("🔄 Reset", help="Reset to sanitized filename + sheet", use_container_width=True, key="btn_reset_invalid"):
if st.session_state.file_name and st.session_state.selected_sheet:
file_stem = Path(st.session_state.file_name).stem
sheet_name = st.session_state.selected_sheet.replace(' ', '_')
original_name = f"{file_stem}_{sheet_name}"
st.session_state.resource_name = validator.sanitize_resource_name(original_name)
st.session_state.resource_name_accepted = False
st.rerun()

st.session_state.resource_name_confirmed = False
st.session_state.general_details.pop("resource_name", None)
else:
st.session_state.resource_name_confirmed = False
st.session_state.general_details.pop("resource_name", None)

st.markdown("### Additional Information")

# Profile (optional)
Expand Down
44 changes: 43 additions & 1 deletion trailpack/validation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,49 @@ field = {
result = validator.validate_field_definition(field)
```

### 6. Schema-Based Data Validation
### 6. Resource Name Sanitization

Resource names must match the pattern `^[a-z0-9\-_.]+$`. The validator can automatically sanitize invalid names:

```python
from trailpack.validation import StandardValidator

validator = StandardValidator("1.0.0")

# Check and get suggestion for invalid name
is_valid, original, suggestion = validator.validate_and_sanitize_resource_name("My Resource!")
print(f"Valid: {is_valid}") # False
print(f"Suggestion: {suggestion}") # "my_resource"

# Auto-sanitize names
is_valid, sanitized, _ = validator.validate_and_sanitize_resource_name("My Resource!", auto_fix=True)
print(f"Sanitized: {sanitized}") # "my_resource"

# Or use the sanitize method directly
clean_name = validator.sanitize_resource_name("Test@123#ABC")
print(clean_name) # "test123abc"
```

**When validating resources**, the validator automatically suggests sanitized names:

```python
resource = {
"name": "My Data File!", # Invalid: uppercase and special chars
"path": "data.csv",
"format": "csv"
}

result = validator.validate_resource(resource)
# Warning: Resource name 'My Data File!' contains invalid characters.
# Suggested name: 'my_data_file'
```

**In the UI**: When a resource name is auto-inferred (from the uploaded file name and the selected sheet name), the app will:
1. Detect invalid names
2. Show the suggested sanitized name
3. Ask for user confirmation before applying

### 7. Schema-Based Data Validation

The validator can check that DataFrame values match their field type definitions:

Expand Down
Loading