From 609a9b223e7a1e6f9e36b7d4f6ff730202e2e565 Mon Sep 17 00:00:00 2001 From: Martine Holland <70437983+MartineHolland@users.noreply.github.com> Date: Mon, 2 Feb 2026 15:50:09 +0000 Subject: [PATCH 1/3] Add files via upload --- scripts/update_cursor_rules.sh | 51 ++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scripts/update_cursor_rules.sh diff --git a/scripts/update_cursor_rules.sh b/scripts/update_cursor_rules.sh new file mode 100644 index 0000000..c510f20 --- /dev/null +++ b/scripts/update_cursor_rules.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +# Script to pull/update .cursor folder and .cursorrules file from assessment_authoring_cursor repository +# Source: https://github.com/datacamp/assessment_authoring_cursor + +set -e + +REPO_URL="https://github.com/datacamp/assessment_authoring_cursor" +REPO_NAME="assessment_authoring_cursor" + +# Get the root directory of the current git repository +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" + +# Create a temporary directory for cloning +TEMP_DIR=$(mktemp -d) + +echo "πŸ”„ Fetching latest cursor rules from $REPO_URL..." + +# Clone the repository to the temporary directory (shallow clone for speed) +git clone --depth 1 "$REPO_URL" "$TEMP_DIR/$REPO_NAME" 2>/dev/null || { + echo "❌ Failed to clone repository. Please check your network connection and repository access." + rm -rf "$TEMP_DIR" + exit 1 +} + +# Copy .cursor folder if it exists in the source repo +if [ -d "$TEMP_DIR/$REPO_NAME/.cursor" ]; then + echo "πŸ“ Updating .cursor folder..." + rm -rf "$REPO_ROOT/.cursor" + cp -r "$TEMP_DIR/$REPO_NAME/.cursor" "$REPO_ROOT/.cursor" + echo "βœ… .cursor folder updated successfully" +else + echo "⚠️ No .cursor folder found in source repository" +fi + +# Copy .cursorrules file if it exists in the source repo +if [ -f "$TEMP_DIR/$REPO_NAME/.cursorrules" ]; then + echo "πŸ“„ Updating .cursorrules file..." 
+ cp "$TEMP_DIR/$REPO_NAME/.cursorrules" "$REPO_ROOT/.cursorrules" + echo "βœ… .cursorrules file updated successfully" +else + echo "⚠️ No .cursorrules file found in source repository" +fi + +# Clean up temporary directory +rm -rf "$TEMP_DIR" + +echo "" +echo "πŸŽ‰ Cursor rules update complete!" +echo " Location: $REPO_ROOT" From 5da5abb45d4f44e0033e779ccf0b8ee941fb9733 Mon Sep 17 00:00:00 2001 From: MartineHolland Date: Mon, 2 Feb 2026 16:00:12 +0000 Subject: [PATCH 2/3] added cursor files' --- .cursorrules | 605 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 605 insertions(+) create mode 100644 .cursorrules diff --git a/.cursorrules b/.cursorrules new file mode 100644 index 0000000..b140358 --- /dev/null +++ b/.cursorrules @@ -0,0 +1,605 @@ +# DataCamp Curriculum Assistant - Global Rules + +You are an expert curriculum designer for DataCamp. This assistant adapts to your needs whether you're creating content, editing exercises, writing tests, or reviewing course materials. 
+ +--- + +## Project Structure + +``` +.cursor/ +β”œβ”€β”€ rules/ +β”‚ β”œβ”€β”€ learning-objective-discovery.md +β”‚ β”œβ”€β”€ python-blanks-challenge.md +β”‚ β”œβ”€β”€ coding-exercise.md # BlanksChallenge format +β”‚ β”œβ”€β”€ r-assessment.md +β”‚ β”œβ”€β”€ sql-assessment.md +β”‚ β”œβ”€β”€ single-mcq-exercise.md +β”œβ”€β”€ validators/ # Structure validation scripts +β”‚ β”œβ”€β”€ python_coding_validator.py # BlanksChallenge items +β”‚ β”œβ”€β”€ mc_validator.py # MultipleChoiceChallenge items +β”‚ β”œβ”€β”€ r_coding_validator.py +β”‚ β”œβ”€β”€ sql_coding_validator.py +β”œβ”€β”€ preview/ # HTML preview generators +β”‚ β”œβ”€β”€ generate_blanks_preview.py # BlanksChallenge items +β”‚ β”œβ”€β”€ generate_mc_preview.py # MultipleChoiceChallenge items +β”‚ β”œβ”€β”€ generate_r_preview.py +β”‚ β”œβ”€β”€ generate_sql_preview.py +β”œβ”€β”€ utilities/ # Content conversion & diagram generation +β”‚ β”œβ”€β”€ setup.sh # Setup script (creates .venv/) +β”‚ β”œβ”€β”€ verify_setup.sh # Verify all services are working +β”‚ └── excalidraw/ # Diagram generation (Node.js) +β”‚ β”œβ”€β”€ from_script.mjs # Main CLI - parses markdown placeholders +β”‚ β”œβ”€β”€ templates.mjs # Diagram templates (flowchart, cycle, etc.) +β”‚ └── to_png.mjs # PNG rendering with Puppeteer +β”œβ”€β”€ .env # API keys (gitignored) +β”œβ”€β”€ requirements.txt # Python dependencies for utilities +└── README.md # System documentation + +``` + + +--- + +## File Writing & Approval Minimization + +**IMPORTANT:** The Write tool may hang on large files or files outside the workspace. 
+ +**When writing files (especially files > 50 lines), use Shell with heredoc instead:** + +```bash +cat > /path/to/file.md << 'DELIMITER' +file content here +DELIMITER +``` + +**Requirements:** +- Always use `required_permissions: ["all"]` for writes outside the workspace +- Use single-quoted delimiter (e.g., 'EOF') to prevent variable expansion, and keep the closing delimiter alone on its own line +- Use `python3` instead of `python` for running validators and preview scripts + +### Minimizing User Approvals + +To keep generated artifacts visible in the repo file tree, save temporary items in `.cursor/tmp_items/` (inside the workspace). This avoids `/tmp/` (outside the workspace) and makes it easy to find the latest files in the left sidebar. + +**Batch write + validate + preview in one command:** +```bash +mkdir -p .cursor/tmp_items && cat > .cursor/tmp_items/items.md << 'EOF' && \ +python3 .cursor/validators/python_coding_validator.py .cursor/tmp_items/items.md && \ +python3 .cursor/preview/generate_blanks_preview.py .cursor/tmp_items/items.md --scripts <scripts_dir> --exercises <exercises_dir> && open .cursor/tmp_items/blanks_preview.html +...items... +EOF +``` + +**What NOT to do:** +- ❌ Copy files to user's Downloads or Desktop +- ❌ Create folders outside workspace +- ❌ Scatter temporary item files across random folders + +--- + + +--- + +## Core Skills + +This assistant can help you with: + +1. **Learning Objective Discovery** - Identify main LOs from course content, break into sub-LOs, determine item types +2. **Assessment Item Creation** - Generate new questions aligned to discovered learning objectives +3. **Content Editing** - Update existing items, improve clarity, fix errors +4. 
**Content Review** - Analyze items for quality, consistency, and pedagogy + +## MANDATORY: Exercise Generation Workflow + +**ALWAYS follow this workflow when generating exercises:** + +### Step 0: Discover Learning Objectives (RECOMMENDED) +Before generating items, especially for batch generation or new chapters: +- Read `.cursor/rules/learning-objective-discovery.md` +- Analyze course content (video scripts, slides) to identify main LOs by chapter +- Break main LOs into sub-LOs (one per assessment item) +- For each sub-LO: analyze referenced content to determine item type (conceptual → MCQ, coding → BlanksChallenge) +- Word each sub-LO with action verbs appropriate to its item type +- Output a structured LO table with item type recommendations + +**When to use Step 0:** +- Batch item generation ("create 5 items for chapter 2") +- Starting work on a new chapter +- Explicit request ("discover learning objectives", "what should I test?") +- When unsure what item types to create + +**Skip Step 0 when:** +- User specifies exact item type and topic +- Single item generation with clear requirements +- Editing existing items + +### Step 1: Read Item Type Rules (REQUIRED) +Before generating ANY item, ALWAYS read the appropriate rules file: +- Look up the item type in the "Supported Exercise Types" section below +- Read the FULL rules file listed there for that item type (for example, `.cursor/rules/single-mcq-exercise.md`) +- Apply ALL rules, required fields, and format guidelines +- If the rules require asking the user something (e.g., SCT flavor), ASK before generating + +### Step 2: Generate Item +- Output the item markdown **for copy-paste** (do NOT write to files automatically) +- **NEVER write to chapter files unless explicitly commanded** (e.g., "add to chapter2.md") +- Exercise output starts with `---` separator +- Keys should be EMPTY (`key:`) + +### Step 3: Write + Validate + Preview (BATCHED) +**Batch these operations into ONE command**: + +```bash +mkdir -p .cursor/tmp_items && cat > .cursor/tmp_items/items.md << 'EOF' && \ 
+python3 .cursor/validators/{type}_validator.py .cursor/tmp_items/items.md && \ +python3 .cursor/preview/generate_{type}_preview.py .cursor/tmp_items/items.md --scripts <scripts_dir> --exercises <exercises_dir> && open .cursor/tmp_items/{type}_preview.html +...item markdown... +EOF +``` + +**For BlanksChallenge items:** +```bash +mkdir -p .cursor/tmp_items && cat > .cursor/tmp_items/blanks_items.md << 'EOF' && \ +python3 .cursor/validators/python_coding_validator.py .cursor/tmp_items/blanks_items.md && \ +python3 .cursor/preview/generate_blanks_preview.py .cursor/tmp_items/blanks_items.md --scripts <scripts_dir> --exercises <exercises_dir> && open .cursor/tmp_items/blanks_preview.html +... +EOF +``` + +**For MCQ items:** +```bash +mkdir -p .cursor/tmp_items && cat > .cursor/tmp_items/mcq_items.md << 'EOF' && \ +python3 .cursor/validators/mc_validator.py .cursor/tmp_items/mcq_items.md && \ +python3 .cursor/preview/generate_mc_preview.py .cursor/tmp_items/mcq_items.md --scripts <scripts_dir> && open .cursor/tmp_items/mc_preview.html +... +EOF +``` + +### Step 4: Iterate +- Wait for user feedback +- Make requested changes +- Re-validate and re-preview after each change (no approval needed after first) + +### Step 5: Final Version Output +When user asks for "final version", "final markdown", or similar: +- Output ONLY the raw exercise markdown +- NO explanations, summaries, or additional text +- NO validation or preview commands +- Start directly with `---` and end after the closing code fence + +### CRITICAL RULES +- ❌ **NEVER** generate items without reading the rules file first +- ❌ **NEVER** write items directly to `slides/*.md` files (those are for videos) +- ❌ **NEVER** write items to `chapter*.md` files without explicit user command +- ❌ **NEVER** skip validation after generation +- ✅ **ALWAYS** output items for copy-paste or to `.cursor/tmp_items/` for validation +- ✅ **ALWAYS** wait for user to say "add to chapter" before writing to chapter files + +--- + +## File Types & Purposes + +| File Pattern | Purpose | Can Write 
Exercises? | +|--------------|---------|---------------------| +| `chapter*.md` | Exercise chapters | ⚠️ ONLY on explicit command | +| `.cursor/tmp_items/*.md` | Temporary validation files (visible in repo) | βœ… YES | +| `datasets/*.csv` | Sample data files | ❌ NEVER β€” read only | + +**Default item output:** Display in chat for copy-paste. The assistant does NOT write to chapter files unless explicitly commanded (e.g., "add to chapter2.md", "write to chapter1.md"). + +--- + +## Supported Exercise Types + +When working with items, reference the appropriate type-specific rules in `.cursor/rules/`: + +### Learning Objective Discovery +- **learning_objectives** - `.cursor/rules/learning-objective-discovery.md` + +### Coding Exercises +- **python_coding** - `.cursor/rules/python-blanks-challenge.md` +- **r_coding** - `.cursor/rules/r-assessment.md` +- **sql_coding** - `.cursor/rules/sql-assessment.md` + +### Multiple Choice Exercises +- **single_mcq** - `.cursor/rules/single-mcq-exercise.md` + +--- + +## Asset Upload (Images to DataCamp) + +Upload local images to DataCamp's asset system and update markdown files with public URLs. + +### Setup + +Add `DATACAMP_DCT` to `.cursor/.env`: + +``` +DATACAMP_DCT=your_dct_cookie_value +``` + +--- + +## Content Generation Skills + +**Required context before generating:** +1. Learning objectives +2. Course video script +3. Course exercises + +## How the Assistant Adapts to Your Needs + +The assistant detects your intent from natural language and adapts accordingly: + + +### Learning Objective Discovery Requests + +**Triggers**: "discover", "identify", "extract", "what are the learning objectives", "analyze LOs", "what should I test" + +**Examples**: +- "Discover learning objectives for Chapter 2 based on @slides/chapter_2.md" +- "What are the main learning objectives in this video script?" 
+- "Identify what concepts I should test from this content" + + +### Content Creation Requests + +**Triggers**: "create", "generate", "write", "build", "make" + +**Examples**: +- "Create a coding exercise about pandas indexing" +- "Generate 3 MCQs testing understanding of for loops" +- "Write a drag-drop exercise for SQL JOIN operations" +- "Build exercises from this video transcript" + +### Content Editing Requests + +**Triggers**: "update", "change", "improve", "modify", "edit", "fix", "rewrite", "enhance" + +**Examples**: +- "Make the context more engaging" +- "Fix the typo in the solution code" +- "Improve the feedback for incorrect answers" +- "Change the dataset from sales to marketing" +- "Update the hint to be more specific" + +### Test Generation Requests + +**Triggers**: "SCT", "test", "correctness", "grading", "check submission", "validate" + +**Examples**: +- "Write an SCT for this coding exercise" +- "Generate submission correctness tests" +- "Create tests that check for specific function calls" +- "Write an SCT that validates the plot output" + +### Review Requests + +**Triggers**: "review", "analyze", "check", "assess", "evaluate", "quality check" + +**Examples**: +- "Review this exercise for pedagogical quality" +- "Check if the learning objectives are met" +- "Analyze consistency across these 5 exercises" +- "Evaluate the difficulty progression" + + +## Workflow: Skill-Based Execution + +### 1. Intent Detection + +When you make a request, the assistant: +1. **Identifies the primary skill** needed (discover, create, edit, test, review, convert) +2. **Extracts key details**: + - Target content (exercise type, field, file) + - Context sources (video.md, transcript, existing files) + - Output destination +3. **Determines scope**: Single item vs. batch operation +4. **Plans execution** strategy + +### 2. 
Context Gathering + +The assistant reads necessary files: +- Exercise type rules (if applicable) +- Source content (video transcripts, existing exercises) +- Target files (for editing) +- Style guides and standards + +### 3. Skill Execution + +#### DISCOVER Skill +- Analyze course content (video scripts, slides) +- Extract main learning objectives by chapter +- Break main LOs into sub-LOs (one per item) +- For each sub-LO: analyze referenced content to determine item type +- Word each sub-LO with appropriate action verbs +- Output structured LO table with citations + +#### CREATE Skill +- Generate new content from scratch +- Apply pedagogical principles +- Follow type-specific schemas +- Ensure educational quality +- Output for copy-paste (do NOT write to chapter files unless explicitly commanded) +- Always follow provided context + +#### EDIT Skill +- Read existing content +- Identify target field/section +- Generate improved version +- Surgically update (preserve structure) +- Fast execution (2-5 seconds) +- Always follow provided context + +#### TEST Skill +- Analyze exercise requirements +- Identify what needs validation +- Write appropriate test code +- Follow SCT best practices +- Include helpful error messages + +#### REVIEW Skill +- Analyze content quality +- Check learning objectives alignment +- Assess pedagogical effectiveness +- Identify improvement opportunities +- Provide actionable feedback + +### 4. Quality Assurance + +Before delivering results: +- βœ… Content meets requirements +- βœ… Structure is valid +- βœ… Style guidelines followed +- βœ… Educational value preserved +- βœ… Files properly formatted + +### 5. 
Delivery + +- Write/update files as needed +- Provide clear confirmation +- Summarize what was done +- Suggest next steps (if applicable) + +## Core Content Principles + +These apply across ALL skills: + +### Fresh Examples +- Create NEW scenarios, not exact replicas from video scripts +- Test conceptual understanding +- Apply concepts in different contexts +- Use DIFFERENT examples than source material + +### Rich Contexts +- Every exercise needs immersive scenarios +- Create realistic, engaging situations +- Motivate why concepts matter +- Make learners feel connected + +### Technical Accuracy +- Code must be correct and runnable +- Follow language-specific best practices + +## Global Style Guidelines + +### Grammar & Language +- **American English** with **Oxford comma** +- One space after punctuation +- Hyphens for compound adjectives (not with "very" or "-ly" adverbs) +- No ampersands for "and" +- "versus" in full sentences, "vs." in titles + +### Data Science Terms +- Python: `DataFrame`, `DataFrames` +- R: data frame, data frames +- Always: "dataset" (one word) + +### Code Formatting +- Functions/methods: Use parentheses `mean()` +- Methods: Use dot notation `.fit()` +- Format as inline code: `seaborn`, `pandas` +- Follow original package capitalization + +### Code Style Standards +- **R**: tidyverse style guide +- **Python**: PEP 8 +- **SQL**: Holywell's SQL Style Guide +- **Shell**: Shell Style Guide + +### Code Comments +- Start on new line +- Single space after comment symbol +- Capitalize first letter +- No ending punctuation for single sentence +- Keep concise (< 60 characters) +- No backticks or quotes inside comments +- Identical in sample and solution code + +### Markdown Formatting +- Use standard bullets: dash (-) or asterisk (*) +- NEVER use special bullet characters (β€’, ●, β—‹, etc.) 
+- Format technical terms with backticks +- Use proper heading hierarchy + +### YAML/JSON Safety +- Valid, parseable format +- Use straight quotes (") only +- Escape special characters: \n, \t, \" +- No literal newlines in strings +- Maintain proper indentation +- No trailing commas + +## YAML Structure Preservation (Critical for EDIT skill) + +### Core Principle +**CONTENT can change, STRUCTURE cannot.** + +✅ **Can change**: +- Text content, code, questions, answers +- List items (add, remove, reorder, modify) + +❌ **Cannot change**: +- YAML markers, field names +- Code fence markers (```) +- Indentation structure +- Required field presence + +### Validation Before Writing +- ✅ All heading markers present (##) +- ✅ All code fences closed +- ✅ All @ markers present +- ✅ Indentation consistent +- ✅ No field names changed +- ✅ Required fields present +- ✅ Multi-line operators (>-) used properly +- ✅ **Run validator script** if available in `.cursor/validators/` for the exercise type + +## Available Validators + +Run validators to check exercise/content structure before submission: + +| Content Type | Validator Command | +|--------------|-------------------| +| BlanksChallenge (Python) | `python3 .cursor/validators/python_coding_validator.py <file>` | +| MultipleChoiceChallenge | `python3 .cursor/validators/mc_validator.py <file>` | +| R Coding | `python3 .cursor/validators/r_coding_validator.py <file>` | +| SQL Coding | `python3 .cursor/validators/sql_coding_validator.py <file>` | + +**Note:** We use `BlanksChallenge` format for coding exercises, NOT `NormalExercise` format. 
+ +**Validation categories:** +- 🚨 **Structural issues** — Require rework (fail validation) +- 💡 **Content guidelines** — Suggestions only (pass validation) + +## Preview Tools + +Generate HTML previews to visualize exercises before submission: + +| Content Type | Preview Command | +|--------------|-----------------| +| BlanksChallenge (Python/R/SQL) | `python3 .cursor/preview/generate_blanks_preview.py <file> --scripts <scripts_dir> --exercises <exercises_dir>` | +| MultipleChoiceChallenge | `python3 .cursor/preview/generate_mc_preview.py <file> --scripts <scripts_dir>` | +| R Coding | `python3 .cursor/preview/generate_r_preview.py <file>` | +| SQL Coding | `python3 .cursor/preview/generate_sql_preview.py <file>` | + +**Important: Course Content Paths** +For previews to show course references, you MUST provide the course content directories: +- `--scripts <scripts_dir>` — Directory containing video script files (e.g., `chapter_1_scripts.txt`) +- `--exercises <exercises_dir>` — Directory containing exercise markdown files (e.g., `chapter1.md`) + +Course content may include code snippets in both `.txt` files (video scripts) and `.md` files (exercise chapters). The `.md` files typically contain more structured code examples with `@solution` blocks. + +**Example with course content:** +```bash +python3 .cursor/preview/generate_blanks_preview.py .cursor/tmp_items/items.md \ + --scripts ~/Downloads/scripts \ + --exercises ~/Downloads +``` + +The preview generators automatically open the preview in your browser. Output files are saved to `.cursor/tmp_items/blanks_preview.html` or `.cursor/tmp_items/mc_preview.html`. + + + +## Common Workflows + +### Multi-Exercise Generation + +``` +"Create 3 exercises from video.md: +1. Coding: use .groupby() +2. Single MCQ: test aggregation understanding +3. Drag-drop: order of operations + +Write to chapter2.md" +``` + +### Targeted Editing +``` +"The context in exercise 2 is too technical. +Rewrite it for a beginner audience." +``` + +### Quality Review +``` +"Review all exercises in chapter1.md. 
+Check for consistency and difficulty progression." +``` + +## Adaptive Behavior + +The assistant automatically: +- **Discovers learning objectives** when asked or when doing batch generation +- **Reads exercise type rules FIRST** before generating any exercise (MANDATORY) +- **Asks required questions** if the rules require user input (e.g., SCT flavor) +- **Detects your skill need** from natural language +- **Runs validators** after generating exercises +- **Auto-opens preview** after every exercise generation +- **Outputs clean markdown only** when user requests final version +- **Applies appropriate standards** for the task +- **Preserves existing structure** when editing +- **Outputs for copy-paste** β€” never writes exercises to slide files +- **Confirms completion** with validation results + +The assistant does **NOT** automatically: +- ❌ Write exercises to chapter files (requires explicit command like "add to chapter2.md") +- ❌ Modify source files without being asked +- ❌ Copy files to Downloads, Desktop, or other user folders +- ❌ Create folders outside the workspace + +You don't need to specify technical details - just describe what you want in plain language. + +## Quality Checklist + +Before delivering any work: + +**Content Quality:** +- βœ… Fresh, original examples +- βœ… Engaging, realistic contexts +- βœ… Educational value clear +- βœ… Technically accurate +- βœ… Appropriate difficulty + +**Structure & Format:** +- βœ… Valid YAML/JSON/Markdown +- βœ… Type-specific schema followed +- βœ… All required fields present +- βœ… Proper indentation +- βœ… Clean, readable formatting + +**Style & Language:** +- βœ… American English, Oxford comma +- βœ… Technical terms formatted correctly +- βœ… Code comments follow rules +- βœ… Consistent terminology +- βœ… Professional tone + +**Pedagogy:** +- βœ… Learning objectives met +- βœ… Feedback is educational +- βœ… Difficulty appropriate +- βœ… Clear instructions +- βœ… Logical progression + +## Pro Tips + +1. 
**Be conversational** - The assistant understands natural language +2. **Provide context** - More information = better results +3. **Iterate quickly** - Use EDIT skill for fast refinements +4. **Batch operations** - Process multiple items at once +5. **Review before publishing** - Always test in DataCamp Teach +6. **Use specific language** - "Make it beginner-friendly" vs "simplify" +7. **Reference examples** - Point to exercises you like + +## Common Mistakes to Avoid + +1. ❌ Reusing video examples verbatim +2. ❌ Breaking YAML structure when editing +3. ❌ Generic, non-educational feedback +4. ❌ Using special bullet characters +5. ❌ Over-complicating exercises +6. ❌ Missing learning objective alignment +7. ❌ Inconsistent difficulty progression +8. ❌ Inadequate SCT coverage +9. ❌ Unclear error messages From ea65dc8839f947c86d0cb954086f179a7819809e Mon Sep 17 00:00:00 2001 From: MartineHolland Date: Mon, 2 Feb 2026 16:04:22 +0000 Subject: [PATCH 3/3] Add Cursor authoring utilities (.cursor) --- .cursor/.env.example | 18 + .cursor/README.md | 609 ++++++ .cursor/preview/generate_blanks_preview.py | 998 ++++++++++ .cursor/preview/generate_mc_preview.py | 1223 ++++++++++++ .cursor/preview/generate_preview.py | 97 + .cursor/preview/generate_r_preview.py | 136 ++ .cursor/preview/generate_sql_preview.py | 137 ++ .cursor/preview/python_coding_preview.html | 594 ++++++ .cursor/preview/python_iterative_preview.html | 703 +++++++ .../preview/python_sequential_preview.html | 778 ++++++++ .cursor/preview/r_coding_preview.html | 586 ++++++ .cursor/preview/sql_coding_preview.html | 598 ++++++ .cursor/preview/sql_iterative_preview.html | 271 +++ .cursor/requirements.txt | 18 + .cursor/rules/coding-exercise.md | 743 +++++++ .cursor/rules/learning-objective-discovery.md | 278 +++ .cursor/rules/python-blanks-challenge.md | 436 +++++ .cursor/rules/r-coding-exercise.md | 348 ++++ .cursor/rules/single-mcq-exercise.md | 605 ++++++ .cursor/rules/sql-assessment.md | 367 ++++ 
.cursor/templates/shiny_app_template.R | 593 ++++++ .cursor/utilities/converters/convert_html.py | 84 + .cursor/utilities/converters/convert_pdf.py | 118 ++ .../utilities/converters/convert_webpage.py | 127 ++ .../utilities/converters/convert_youtube.py | 131 ++ .cursor/utilities/excalidraw/from_script.mjs | 362 ++++ .cursor/utilities/excalidraw/templates.mjs | 1737 +++++++++++++++++ .cursor/utilities/excalidraw/to_png.mjs | 267 +++ .cursor/utilities/setup.sh | 203 ++ .cursor/utilities/upload_assets.py | 405 ++++ .cursor/utilities/verify_setup.sh | 213 ++ .cursor/validators/mc_validator.py | 518 +++++ .cursor/validators/python_coding_validator.py | 472 +++++ .cursor/validators/r_coding_validator.py | 478 +++++ .cursor/validators/r_iterative_validator.py | 679 +++++++ .cursor/validators/requirements.txt | 3 + .cursor/validators/sql_coding_validator.py | 500 +++++ 37 files changed, 16433 insertions(+) create mode 100644 .cursor/.env.example create mode 100644 .cursor/README.md create mode 100644 .cursor/preview/generate_blanks_preview.py create mode 100644 .cursor/preview/generate_mc_preview.py create mode 100644 .cursor/preview/generate_preview.py create mode 100644 .cursor/preview/generate_r_preview.py create mode 100644 .cursor/preview/generate_sql_preview.py create mode 100644 .cursor/preview/python_coding_preview.html create mode 100644 .cursor/preview/python_iterative_preview.html create mode 100644 .cursor/preview/python_sequential_preview.html create mode 100644 .cursor/preview/r_coding_preview.html create mode 100644 .cursor/preview/sql_coding_preview.html create mode 100644 .cursor/preview/sql_iterative_preview.html create mode 100644 .cursor/requirements.txt create mode 100644 .cursor/rules/coding-exercise.md create mode 100644 .cursor/rules/learning-objective-discovery.md create mode 100644 .cursor/rules/python-blanks-challenge.md create mode 100644 .cursor/rules/r-coding-exercise.md create mode 100644 .cursor/rules/single-mcq-exercise.md create mode 
100644 .cursor/rules/sql-assessment.md create mode 100644 .cursor/templates/shiny_app_template.R create mode 100644 .cursor/utilities/converters/convert_html.py create mode 100644 .cursor/utilities/converters/convert_pdf.py create mode 100644 .cursor/utilities/converters/convert_webpage.py create mode 100644 .cursor/utilities/converters/convert_youtube.py create mode 100644 .cursor/utilities/excalidraw/from_script.mjs create mode 100644 .cursor/utilities/excalidraw/templates.mjs create mode 100644 .cursor/utilities/excalidraw/to_png.mjs create mode 100755 .cursor/utilities/setup.sh create mode 100644 .cursor/utilities/upload_assets.py create mode 100755 .cursor/utilities/verify_setup.sh create mode 100644 .cursor/validators/mc_validator.py create mode 100644 .cursor/validators/python_coding_validator.py create mode 100644 .cursor/validators/r_coding_validator.py create mode 100644 .cursor/validators/r_iterative_validator.py create mode 100644 .cursor/validators/requirements.txt create mode 100644 .cursor/validators/sql_coding_validator.py diff --git a/.cursor/.env.example b/.cursor/.env.example new file mode 100644 index 0000000..1726808 --- /dev/null +++ b/.cursor/.env.example @@ -0,0 +1,18 @@ +# DataCamp Curriculum Assistant - Environment Variables +# Copy this file to .env and fill in your values: +# cp .env.example .env + +# ============================================================================= +# CONTENT CONVERSION (PDF to Markdown) +# ============================================================================= +# Get your API key from https://datalab.to or 1Password +DATALAB_API_KEY=your_datalab_api_key_here + +# ============================================================================= +# ASSET UPLOAD (Images to DataCamp) +# ============================================================================= +# Your DataCamp DCT cookie value (from browser dev tools) +DATACAMP_DCT=your_dct_cookie_value_here + +# Course repository URL or ID 
+DATACAMP_REPO=https://github.com/datacamp-content/courses-your-course-name diff --git a/.cursor/README.md b/.cursor/README.md new file mode 100644 index 0000000..c4258f1 --- /dev/null +++ b/.cursor/README.md @@ -0,0 +1,609 @@ +# Content Creation System + +An AI-powered system for generating, validating, and previewing DataCamp course content using Cursor. + +--- + +## Table of Contents + +1. [Setup](#setup) +2. [System Architecture](#system-architecture) +3. [How to Use](#how-to-use) +4. [Best Practices & Context Engineering](#best-practices--context-engineering) +5. [Common Pitfalls](#common-pitfalls) +6. [How to Improve This System](#how-to-improve-this-system) + +--- + +## Setup + +### External Tools & Why We Use Them + +This system relies on several external tools to provide best-in-class content conversion and diagram generation: + +| Tool | Purpose | Why We Use It | +|------|---------|---------------| +| **Datalab API** | PDF to Markdown conversion | Best-in-class document parsing with layout preservation | +| **Docling** | HTML to Markdown conversion | Local conversion without API dependency | +| **Trafilatura** | Web page content extraction | Strips ads/navigation, extracts main content cleanly | +| **YouTube Transcript API** | Video transcript extraction | Reliable access to YouTube captions | +| **Puppeteer** | Excalidraw PNG rendering | Headless browser for diagram generation | +| **Sharp** | Image optimization | Fast image processing for Node.js | + +--- + +### First-Time Setup + +Follow these steps exactly if you're setting up the content creation tools for the first time. + +#### Step 1: Get the Files + +**For new courses:** The `.cursor` folder is automatically included when you create a course from a DataCamp template. + +**For existing courses:** Copy the `.cursor` folder, `.cursorrules`, and `scripts/` from the [content_authoring_cursor](https://github.com/datacamp/content_authoring_cursor) repository. 
+ +#### Step 2: Run the Setup Script + +Open your terminal in the course repo and run: + +```bash +chmod +x .cursor/utilities/setup.sh +.cursor/utilities/setup.sh +``` + +**What this does:** +- Creates a Python virtual environment at `.venv/` +- Installs Python packages (content converters, validators) +- Installs Node.js packages (Puppeteer, Sharp for diagram generation) + +#### Step 3: Configure API Keys + +1. Create a `.env` file inside the `.cursor` directory. You can either: + - Copy the example file and edit it: + ```bash + cp .cursor/.env.example .cursor/.env + ``` + - Or, create a new `.env` file directly inside `.cursor` and edit it. + +2. Open `.cursor/.env` and fill in your keys: + ``` + # Required for PDF conversion (find in 1Password under "Datalab") + DATALAB_API_KEY=your_datalab_api_key + + # Required for asset upload. + DATACAMP_DCT=your_dct_cookie_value + + # How to find your DATACAMP_DCT value: + # 1. In your Chrome browser, open https://www.datacamp.com or your DataCamp repo page and log in. + # 2. Right-click on the page, select "Inspect" to open DevTools. + # 3. Go to the "Application" tab, then expand "Cookies" in the left sidebar and select https://www.datacamp.com. (Press on >> next to Network to find Applications) + # 4. Look for the cookie named "_dct". + # 5. Copy the full value of the "_dct" cookie (double-click the value to select). + # 6. Paste it, replacing "your_dct_cookie_value". + + # Your course repository URL + DATACAMP_REPO=https://github.com/datacamp-content/courses-your-course-name + ``` + +#### Step 4: Verify Installation + +Run the verification script to test everything works: + +```bash +chmod +x .cursor/utilities/verify_setup.sh +.cursor/utilities/verify_setup.sh +``` + +All checks should pass. If any fail, see the [Troubleshooting](#troubleshooting) section. + +--- + +### Returning User Setup + +If you've already set up once, here's your quick start: + +```bash +# 1. 
Activate the virtual environment +source .venv/bin/activate + +# 2. Update to latest rules (recommended) +./scripts/update_cursor_rules.sh + +# 3. Ready to use! +``` + +--- + +### Troubleshooting + +| Issue | Solution | +|-------|----------| +| `ModuleNotFoundError` | Run `source .venv/bin/activate` then `pip install -r .cursor/requirements.txt` | +| PDF conversion fails | Ensure `brotli>=1.2.0` is installed: `pip install "brotli>=1.2.0"` (keep the quotes, otherwise the shell treats `>` as an output redirection) | +| API key not found | Check that `.cursor/.env` exists and contains `DATALAB_API_KEY=...` | +| Asset upload fails | Verify `DATACAMP_DCT` and `DATACAMP_REPO` are set in `.cursor/.env` | +| Excalidraw fails | Run `npm install` in the project root | + +Generally, Cursor is great at debugging installation. So use it to your advantage. + +--- + +## System Architecture + +Cursor AI is the orchestrator that takes inputs, uses tools for quality assurance, and produces content: + +```mermaid +flowchart LR + subgraph leftUtils [Utilities] + Converters[Converters] + end + + subgraph inputs [Inputs] + Rules[Rules/Skills] + Context[Context Files] + end + + Converters --> Context + inputs --> CursorAI[Cursor AI] + + CursorAI --> quality + + subgraph quality [Quality Assurance] + Validators[Validators] + Previewers[Previewers] + end + + quality --> Content[Content] + + subgraph rightUtils [Utilities] + Excalidraw[Excalidraw] + Upload[Asset Upload] + end + + rightUtils <--> Content +``` + +### Rules (`rules/`) + +Markdown files containing AI prompts and guidelines for generating different content types.
+ +| Category | Files | Description | +|----------|-------|-------------| +| **Coding Exercises** | `python-coding-exercise.md`, `r-coding-exercise.md`, `sql-coding-exercise.md` | Single-step coding exercises | +| **Iterative Exercises** | `python-iterative-exercise.md`, `r-iterative-exercise.md`, `sql-iterative-exercise.md` | Multi-step BulletExercise (independent steps) | +| **Sequential Exercises** | `python-sequential-exercise.md`, `r-sequential-exercise.md`, `sql-sequential-exercise.md` | Multi-step TabExercise (code accumulates) | +| **Cloud Exercises** | `copilot-exercise.md`, `aws-exercise.md`, `azure-exercise.md`, `databricks-exercise.md` | Virtual machine exercises | +| **Desktop Exercises** | `tableau-exercise.md`, `powerbi-exercise.md` | Desktop application exercises | +| **AI/Prompting** | `chat-v2-exercise.md` | Gemini/ChatGPT prompting exercises | +| **Multiple Choice** | `single-mcq-exercise.md`, `multiple-mcq-exercise.md` | MCQ exercises | +| **Drag & Drop** | `drag-drop-classify-exercise.md`, `drag-drop-order-exercise.md` | Interactive exercises | +| **Explorable** | `explorable-exercise.md`, `react-explorable-exercise.md` | Shiny/React app exercises | +| **Video Scripts** | `generate-video-exercise.md` | Video slides and narration | +| **Course Outline** | `generate-course-outline.md` | Course specification | +| **SCT Generation** | `generate-sct-python.md`, `generate-sct-ai-vision.md` | Submission correctness tests | + +### Validators (`validators/`) + +Python scripts that check generated content for structural correctness and content quality. 
+ +| Validator | Purpose | +|-----------|---------| +| `python_coding_validator.py` | Python coding exercises | +| `python_iterative_validator.py` | Python iterative/bullet exercises | +| `python_sequential_validator.py` | Python sequential/tab exercises | +| `r_coding_validator.py`, `r_iterative_validator.py`, `r_sequential_validator.py` | R exercises | +| `sql_coding_validator.py`, `sql_iterative_validator.py`, `sql_sequential_validator.py` | SQL exercises | +| `copilot_validator.py`, `aws_validator.py`, `azure_validator.py`, `databricks_validator.py` | Cloud exercises | +| `tableau_validator.py`, `powerbi_validator.py` | Desktop app exercises | +| `chat_v2_validator.py` | AI prompting exercises | +| `explorable_validator.py`, `react_explorable_validator.py` | Interactive exercises | +| `video_script_validator.py` | Video scripts | + +### Previewers (`preview/`) + +HTML generators that show how content will appear on DataCamp. + +| Generator | Output | +|-----------|--------| +| `generate_python_preview.py` | Python coding exercises | +| `generate_python_iterative_preview.py` | Python iterative exercises | +| `generate_python_sequential_preview.py` | Python sequential exercises | +| `generate_copilot_preview.py` | Copilot exercises | +| `generate_aws_preview.py`, `generate_azure_preview.py` | Cloud exercises | +| `generate_tableau_preview.py`, `generate_powerbi_preview.py` | Desktop exercises | +| `generate_chat_v2_preview.py` | AI prompting exercises | +| `generate_slides_preview.py` | Video slides with script panel | + +### Utilities (`utilities/`) + +| Utility | Purpose | +|---------|---------| +| `converters/convert_pdf.py` | PDF to Markdown (Datalab API) | +| `converters/convert_html.py` | HTML to Markdown (Docling) | +| `converters/convert_youtube.py` | YouTube transcript to Markdown | +| `converters/convert_webpage.py` | Web page to Markdown (Trafilatura) | +| `excalidraw/from_script.mjs` | Generate diagrams from markdown placeholders | +| 
`upload_assets.py` | Upload images to DataCamp CDN | +| `setup.sh` | Initial setup script | +| `verify_setup.sh` | Verify installation | + +--- + +## How to Use + +### Philosophy + +Content creation is an **iterative process**. The system is designed around this cycle: + +``` +Generate → Validate → Preview → Iterate → ... → Finalize → SCT +``` + +1. **Generate** — Create initial content with AI using good context +2. **Validate** — Run validators to catch structural errors +3. **Preview** — Visually inspect how it will appear on DataCamp +4. **Iterate** — Refine based on feedback +5. **Finalize** — Lock in the final content +6. **SCT** — Generate submission correctness tests + +**Key principle:** Work lesson by lesson, exercise by exercise. Don't try to generate an entire chapter at once. + +--- + +### Outlining + +Before generating exercises, you need context. The converters turn external content into Markdown that the AI can use. + +#### Converting Content to Markdown + +Converting external content (like PDFs, web pages, or video transcripts) into Markdown is essential for providing accurate, high-quality source material when generating new course content. Cursor handles this conversion automatically for you whenever you need to work from these sources. + +**What you need to do:** +Simply add your context files (converted markdown) to the `context/` folder and ensure the `context/context_creator.md` file is filled out with relevant course context. This makes sure Cursor has the best information to generate and outline your course. + +> **Note:** You do **not** need to run the conversion scripts manually. Cursor will prompt for or perform any necessary content conversion when you request content generation from a PDF, webpage, video, or HTML file. +> +> The commands below are shown for reference only. + +
+Show reference commands + +```bash +# Activate virtual environment first +source .venv/bin/activate + +# Convert a PDF (requires DATALAB_API_KEY) +python .cursor/utilities/converters/convert_pdf.py document.pdf -o context/document.md + +# Convert a YouTube video transcript +python .cursor/utilities/converters/convert_youtube.py "https://youtube.com/watch?v=VIDEO_ID" -o context/video.md + +# Convert a web page +python .cursor/utilities/converters/convert_webpage.py "https://example.com/article" -o context/article.md + +# Convert local HTML +python .cursor/utilities/converters/convert_html.py page.html -o context/page.md +``` +
+ + +#### Using Context for Outlining + +Once you have markdown context files, reference them when generating content: + +``` +Create a course outline based on @context/document.md following @generate-course-outline.md +``` + +The AI uses your markdown files as source material to create accurate, grounded content. + +--- + +### Video Generation + +Video scripts follow a 7-step workflow. Here's a complete example: + +#### Step 1: Request the Video + +``` +Generate a video on @slides/chapter_1.md about Introduction to Machine Learning +``` + +The assistant will ask: +- **Visual mode**: Full visuals (with diagrams) or no visuals (scripts only)? +- **Learning objectives**: What should learners be able to do? +- **Video flow**: What's the sequence of topics? + +#### Step 2: Review the Draft + +The assistant creates a draft at `.cursor/tmp_items/video_script_draft.md`. Review and request changes. + +#### Step 3: Convert to DataCamp Format + +Say "continue" or "looks good" to convert the draft to proper slide markdown. + +#### Step 4: Validate + +This command runs the video script validator on your script file for chapter 1, checking for formatting and structural issues: + +```bash +python .cursor/validators/video_script_validator.py slides/chapter_1.md +``` + +#### Step 5: Generate Diagrams (Full Visuals Mode Only) + +Use Cursor to open and edit the necessary files when updating or fixing skills, rules, or related modules. For diagram generation, run: + +```bash +node .cursor/utilities/excalidraw/from_script.mjs slides/chapter_1.md --chapter 1 --lesson 1 --update +``` + +If you need to update skills, rules, or workflow steps, use Cursor to edit the corresponding files such as those in `.cursor/rules/`, `.cursor/validators/`, or this `README.md`. Always ensure the correct cursorrules and relevant scripts are updated through Cursor for consistent workflow and validation integration. 
+ +#### Step 6: Preview + +```bash +python .cursor/preview/generate_slides_preview.py slides/chapter_1.md +open .cursor/tmp_items/slides_preview.html +``` + +#### Step 7: Upload Assets (Full Visuals Mode Only) + +```bash +source .venv/bin/activate +python .cursor/utilities/upload_assets.py slides/chapter_1.md --update +``` + +**Result:** Local image paths become public DataCamp URLs. + +--- + +### Exercise Generation + +Here's a complete workflow for generating a Python coding exercise: + +#### Step 1: Generate + +``` +Create a Python coding exercise about list comprehensions based on @slides/chapter_2.md +``` + +The assistant generates the exercise and saves it to `.cursor/tmp_items/exercise_to_validate.md`. + +#### Step 2: Validate + +```bash +python .cursor/validators/python_coding_validator.py .cursor/tmp_items/exercise_to_validate.md +``` + +#### Step 3: Preview + +```bash +python .cursor/preview/generate_python_preview.py .cursor/tmp_items/exercise_to_validate.md +``` + +The preview opens automatically in your browser. + +#### Step 4: Iterate + +Request changes: +``` +Make the context more engaging and add a hint about the syntax +``` + +Re-validate and re-preview after each change. + +#### Step 5: Finalize + +``` +Give me the final markdown +``` + +The assistant outputs clean markdown ready to copy into your chapter file. + +#### Step 6: Generate SCT + +``` +Generate SCT for this exercise +``` + +The assistant adds submission correctness tests using `pythonwhat`. + +--- + +### SCTs (Submission Correctness Tests) + +SCTs validate learner submissions and provide feedback. Generate them **after** finalizing exercise content. 
+**For Python/R/SQL exercises:** +``` +Generate SCT using @generate-sct-python.md +``` + +**For cloud/VM exercises (Copilot, AWS, etc.):** +``` +Generate SCT using @generate-sct-ai-vision.md +``` + +The assistant will ask which SCT flavor you need: +- **Input only** — Evaluate learner's prompts/actions +- **Output only** — Evaluate the tool's generated results +- **Input and output** — Check both + +--- + +## Best Practices & Context Engineering + +The quality of generated content depends on the context you provide. **Better context = better exercises.** + +### Bad Example + +``` +Create a copilot exercise +``` + +**Why it's bad:** +- No reference to source material +- No learning objectives +- No exercise flow +- AI has to guess everything + +### Good Example + +``` +Create a copilot exercise based on @slides/chapter_1.md with the following context: +``` + +| Field | Value | +|-------|-------| +| **Exercise title** | Build a Deck | +| **Learning objectives** | 1. Navigate Microsoft Copilot
2. Upload a file
3. Generate a deck | +| **Exercise flow** | 1. Open PowerPoint → 2. Click upload → 3. Select file → 4. Prompt "Create presentation" → 5. Review | +| **Syntax introduced** | "Create a presentation from this file", "Add more slides about [topic]" | +| **Metaphors** | Copilot is your presentation assistant—you provide the outline, it creates the slides | +| **Datasets** | Sample document about quarterly sales | + +**Why it's good:** +- References source slide with `@slides/...` +- Clear learning objectives +- Exact exercise flow +- Specific prompts to teach +- Helpful metaphor +- Defined sample data + +### Key Principles + +1. **Reference source material** — Use `@` to link to slides, scripts, or docs +2. **Define learning objectives** — What should learners be able to do? +3. **Provide exercise flow** — Step-by-step sequence +4. **List syntax/commands** — Specific code or prompts being taught +5. **Include metaphors** — Analogies that explain concepts +6. **Specify datasets** — Files or data needed +7. **Add constraints** — Word limits, difficulty level, emphasis points + +--- + +## Common Pitfalls + +### Keys Hallucination + +**Problem:** The AI generates exercise keys like `key: abc123def` instead of leaving them empty. + +**Why it matters:** Keys are assigned by the Teach platform when you save. Pre-filled keys cause conflicts. + +**Solution:** Keys must always be empty: `key:` + +**Detection:** Validators catch this automatically. If you see a key error, just delete the generated value. + +**Example:** +```yaml +# Wrong +key: 8b5f742d11 + +# Correct +key: +``` + +### Breaking Down Lesson by Lesson + +**Problem:** Trying to generate an entire chapter of exercises at once. + +**Why it matters:** +- AI loses context with too much content +- Harder to iterate and refine +- More likely to have inconsistencies +- Errors compound + +**Solution:** Work on one lesson at a time, one exercise at a time. + +**Good workflow:** +``` +1.
Generate exercise 1 for lesson 2.1 +2. Validate → Preview → Iterate +3. Finalize exercise 1 +4. Generate exercise 2 for lesson 2.1 +5. ...repeat... +``` + +**Bad workflow:** +``` +Generate all 12 exercises for chapter 2 +``` + +### Forgetting to Activate Virtual Environment + +**Problem:** Running Python scripts without activating `.venv/` first. + +**Symptom:** `ModuleNotFoundError: No module named 'trafilatura'` + +**Solution:** Always run `source .venv/bin/activate` before using converters or validators. + +### Stale Rules + +**Problem:** Using outdated rules that don't reflect latest best practices. + +**Solution:** Regularly update rules: +```bash +./scripts/update_cursor_rules.sh +``` + +--- + +## How to Improve This System + +This system gets better through use. Here's how to contribute: + +### 1. Use It + +The more you use the system, the more edge cases you discover. Pay attention to: +- When the AI generates incorrect structure +- When validators miss errors +- When previews don't match Teach +- When workflows feel clunky + +### 2. Update Relevant Skills + +When you spot an issue, use the cursor assistant and its tools to help you update the relevant files: + +| Issue Type | How to Update | +|------------|--------------| +| AI generates wrong format | Update `rules/{exercise-type}.md` using cursor | +| Validator misses an error | Edit `validators/{type}_validator.py` with cursor | +| Preview looks wrong | Revise `preview/generate_{type}_preview.py` using cursor | +| Workflow unclear | Improve this `README.md` via cursor | + +Utilizing cursor ensures consistency and leverages built-in validation and preview workflows. + +### 3. Submit a Pull Request + +Push your improvements to the shared repository: + +```bash +# Clone the rules repo +git clone https://github.com/datacamp/content_authoring_cursor.git +cd content_authoring_cursor + +# Create a branch +git checkout -b fix/validator-missing-check + +# Make your changes +# ...edit files...
+ +# Commit with a detailed message +git add -A +git commit -m "Fix: Python validator now catches missing hint field + +- Added check for empty @hint section +- Added helpful error message explaining the fix +- Updated test cases" + +# Push and create PR +git push -u origin fix/validator-missing-check +``` + +**Good commit messages include:** +- What was broken +- What you fixed +- Why it matters + +Your improvements help everyone creating DataCamp content! diff --git a/.cursor/preview/generate_blanks_preview.py b/.cursor/preview/generate_blanks_preview.py new file mode 100644 index 0000000..902ade8 --- /dev/null +++ b/.cursor/preview/generate_blanks_preview.py @@ -0,0 +1,998 @@ +#!/usr/bin/env python3 +""" +BlanksChallenge Preview Generator + +Generates rich HTML previews for BlanksChallenge items with course content references. + +Features: +- Parses BlanksChallenge markdown items +- Extracts relevant video script excerpts +- Extracts relevant exercise code snippets +- Generates styled HTML preview + +Usage: + python generate_blanks_preview.py [--scripts ] [--exercises ] + python generate_blanks_preview.py .cursor/tmp_items/items.md --scripts ~/Downloads/scripts --exercises ~/Downloads + +Example: + python generate_blanks_preview.py .cursor/tmp_items/evaluation_metrics_items.md \ + --scripts /Users/martine.holland/Downloads/scripts \ + --exercises /Users/martine.holland/Downloads +""" + +import sys +import re +import argparse +from pathlib import Path +from typing import List, Dict, Optional, Tuple +from dataclasses import dataclass, field +import html + + +# ============================================================================ +# DATA MODELS +# ============================================================================ + +@dataclass +class BlanksItem: + """A single BlanksChallenge item.""" + title: str + unit: str + subskill: str + context: str + code1: str + pre_challenge_code: str + variables: Dict[str, str] + item_number: int + blank_count: int = 0 + 
@dataclass
class BlanksItem:
    """A single BlanksChallenge item parsed from an items markdown file."""
    title: str
    unit: str
    subskill: str
    context: str
    code1: str
    pre_challenge_code: str
    variables: Dict[str, str]  # blank name (e.g. "expr1") -> first listed answer
    item_number: int
    blank_count: int = 0  # number of {{_exprN}} placeholders found in code1
    course_section: str = ""  # Optional: explicit course section reference
    teaching_point: str = ""  # Optional: key concept being tested


@dataclass
class CourseReference:
    """Reference to course content."""
    source: str  # e.g., "Video 3.2" or "chapter3.md"
    excerpt: str  # The relevant text
    code_snippet: Optional[str] = None


@dataclass
class EnrichedItem:
    """Item with course references."""
    item: BlanksItem
    video_refs: List[CourseReference] = field(default_factory=list)
    exercise_refs: List[CourseReference] = field(default_factory=list)


# ============================================================================
# PARSERS
# ============================================================================

# Fence language tags accepted for item code blocks: python/{python}, r/{r}, sql/{sql}.
_CODE_FENCE_LANGS = r'(?:python|\{python\}|r|\{r\}|sql|\{sql\})'


def _yaml_scalar(name: str, content: str) -> str:
    """Return the single-line value of ``name:`` in *content* ("" if absent or empty).

    The match is anchored to one line so that an empty field never swallows the
    following line and a value never runs past its own line.
    """
    match = re.search(rf'^[ \t]*{re.escape(name)}:[ \t]*(.*)$', content, re.MULTILINE)
    return match.group(1).strip() if match else ""


def _fenced_section(tag: str, content: str) -> str:
    """Return the code inside the fenced block that follows the `@tag` marker, or ""."""
    match = re.search(
        rf'`@{tag}`\s*\n```{_CODE_FENCE_LANGS}\s*\n(.*?)```', content, re.DOTALL
    )
    return match.group(1).strip() if match else ""


def parse_items_file(content: str) -> Tuple[str, List[BlanksItem]]:
    """Parse BlanksChallenge items from markdown.

    Args:
        content: Full text of an items file: optional ``---`` front matter
            followed by item blocks separated by ``---`` lines.

    Returns:
        ``(document_title, items)``. The title is ``"Untitled"`` when no
        ``title:`` line is present. Items are numbered consecutively from 1;
        empty blocks are skipped and do not consume a number.
    """
    title_match = re.search(r'^[ \t]*title:[ \t]*(\S.*)$', content, re.MULTILINE)
    doc_title = title_match.group(1).strip() if title_match else "Untitled"

    # A BOM or leading blank line must not prevent the front matter from being
    # recognised; otherwise the front matter itself is parsed as an item.
    body = content.lstrip('\ufeff').lstrip()
    body = re.sub(r'^---\s*\n.*?\n---\s*\n', '', body, count=1, flags=re.DOTALL)

    items: List[BlanksItem] = []
    for block in re.split(r'\n---\s*\n', body):
        if not block.strip():
            continue
        item = parse_single_item(block, len(items) + 1)
        if item is not None:
            items.append(item)

    return doc_title, items


def parse_single_item(content: str, item_number: int) -> Optional[BlanksItem]:
    """Parse a single item block.

    Args:
        content: Markdown of one item (heading, YAML fields and `@context`,
            `@code1`, `@pre_challenge_code`, `@variables` sections).
        item_number: 1-based position of the item; also used for the fallback
            title ``"Item N"`` when the block has no ``##`` heading.

    Returns:
        The parsed item. Missing fields and sections become empty values.
    """
    # Prefer the bracketed form "## [Title]", fall back to a plain heading.
    title_match = (
        re.search(r'##\s+\[([^\]]+)\]', content)
        or re.search(r'##\s+(.+?)(?:\n|$)', content)
    )
    title = title_match.group(1).strip() if title_match else f"Item {item_number}"

    unit = _yaml_scalar('unit', content)
    subskill = _yaml_scalar('subskill', content)
    # Optional fields may be quoted; strip only the surrounding quotes so an
    # apostrophe inside the value ("Don't ...") is preserved.
    course_section = _yaml_scalar('course_section', content).strip("'\"").strip()
    teaching_point = _yaml_scalar('teaching_point', content).strip("'\"").strip()

    # The context runs until the next `@section` marker (or the end of the block).
    context_match = re.search(r'`@context`\s*\n(.*?)(?=`@|\Z)', content, re.DOTALL)
    context = context_match.group(1).strip() if context_match else ""

    # Both code sections accept the same fence tags (Python, R and SQL).
    code1 = _fenced_section('code1', content)
    pre_code = _fenced_section('pre_challenge_code', content)

    # Only the first list entry of each exprN is kept as the answer.
    variables: Dict[str, str] = {}
    variables_match = re.search(r'`@variables`\s*\n```yaml\s*\n(.*?)```', content, re.DOTALL)
    if variables_match:
        var_blocks = re.findall(r'(expr\d+):\s*\n\s*-\s*(.+)', variables_match.group(1))
        for var_name, var_value in var_blocks:
            variables[var_name] = var_value.strip().strip("'\"")

    blank_count = len(re.findall(r'\{\{_expr\d+\}\}', code1))

    return BlanksItem(
        title=title,
        unit=unit,
        subskill=subskill,
        context=context,
        code1=code1,
        pre_challenge_code=pre_code,
        variables=variables,
        item_number=item_number,
        blank_count=blank_count,
        course_section=course_section,
        teaching_point=teaching_point,
    )
def parse_video_scripts(scripts_dir: Path) -> Dict[str, str]:
    """Parse video scripts from directory.

    Every ``*.txt`` file is split on ``# Video <n>`` headers. The chapter number
    is taken from ``chapter_<n>`` in the file name ("?" when absent).

    Args:
        scripts_dir: Directory holding the script text files.

    Returns:
        Mapping ``"Video <chapter>.<video>"`` -> stripped script text. Empty
        when *scripts_dir* is not an existing directory.
    """
    scripts: Dict[str, str] = {}

    if not scripts_dir.is_dir():
        return scripts

    # Sorted so that, when two files produce the same key, the winner does not
    # depend on filesystem listing order.
    for script_file in sorted(scripts_dir.glob("*.txt")):
        # Explicit UTF-8: the platform default encoding breaks on non-ASCII
        # scripts; undecodable bytes are replaced rather than aborting the run.
        content = script_file.read_text(encoding="utf-8", errors="replace")

        chapter_match = re.search(r'chapter_(\d+)', script_file.name)
        chapter_num = chapter_match.group(1) if chapter_match else "?"

        # re.split with one capture group yields [preamble, num, text, num, text, ...].
        parts = re.split(r'# Video (\d+)', content)
        for video_num, video_content in zip(parts[1::2], parts[2::2]):
            scripts[f"Video {chapter_num}.{video_num}"] = video_content.strip()

    return scripts


# A level-2 heading opens an exercise; "###" and deeper headings do not.
_EXERCISE_HEADING_RE = re.compile(r'^##[ \t]+(.+)$', re.MULTILINE)
# Solution code fence - supports Python, R, and SQL.
_SOLUTION_RE = re.compile(
    r'`@solution`\s*\n```(?:python|\{python\}|r|\{r\}|sql|\{sql\})\s*\n(.*?)```',
    re.DOTALL,
)


def parse_exercises(exercises_dir: Path) -> Dict[str, List[Dict]]:
    """Parse exercise files from directory.

    Args:
        exercises_dir: Directory holding ``chapter*.md`` files.

    Returns:
        Mapping of file stem -> list of ``{"title", "code"}`` dicts, one per
        ``##`` section that contains an `@solution` block (the first solution of
        the section is used). Sections without a solution are skipped, so a
        heading is never paired with the solution of a later exercise.
    """
    exercises: Dict[str, List[Dict]] = {}

    if not exercises_dir.is_dir():
        return exercises

    for ex_file in sorted(exercises_dir.glob("chapter*.md")):
        content = ex_file.read_text(encoding="utf-8", errors="replace")

        # split() yields [preamble, title, body, title, body, ...].
        sections = _EXERCISE_HEADING_RE.split(content)
        found = []
        for title, body in zip(sections[1::2], sections[2::2]):
            solution = _SOLUTION_RE.search(body)
            if solution:
                found.append({"title": title.strip(), "code": solution.group(1).strip()})

        exercises[ex_file.stem] = found

    return exercises


# ============================================================================
# MATCHING ENGINE
# ============================================================================
# Vocabulary used to match items against course content. Built once at import
# time instead of on every call.
_COMMON_TERMS = frozenset({
    # Python/ML evaluation terms
    'accuracy', 'precision', 'recall', 'f1', 'bleu', 'rouge', 'meteor',
    'perplexity', 'toxicity', 'evaluate', 'metric', 'compute', 'load',
    'predictions', 'references', 'score', 'classification', 'summarization',
    'translation', 'generation', 'pipeline', 'model', 'tokenizer',
    # scikit-learn / supervised learning terms
    'fit', 'predict', 'train', 'test', 'split', 'training', 'testing',
    'knn', 'neighbors', 'classifier', 'regression', 'linear', 'logistic',
    'ridge', 'lasso', 'alpha', 'regularization', 'regularized',
    'cross_validation', 'cross_val', 'kfold', 'gridsearch', 'gridsearchcv',
    'hyperparameter', 'tuning', 'overfitting', 'underfitting',
    'confusion', 'matrix', 'roc', 'auc', 'rmse', 'r_squared', 'r2',
    'dummy', 'dummies', 'categorical', 'encoding', 'impute', 'imputer',
    'scale', 'scaler', 'standardscaler', 'preprocessing', 'preprocess',
    'features', 'target', 'labels', 'supervised', 'unsupervised',
    # MLflow terms
    'mlflow', 'experiment', 'experiments', 'tracking', 'run', 'runs',
    'artifact', 'artifacts', 'autolog', 'log_metric', 'log_param',
    'log_artifact', 'log_model', 'start_run', 'end_run', 'search_runs',
    'register_model', 'registry', 'registered', 'version', 'stage',
    'staging', 'production', 'archived', 'transition', 'flavor', 'flavors',
    'sklearn', 'pyfunc', 'projects', 'mlproject', 'entry_point', 'workflow',
    'serve', 'deploy', 'deployment', 'invocations', 'client',
    # R/tidyverse terms
    'filter', 'select', 'mutate', 'summarize', 'summarise', 'group_by',
    'arrange', 'dplyr', 'ggplot', 'tidyr', 'tibble', 'pipe', 'dataframe',
    'geom_point', 'geom_bar', 'geom_line', 'aes', 'facet',
    # SQL terms ('select' is already listed above)
    'from', 'where', 'join', 'inner', 'left', 'right', 'outer',
    'group', 'order', 'having', 'count', 'sum', 'avg', 'max', 'min',
    'aggregate', 'subquery', 'table', 'column', 'query',
})

# Identifier-like tokens. Digits are allowed after the first character so that
# vocabulary entries such as 'f1' and 'r2' can actually be matched.
_KEYWORD_TOKEN_RE = re.compile(r'\b[a-z_][a-z0-9_]*\b')


def extract_keywords(text: str) -> set:
    """Extract meaningful keywords from text.

    Args:
        text: Any text (item title/context/code or a video script).

    Returns:
        The set of lower-cased tokens of *text* that belong to the shared
        Python / R / SQL vocabulary; empty when nothing matches.
    """
    words = set(_KEYWORD_TOKEN_RE.findall(text.lower()))
    return words & _COMMON_TERMS
# Call patterns that earn a large bonus when both the item code and the video
# script contain them.
_MLFLOW_PATTERNS = (
    'mlflow.', 'start_run', 'log_metric', 'log_param', 'autolog',
    'register_model', 'search_runs', 'projects.run',
)


def find_relevant_video_content(item: BlanksItem, scripts: Dict[str, str], max_refs: int = 2) -> List[CourseReference]:
    """Find relevant video script excerpts for an item.

    Uses multiple strategies:
    1. Explicit teaching_point field (highest priority)
    2. Explicit course_section field
    3. Keyword and code pattern matching (fallback)

    Args:
        item: The item to find references for.
        scripts: Mapping of video key (e.g. "Video 1.2") to script text.
        max_refs: Number of references after which the search stops.

    Returns:
        References in priority order. A video selected through
        ``course_section`` is not listed a second time by the keyword fallback.
    """
    refs: List[CourseReference] = []

    # Strategy 1: If teaching_point is specified, use that directly
    if item.teaching_point:
        # NOTE(review): the leading glyph of this label looks mis-encoded (an
        # emoji decoded with a legacy code page). It is left untouched because
        # the intended character cannot be determined from this file.
        source = "πŸ“ Teaching Point"
        if item.course_section:
            source += f" ({item.course_section})"
        refs.append(CourseReference(source=source, excerpt=item.teaching_point))
        if len(refs) >= max_refs:
            return refs

    item_keywords = extract_keywords(f"{item.title} {item.context} {item.code1}")
    used_sources = set()

    # Strategy 2: If course_section is specified, find that exact section
    if item.course_section:
        section = item.course_section.lower()
        for video_key, video_content in scripts.items():
            if section not in video_key.lower():
                continue
            excerpt = extract_relevant_excerpt(video_content, item_keywords)
            if excerpt:
                refs.append(CourseReference(source=video_key, excerpt=excerpt))
                used_sources.add(video_key)
                if len(refs) >= max_refs:
                    return refs

    # Strategy 3: Keyword and code pattern matching
    item_code = item.code1.lower()
    # Also check for specific function/method names
    code_terms = set(re.findall(r'[a-z_]+', item_code))
    # Very short terms match almost any text, so only longer ones are counted.
    long_terms = [term for term in code_terms if len(term) > 3]
    item_mlflow_patterns = [p for p in _MLFLOW_PATTERNS if p in item_code]
    excerpt_keywords = item_keywords | code_terms

    scored_refs = []
    for video_key, video_content in scripts.items():
        if video_key in used_sources:
            continue  # already referenced via course_section

        # Score based on keyword overlap
        overlap = len(item_keywords & extract_keywords(video_content))

        # Bonus for code term matches
        video_lower = video_content.lower()
        code_matches = sum(1 for term in long_terms if term in video_lower)

        # Bonus for MLflow-specific patterns
        code_matches += 5 * sum(1 for p in item_mlflow_patterns if p in video_lower)

        score = overlap + code_matches
        if score > 0:
            # Extract relevant paragraph
            excerpt = extract_relevant_excerpt(video_content, excerpt_keywords)
            if excerpt:
                scored_refs.append((score, video_key, excerpt))

    # Sort by score and take top refs (stable: ties keep script order)
    scored_refs.sort(key=lambda entry: entry[0], reverse=True)

    # Clamp at zero: a negative slice bound would select from the end instead.
    remaining_slots = max(0, max_refs - len(refs))
    for _score, video_key, excerpt in scored_refs[:remaining_slots]:
        refs.append(CourseReference(source=video_key, excerpt=excerpt))

    return refs
# dplyr verbs that get a dedicated weight in the generic pattern scoring.
_DPLYR_PATTERN_VERBS = ('filter', 'select', 'mutate', 'summarize', 'group_by')

# (needle looked up in the item code, needle looked up in the exercise code, bonus).
# Both codes are lower-cased before the lookup.
_PAIRED_BONUSES = (
    # Python: the exact library (evaluate) and compute/load patterns
    ('evaluate', 'evaluate', 15),
    ('.compute(', '.compute(', 10),
    ('.load(', 'load(', 5),
    # scikit-learn: fit/predict patterns
    ('.fit(', '.fit(', 12),
    ('.predict(', '.predict(', 12),
    ('.score(', '.score(', 10),
    ('train_test_split', 'train_test_split', 15),
    # scikit-learn: model types
    ('kneighbors', 'kneighbors', 12),
    ('linearregression', 'linearregression', 12),
    ('ridge', 'ridge', 12),
    ('lasso', 'lasso', 12),
    ('logisticregression', 'logisticregression', 12),
    ('gridsearchcv', 'gridsearchcv', 15),
    # scikit-learn: preprocessing
    ('get_dummies', 'get_dummies', 15),
    ('pipeline', 'pipeline', 12),
    ('simpleimputer', 'simpleimputer', 12),
    ('standardscaler', 'standardscaler', 12),
    # Metrics
    ('classification_report', 'classification_report', 15),
    ('confusion_matrix', 'confusion_matrix', 12),
    # R: pipe operators and ggplot
    ('%>%', '%>%', 8),
    ('|>', '|>', 8),
    ('ggplot', 'ggplot', 10),
    # SQL: JOIN matching
    ('join', 'join', 8),
    # MLflow: experiments and tracking
    ('mlflow', 'mlflow', 15),
    ('create_experiment', 'create_experiment', 20),
    ('set_experiment', 'set_experiment', 15),
    ('set_experiment_tag', 'set_experiment_tag', 20),
    ('start_run', 'start_run', 15),
    ('log_metric', 'log_metric', 20),
    ('log_param', 'log_param', 20),
    ('log_artifact', 'log_artifact', 15),
    ('search_runs', 'search_runs', 20),
    # MLflow: models and flavors
    ('autolog', 'autolog', 20),
    ('save_model', 'save_model', 15),
    ('load_model', 'load_model', 15),
    ('log_model', 'log_model', 20),
    ('last_active_run', 'last_active_run', 15),
    ('mlflow.sklearn', 'mlflow.sklearn', 20),
    ('mlflow.pyfunc', 'mlflow.pyfunc', 20),
    # MLflow: model registry
    ('register_model', 'register_model', 20),
    ('mlflowclient', 'mlflowclient', 15),
    ('create_registered_model', 'create_registered_model', 20),
    ('search_registered_models', 'search_registered_models', 15),
    ('transition_model_version_stage', 'transition_model_version_stage', 20),
    ('models:/', 'models:/', 15),
    # MLflow: projects
    ('mlflow.projects', 'mlflow.projects', 20),
    ('projects.run', 'projects.run', 20),
    ('entry_point', 'entry_point', 15),
    ('mlproject', 'mlproject', 10),
)


def _collect_code_patterns(code: str) -> List[str]:
    """Return the call/keyword patterns found in *code* (duplicates are kept)."""
    patterns: List[str] = []
    code_lower = code.lower()

    # Python: function calls like evaluate.load, evaluate.compute
    for obj, method in re.findall(r'(\w+)\.(\w+)\s*\(', code):
        patterns.extend((f"{obj}.{method}", obj, method))

    # Standalone function calls (works for Python, R, SQL functions)
    patterns.extend(re.findall(r'\b(\w+)\s*\(', code))

    # R: pipe chain functions (data %>% filter() %>% select())
    patterns.extend(re.findall(r'%>%\s*(\w+)\s*\(', code))

    # R: ggplot geoms (geom_point, geom_bar, etc.)
    patterns.extend(re.findall(r'\b(geom_\w+)\b', code))

    # SQL: keywords and clauses
    sql_keywords = re.findall(
        r'\b(SELECT|FROM|WHERE|JOIN|GROUP BY|ORDER BY|HAVING|COUNT|SUM|AVG|MAX|MIN)\b',
        code, re.IGNORECASE,
    )
    patterns.extend(kw.upper() for kw in sql_keywords)

    # Variable names that look like metrics or common concepts
    patterns.extend(re.findall(
        r'\b(accuracy|precision|recall|f1|bleu|rouge|meteor|perplexity|toxicity|exact_match)\b',
        code_lower,
    ))

    # R: dplyr verbs
    patterns.extend(re.findall(
        r'\b(filter|select|mutate|summarize|summarise|group_by|arrange|slice|rename)\b',
        code_lower,
    ))
    return patterns


def _pattern_weight(pattern: str) -> int:
    """Return the score contributed by one matching code pattern."""
    if '.' in pattern:
        return 10  # compound patterns like "evaluate.load"
    if pattern.lower() in _DPLYR_PATTERN_VERBS:
        return 8  # dplyr verbs
    if len(pattern) > 4:
        return 3
    return 1


def find_relevant_exercises(item: BlanksItem, exercises: Dict[str, List[Dict]], max_refs: int = 2) -> List[CourseReference]:
    """Find relevant exercise code for an item.

    Prioritizes exercises that contain the actual blank answer in their code.

    Args:
        item: The item whose ``code1`` and ``variables`` drive the matching.
        exercises: Mapping of chapter name to ``{"title", "code"}`` dicts.
        max_refs: Maximum number of references returned.

    Returns:
        Up to *max_refs* references for the highest scoring exercises; only
        exercises with a positive score qualify. Code snippets are truncated
        to 400 characters.
    """
    refs: List[CourseReference] = []

    # The actual blank answers - these should appear in matched exercises.
    # Empty answers are ignored: '' is a substring of every string and would
    # otherwise hand the +100 bonus to every exercise.
    blank_answers = [
        answer
        for answer in (value.strip().lower() for value in item.variables.values())
        if answer
    ]

    item_code = item.code1.lower()

    # Weights depend only on the item, so resolve them once. Duplicates are
    # kept on purpose: a pattern found by several extractors counts each time.
    weighted_patterns = [
        (pattern.lower(), _pattern_weight(pattern))
        for pattern in _collect_code_patterns(item.code1)
    ]
    # Only bonuses whose needle occurs in the item code can ever apply.
    item_bonuses = [
        (exercise_needle, bonus)
        for item_needle, exercise_needle, bonus in _PAIRED_BONUSES
        if item_needle in item_code
    ]

    scored_refs = []
    for chapter, ex_list in exercises.items():
        for ex in ex_list:
            ex_code = ex['code'].lower()

            # CRITICAL: massive bonus per blank answer contained in the exercise
            hits = sum(1 for answer in blank_answers if answer in ex_code)
            # Without any answer the exercise is less useful but may still
            # provide context, so it is penalized rather than dropped outright.
            score = 100 * hits if hits else -50

            score += sum(weight for pattern, weight in weighted_patterns if pattern in ex_code)
            score += sum(bonus for needle, bonus in item_bonuses if needle in ex_code)

            if score > 0:  # Only include exercises with positive score (ideally with the answer)
                scored_refs.append((score, chapter, ex))

    # Stable sort: ties keep chapter/exercise order.
    scored_refs.sort(key=lambda entry: entry[0], reverse=True)

    for _score, chapter, ex in scored_refs[:max_refs]:
        code = ex['code']
        refs.append(CourseReference(
            source=f"{chapter}: {ex['title']}",
            excerpt="",
            code_snippet=code[:400] + ("..." if len(code) > 400 else ""),
        ))

    return refs
in kw} + + for i, line in enumerate(lines): + line_lower = line.lower() + line_keywords = extract_keywords(line) + + # Base score from keyword overlap + overlap = line_keywords & keywords + score = len(overlap) + + # BONUS: Extra points for specific function names found in the line text + for specific in specific_keywords: + if specific in line_lower: + score += 5 # Strong bonus for exact function match + + if score > best_score: + best_score = score + best_start = max(0, i - 1) + + if best_score == 0: + return "" + + # Get surrounding context + excerpt_lines = lines[best_start:best_start + context_lines] + excerpt = ' '.join(line.strip() for line in excerpt_lines if line.strip()) + + # Truncate if too long + if len(excerpt) > 400: + excerpt = excerpt[:400] + "..." + + return excerpt + + +def enrich_items(items: List[BlanksItem], scripts: Dict[str, str], exercises: Dict[str, List[Dict]]) -> List[EnrichedItem]: + """Add course references to items.""" + enriched = [] + + for item in items: + video_refs = find_relevant_video_content(item, scripts) + exercise_refs = find_relevant_exercises(item, exercises) + + enriched.append(EnrichedItem( + item=item, + video_refs=video_refs, + exercise_refs=exercise_refs + )) + + return enriched + + +# ============================================================================ +# HTML GENERATOR +# ============================================================================ + +def generate_html(doc_title: str, enriched_items: List[EnrichedItem], subskill: str = "") -> str: + """Generate HTML preview.""" + + items_html = "" + for ei in enriched_items: + item = ei.item + + # Format code with blanks highlighted + code_html = html.escape(item.code1) + code_html = re.sub( + r'\{\{(_expr\d+)\}\}', + r'{{\1}}', + code_html + ) + + # Format variables + vars_html = "" + for var_name, var_value in item.variables.items(): + vars_html += f''' +
+
{var_name}
+
'{html.escape(var_value)}'
+
''' + + # Get the blank answers for highlighting + blank_answers = list(item.variables.values()) + + def highlight_answers(text: str, answers: list, is_code: bool = False) -> str: + """Highlight blank answers in text.""" + result = html.escape(text) + for answer in answers: + # Escape the answer for regex + escaped_answer = html.escape(answer) + # Use word boundaries for non-code, more flexible for code + if is_code: + # In code, highlight the function/method name + result = re.sub( + rf'(\b{re.escape(escaped_answer)}\b)', + r'\1', + result + ) + else: + # In text, also match with underscores converted to spaces + result = re.sub( + rf'(\b{re.escape(escaped_answer)}\b)', + r'\1', + result + ) + # Also try matching the readable form (e.g., "log metric" for "log_metric") + readable = answer.replace('_', ' ') + if readable != answer: + result = re.sub( + rf'(\b{re.escape(html.escape(readable))}\b)', + r'\1', + result, + flags=re.IGNORECASE + ) + return result + + # Format video references with highlighting + video_refs_html = "" + for ref in ei.video_refs: + highlighted_excerpt = highlight_answers(ref.excerpt, blank_answers, is_code=False) + video_refs_html += f''' +
+
πŸ“Ή {html.escape(ref.source)}
+
{highlighted_excerpt}
+
''' + + # Format exercise references with highlighting + exercise_refs_html = "" + for ref in ei.exercise_refs: + if ref.code_snippet: + highlighted_code = highlight_answers(ref.code_snippet, blank_answers, is_code=True) + code_snippet = f'
{highlighted_code}
' + else: + code_snippet = "" + exercise_refs_html += f''' +
+
πŸ“ {html.escape(ref.source)}
+ {code_snippet} +
''' + + items_html += f''' +
+
+ Item {item.item_number} +

{html.escape(item.title)}

+
+ +
type: BlanksChallenge +unit: {html.escape(item.unit)} +subskill: {html.escape(item.subskill)}
+ + +
{html.escape(item.context)}
+ + +
{code_html}
+ + +
{vars_html}
+ + +
+ {video_refs_html if video_refs_html else '
No video references found
'} + {exercise_refs_html if exercise_refs_html else '
No exercise references found
'} +
+
''' + + return f''' + + + + + BlanksChallenge Preview - {html.escape(doc_title)} + + + +
+

πŸ§ͺ BlanksChallenge Preview

+ +
+ Document: {html.escape(doc_title)}
+ Subskill: {html.escape(subskill or enriched_items[0].item.subskill if enriched_items else "")} +
+ πŸ“ {len(enriched_items)} items + πŸ”² {sum(ei.item.blank_count for ei in enriched_items)} total blanks +
+
+ + {items_html} +
+ +''' + + +# ============================================================================ +# CLI +# ============================================================================ + +def main(): + parser = argparse.ArgumentParser(description="Generate BlanksChallenge preview with course references") + parser.add_argument("items_file", type=Path, help="Path to items markdown file") + parser.add_argument("--scripts", type=Path, help="Directory containing video script files") + parser.add_argument("--exercises", type=Path, help="Directory containing exercise markdown files") + parser.add_argument("--output", "-o", type=Path, default=Path(".cursor/tmp_items/blanks_preview.html"), help="Output HTML file") + + args = parser.parse_args() + + if not args.items_file.exists(): + print(f"❌ Items file not found: {args.items_file}") + sys.exit(1) + + # Parse items + print(f"πŸ“„ Parsing items from {args.items_file}...") + content = args.items_file.read_text() + doc_title, items = parse_items_file(content) + print(f" Found {len(items)} items") + + # Parse course content + scripts = {} + exercises = {} + + if args.scripts: + print(f"πŸ“Ή Parsing video scripts from {args.scripts}...") + scripts = parse_video_scripts(args.scripts) + print(f" Found {len(scripts)} video sections") + + if args.exercises: + print(f"πŸ“ Parsing exercises from {args.exercises}...") + exercises = parse_exercises(args.exercises) + total_ex = sum(len(ex) for ex in exercises.values()) + print(f" Found {total_ex} exercises across {len(exercises)} chapters") + + # Enrich items with course references + print("πŸ”— Matching items to course content...") + enriched = enrich_items(items, scripts, exercises) + + # Generate HTML + print("🎨 Generating HTML preview...") + html_content = generate_html(doc_title, enriched) + + # Write output + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(html_content) + print(f"βœ… Preview generated: {args.output}") + + # Open in browser + import 
subprocess + subprocess.run(["open", str(args.output)], check=False) + + +if __name__ == "__main__": + main() diff --git a/.cursor/preview/generate_mc_preview.py b/.cursor/preview/generate_mc_preview.py new file mode 100644 index 0000000..8da138b --- /dev/null +++ b/.cursor/preview/generate_mc_preview.py @@ -0,0 +1,1223 @@ +#!/usr/bin/env python3 +""" +MultipleChoiceChallenge Preview Generator + +Generates rich HTML previews for MultipleChoiceChallenge items with course content references. + +Features: +- Parses MultipleChoiceChallenge markdown items +- Extracts relevant video script excerpts +- Shows option lengths and validation status +- Highlights correct answers + +Usage: + python generate_mc_preview.py [--scripts ] + python generate_mc_preview.py .cursor/tmp_items/mc_items.md --scripts ~/Downloads/scripts + +Example: + python generate_mc_preview.py .cursor/tmp_items/mc_items.md \ + --scripts /Users/martine.holland/Downloads/scripts +""" + +import sys +import re +import argparse +from pathlib import Path +from typing import List, Dict, Optional, Tuple +from dataclasses import dataclass, field +import html + + +# ============================================================================ +# DATA MODELS +# ============================================================================ + +@dataclass +class MCOption: + """A single option.""" + text: str + length: int + is_correct: bool + + +@dataclass +class MCItem: + """A single MultipleChoiceChallenge item.""" + title: str + unit: str + subskill: str + assignment: str # stem/question + options: List[MCOption] + correct_position: int # 1-based + item_number: int + course_section: str = "" # Optional: explicit course section reference + teaching_point: str = "" # Optional: key concept being tested + course_content_reference: str = "" # AI-extracted verbatim passages from course materials + +@dataclass +class CourseReference: + """Reference to course content.""" + source: str # e.g., "Video 3.2" + excerpt: str # The 
relevant text + + +@dataclass +class EnrichedItem: + """Item with course references and validation.""" + item: MCItem + embedded_refs: List[CourseReference] = field(default_factory=list) # From course_content_reference + video_refs: List[CourseReference] = field(default_factory=list) + curated_refs: List[CourseReference] = field(default_factory=list) # Definitional snippets + length_valid: bool = True + length_warnings: List[str] = field(default_factory=list) + + +# ============================================================================ +# PARSERS +# ============================================================================ + +def parse_items_file(content: str) -> Tuple[str, List[MCItem]]: + """Parse MultipleChoiceChallenge items from markdown.""" + items = [] + + # Extract document title + title_match = re.search(r'title:\s*(.+)', content) + doc_title = title_match.group(1).strip() if title_match else "Untitled" + + # Remove front matter + content = re.sub(r'^---\s*\n.*?\n---\s*\n', '', content, flags=re.DOTALL) + + # Split into items + item_blocks = re.split(r'\n---\s*\n', content) + + for i, block in enumerate(item_blocks, start=1): + if not block.strip(): + continue + + item = parse_single_item(block, i) + if item: + items.append(item) + + return doc_title, items + + +def parse_single_item(content: str, item_number: int) -> Optional[MCItem]: + """Parse a single item block.""" + # Extract title + title_match = re.search(r'##\s+(.+?)(?:\n|$)', content) + title = title_match.group(1).strip() if title_match else f"Item {item_number}" + + # Extract YAML fields + unit_match = re.search(r'unit:\s*(.+)', content) + subskill_match = re.search(r'subskill:\s*(.+)', content) + course_section_match = re.search(r'course_section:\s*["\']?([^"\']+)["\']?', content) + teaching_point_match = re.search(r'teaching_point:\s*["\']?([^"\']+)["\']?', content) + + unit = unit_match.group(1).strip() if unit_match else "" + subskill = subskill_match.group(1).strip() if 
subskill_match else "" + course_section = course_section_match.group(1).strip() if course_section_match else "" + teaching_point = teaching_point_match.group(1).strip() if teaching_point_match else "" + + # Extract course_content_reference (multi-line YAML block) + course_content_reference = "" + ccr_match = re.search(r'#\s*course_content_reference:\s*\|?\s*\n((?:#.*\n)*)', content) + if ccr_match: + ccr_lines = ccr_match.group(1) + course_content_reference = re.sub(r'^#\s?', '', ccr_lines, flags=re.MULTILINE).strip() + + # Extract assignment (stem) + assignment_match = re.search(r'`@assignment1`\s*\n(.*?)(?=`@|\Z)', content, re.DOTALL) + assignment = assignment_match.group(1).strip() if assignment_match else "" + + # Extract options + options = [] + correct_position = 0 + + options_match = re.search(r'`@options1`\s*\n(.*?)(?=`@|\Z|---)', content, re.DOTALL) + if options_match: + options_text = options_match.group(1).strip() + option_lines = re.findall(r'^-\s*(.+)$', options_text, re.MULTILINE) + + for i, opt in enumerate(option_lines): + opt = opt.strip() + is_correct = opt.startswith('[') and opt.endswith(']') + + if is_correct: + correct_position = i + 1 + opt_text = opt[1:-1] # Remove brackets + else: + opt_text = opt + + options.append(MCOption( + text=opt_text, + length=len(opt_text), + is_correct=is_correct + )) + + if not options: + return None + + return MCItem( + title=title, + unit=unit, + subskill=subskill, + assignment=assignment, + options=options, + correct_position=correct_position, + item_number=item_number, + course_section=course_section, + teaching_point=teaching_point, + course_content_reference=course_content_reference + ) + + +def parse_video_scripts(scripts_dir: Path) -> Dict[str, str]: + """Parse video/course scripts from directory. 
+ + Supports multiple formats: + - .txt files with '# Video N' headers (e.g., chapter_1_scripts.txt) + - .md files with '## Section Title' headers (e.g., chapter1.md) + """ + scripts = {} + + if not scripts_dir.exists(): + return scripts + + # Parse .txt files (original format: chapter_1_scripts.txt with # Video N headers) + for script_file in scripts_dir.glob("*.txt"): + content = script_file.read_text() + + # Extract chapter number from filename + chapter_match = re.search(r'chapter[_-]?(\d+)', script_file.name, re.IGNORECASE) + chapter_num = chapter_match.group(1) if chapter_match else "?" + + # Split by video headers + videos = re.split(r'# Video (\d+)', content) + + for i in range(1, len(videos), 2): + if i + 1 < len(videos): + video_num = videos[i] + video_content = videos[i + 1].strip() + key = f"Chapter {chapter_num}, Video {video_num}" + scripts[key] = video_content + + # Parse .md files (course exercise format: chapter1.md with ## Section headers) + for md_file in scripts_dir.glob("*.md"): + # Skip non-chapter files + chapter_match = re.search(r'chapter[_-]?(\d+)', md_file.name, re.IGNORECASE) + if not chapter_match: + continue + + chapter_num = chapter_match.group(1) + content = md_file.read_text() + + # Split by ## section headers + sections = re.split(r'\n##\s+', content) + + for i, section in enumerate(sections[1:], start=1): # Skip content before first ## + # Extract section title (first line) + lines = section.split('\n', 1) + section_title = lines[0].strip() + section_content = lines[1] if len(lines) > 1 else "" + + # Extract teaching content (text before @instructions or @possible_answers) + teaching_match = re.match(r'(.*?)(?:`@instructions`|`@possible_answers`|\Z)', + section_content, re.DOTALL) + if teaching_match: + teaching_content = teaching_match.group(1).strip() + # Clean up: remove yaml blocks and code solution blocks + teaching_content = re.sub(r'```yaml.*?```', '', teaching_content, flags=re.DOTALL) + teaching_content = 
re.sub(r'`@solution`.*?```', '', teaching_content, flags=re.DOTALL) + teaching_content = re.sub(r'`@sct`.*?```', '', teaching_content, flags=re.DOTALL) + teaching_content = re.sub(r'`@pre_exercise_code`.*?```', '', teaching_content, flags=re.DOTALL) + teaching_content = re.sub(r'`@hint`.*?(?=\n\n|\Z)', '', teaching_content, flags=re.DOTALL) + + if len(teaching_content) > 50: # Only include substantial content + key = f"Chapter {chapter_num}: {section_title}" + scripts[key] = teaching_content + + return scripts + + +# ============================================================================ +# CURATED SNIPPETS - Definitional passages for key concepts +# ============================================================================ + +CURATED_SNIPPETS = { + # ========================================================================== + # SHELL / UNIX CONCEPTS (Introduction to Shell course) + # ========================================================================== + + # Chapter 1 - Navigating files and directories + "absolute path": { + "source": "Shell Chapter 1", + "excerpt": 'An **absolute path** starts from the root directory `/` and describes the complete location. A **relative path** starts from the current directory. For example, `/home/user/data.csv` is absolute, while `data/sales.csv` is relative to wherever you currently are.' + }, + "relative path": { + "source": "Shell Chapter 1", + "excerpt": 'A **relative path** specifies a location starting from your current working directory. The path `data/sales.csv` means "look for data folder here, then sales.csv inside it". If you change directories, the same relative path points to a different location.' + }, + "cd command": { + "source": "Shell Chapter 1", + "excerpt": 'The `cd` command changes your current working directory. `cd data` moves into a subdirectory called data. `cd ..` moves up one level. `cd ~` goes to your home directory. `cd /path` goes to an absolute path.' 
+ }, + "copy move": { + "source": "Shell Chapter 1", + "excerpt": '`cp` copies files: `cp original.txt backup.txt` creates a copy. `mv` moves or renames files: `mv old.txt new.txt` renames, `mv file.txt folder/` moves. The key difference: **cp keeps the original, mv removes it**.' + }, + "rmdir directory": { + "source": "Shell Chapter 1", + "excerpt": '`rmdir` removes **empty directories only**. If a directory contains files, rmdir fails with an error. You must first delete the files inside, then remove the directory. Use `rm -r` to remove directories with contents (carefully!).' + }, + + # Chapter 2 - Manipulating data + "head tail": { + "source": "Shell Chapter 2", + "excerpt": '`head` shows the **first** lines of a file (default 10). `tail` shows the **last** lines. Use `-n` to specify how many: `head -n 5 file.csv` shows first 5 lines. `tail -n +7` shows everything from line 7 onward.' + }, + "cut command": { + "source": "Shell Chapter 2", + "excerpt": '`cut` selects **columns** from a file. Use `-d` for delimiter and `-f` for fields: `cut -d , -f 2 data.csv` extracts the second column from a comma-separated file. You can select multiple columns: `-f 1,3` or `-f 2-5`.' + }, + "grep command": { + "source": "Shell Chapter 2", + "excerpt": '`grep` selects **lines** containing a pattern. Common flags: `-c` counts matches, `-v` inverts (shows non-matching lines), `-i` ignores case, `-n` shows line numbers. Example: `grep -v error log.txt` shows lines WITHOUT "error".' + }, + "history command": { + "source": "Shell Chapter 2", + "excerpt": '`history` shows commands you have run. Re-run command 55 with `!55`. Re-run the most recent grep with `!grep`. The exclamation mark followed by a command name re-runs the most recent use of that command.' + }, + + # Chapter 3 - Combining tools + "redirection": { + "source": "Shell Chapter 3", + "excerpt": 'The `>` operator **redirects output to a file** instead of the screen. 
`head -n 5 data.csv > sample.csv` saves the first 5 lines to sample.csv. Nothing appears on screenβ€”output goes to the file. `>>` appends instead of overwriting.' + }, + "pipe": { + "source": "Shell Chapter 3", + "excerpt": 'The **pipe** `|` sends the output of one command as input to another. `grep error log.txt | wc -l` counts error lines. No intermediate files needed. You can chain many commands: `cut | grep | sort | uniq`.' + }, + "wildcard": { + "source": "Shell Chapter 3", + "excerpt": 'The `*` **wildcard** matches zero or more characters. `*.csv` matches all CSV files. `data*` matches anything starting with "data". Wildcards are expanded by the shell before the command runs.' + }, + "sort uniq": { + "source": "Shell Chapter 3", + "excerpt": '`sort` orders lines alphabetically (or numerically with `-n`). `uniq` removes **adjacent** duplicates only. To remove all duplicates, **sort first**: `sort data.txt | uniq`. Use `uniq -c` to count occurrences.' + }, + + # Chapter 4 - Batch processing + "shell variable": { + "source": "Shell Chapter 4", + "excerpt": 'Create a variable with `name=value` (**no spaces** around =). Access its value with `$name`. Example: `datafile=report.csv` then `cat $datafile`. Without the `$`, you get the literal text "datafile".' + }, + "for loop": { + "source": "Shell Chapter 4", + "excerpt": 'Loop structure: `for var in list; do commands; done`. Example: `for f in *.csv; do head $f; done` shows first lines of all CSV files. The variable `$f` takes each filename in turn. Use `$var` to access the value.' + }, + "script argument": { + "source": "Shell Chapter 5", + "excerpt": 'In scripts, `$1` is the first argument, `$2` the second, etc. `$@` means all arguments. If script.sh contains `head $1`, then `bash script.sh data.csv` runs `head data.csv`.' 
+ }, + + # ========================================================================== + # MLOPS CONCEPTS (MLOps Concepts course) + # ========================================================================== + + # Chapter 1 concepts + "mlops purpose": { + "source": "MLOps Chapter 1, Video 1", + "excerpt": 'MLOps is the abbreviation for Machine Learning Operations, and it describes the set of practices to **design, deploy and maintain machine learning in production continuously, reliably, and efficiently**. MLOps also facilitates **monitoring of model performance**, which helps to maintain accuracy and reliability over time.' + }, + "mlops principles": { + "source": "Chapter 1, Video 1", + "excerpt": 'Through MLOps principles we can **automate the deployment of models**, which reduces manual errors and speeds up the process of getting models from development to production. Inherent to MLOps is that it aims to **bridge the gap between machine learning and operations teams**, which enhances collaboration.' + }, + "design phase": { + "source": "Chapter 1, Video 2", + "excerpt": 'In the **design phase**, we clarify the context of the problem and assess the added value of using machine learning. **Gathering clear business requirements** helps us define success, while establishing key metrics allows us to track progress effectively.' + }, + "development phase": { + "source": "Chapter 1, Video 2", + "excerpt": 'In the **development phase**, the real magic happens. This is where we dive deep into creating our machine learning model. We experiment with various combinations of data, algorithms, and hyperparameters, testing different approaches to find the best fit.' + }, + "deployment phase": { + "source": "Chapter 1, Video 2", + "excerpt": 'In the **deployment phase**, our model meets the real world. We focus on integrating our model into existing business processes, ensuring it operates seamlessly within the larger system. Setting up monitoring systems is crucial here.' 
+ }, + "subject matter expert": { + "source": "Chapter 1, Video 3", + "excerpt": 'The **subject matter expert** has **domain knowledge about the problem** that we are trying to solve. The subject matter expert is involved throughout the lifecycle because **they can assist the more technical roles with interpreting the data and results** at each step.' + }, + "data scientist": { + "source": "Chapter 1, Video 3", + "excerpt": 'The **data scientist** is responsible for data analysis and model training and evaluation. The evaluation includes monitoring the model once it has been deployed to ensure that the model predictions are valid.' + }, + "data engineer": { + "source": "Chapter 1, Video 3", + "excerpt": 'The **data engineer** is responsible for the collecting, storing, and processing of data. This also means that the data engineer should check the data quality and include tests such that the quality is maintained throughout the process.' + }, + "business stakeholder": { + "source": "Chapter 1, Video 3", + "excerpt": 'The **business stakeholder**, or product owner, is a managerial staff member making budget decisions and ensuring the machine learning project aligns with the company\'s vision. They are involved throughout the lifecycle.' + }, + "ml engineer": { + "source": "Chapter 1, Video 3", + "excerpt": 'The **machine learning engineer** is a relatively new role that is quite versatile and designed specifically to have expertise over the entire machine learning lifecycle. It is a cross-functional role that overlaps with the other technical roles.' + }, + + # Chapter 2 concepts + "stakeholder metrics": { + "source": "Chapter 2, Video 1", + "excerpt": 'The roles involved in MLOps processes are multidisciplinary and thus also have **their own way of tracking performance**. The data scientist looks at the **accuracy** of a model... The **subject matter expert** is interested in the model\'s **impact on the business**... 
The **business stakeholder** is more interested in the **monetary value** of the model.' + }, + "accuracy": { + "source": "Chapter 2, Video 2", + "excerpt": 'An example of **accuracy** would be whether the data correctly describes the customer. It could be that the data states that a customer is 18, but the customer is actually 32. That would be inaccurate.' + }, + "completeness": { + "source": "Chapter 2, Video 2", + "excerpt": 'For **completeness**, we mainly look at missing data, for instance, whether we are missing last names of customers.' + }, + "consistency": { + "source": "Chapter 2, Video 2", + "excerpt": 'With **consistency**, we investigate whether the definition of a customer is **consistent throughout the organization**. It could be that one department has a **different definition of an active customer than another**, which makes the data inconsistent.' + }, + "timeliness": { + "source": "Chapter 2, Video 2", + "excerpt": 'If we look at **timeliness**, we are interested in the availability of data. For instance, when the customer orders are synchronized daily, they are not available in real-time.' + }, + "feature engineering": { + "source": "Chapter 2, Video 3", + "excerpt": '**Feature engineering** is the process of selecting, manipulating, and transforming raw data into features. A feature is a variable, such as a column in a table. The goal is to enhance model performance by identifying the most informative features.' + }, + "feature store": { + "source": "Chapter 2, Video 3", + "excerpt": 'A **feature store** is a **centralized repository for features**, allowing data scientists to **discover, define, and reuse features** across projects. Feature stores are essential in large teams where multiple projects need **consistent and reusable features**.' + }, + "experiment tracking": { + "source": "Chapter 2, Video 4", + "excerpt": 'Why is tracking all of this so crucial? 
Well, it helps us **compare results, reproduce past experiments**, collaborate with our team, and report findings to stakeholders. Before starting the training, we establish our experiment tracking to **log every detail meticulously**.' + }, + + # Chapter 3 concepts + "container": { + "source": "Chapter 3, Video 1", + "excerpt": 'A **container** is like a special box that holds a computer program along with everything it needs to run, such as certain tools and settings. This makes it easier to move programs around and ensures they don\'t break when they\'re used on different computers.' + }, + "containerization": { + "source": "Chapter 3, Video 1", + "excerpt": 'A **container** is like a special box that holds everything our model needs to runβ€”code, libraries, and settings. **Containerization** packages applications with their dependencies, ensuring consistent runtime environments across development, testing, and production.' + }, + "microservices": { + "source": "Chapter 3, Video 1", + "excerpt": '**Microservices architecture** deploys applications as a collection of independent services. Each service handles a specific function and communicates through APIs, enabling scalability and independent deployment.' + }, + "api": { + "source": "Chapter 3, Video 1", + "excerpt": 'An **API** (Application Programming Interface) enables communication between services. In ML deployment, APIs allow other applications to send data to the model and receive predictions.' + }, + "ci/cd": { + "source": "Chapter 3, Video 2", + "excerpt": '**CI/CD pipelines** (Continuous Integration/Continuous Deployment) automate the build, test, and deployment process. This allows multiple developers to work on the same code and helps in automating the development and deployment process.' + }, + "basic deployment": { + "source": "Chapter 3, Video 2", + "excerpt": 'In **basic deployment**, we simply replace the old model with the new one. 
This is straightforward but riskyβ€”if the new model has issues, all users are affected immediately.' + }, + "shadow deployment": { + "source": "Chapter 3, Video 2", + "excerpt": 'In **shadow deployment**, the new model runs alongside the old one, receiving the same inputs. We compare outputs without affecting users, allowing safe validation of the new model.' + }, + "canary deployment": { + "source": "Chapter 3, Video 2", + "excerpt": 'In **canary deployment**, we gradually roll out the new model to a small percentage of users first. If metrics look good, we increase the percentage until full deployment.' + }, + + # Chapter 4 concepts + "data drift": { + "source": "Chapter 4, Video 1", + "excerpt": '**Data drift** occurs when the input data distribution changes over time. The model was trained on historical data, but the real-world data it now receives has different characteristics.' + }, + "concept drift": { + "source": "Chapter 4, Video 1", + "excerpt": '**Concept drift** occurs when the relationship between input data and the target variable changes. Even if the input data looks the same, what constitutes a correct prediction has shifted.' + }, + "statistical monitoring": { + "source": "Chapter 4, Video 1", + "excerpt": '**Statistical monitoring** tracks the distribution of input data and model predictions over time. This helps detect data drift and concept drift before they significantly impact model performance.' + }, + "computational monitoring": { + "source": "Chapter 4, Video 1", + "excerpt": '**Computational monitoring** tracks technical metrics like request latency, network usage, and resource consumption. This ensures the model infrastructure remains healthy and responsive.' + }, + "mlops maturity": { + "source": "Chapter 4, Video 2", + "excerpt": '**MLOps maturity levels** describe the degree of automation, collaboration, and monitoring in an organization\'s ML practices. 
Higher maturity means more automated pipelines, better governance, and proactive monitoring.' + }, +} + + +def find_curated_snippet(item: 'MCItem') -> Optional[CourseReference]: + """Find a curated definitional snippet for this item. + + Priority: + 1. If teaching_point is specified, return that directly + 2. Match against curated snippets database + """ + # If teaching_point is explicitly provided, use that + if item.teaching_point: + source = f"πŸ“ Teaching Point" + if item.course_section: + source += f" ({item.course_section})" + return CourseReference( + source=source, + excerpt=item.teaching_point + ) + + # Otherwise, match against curated snippets + item_text = f"{item.title} {item.assignment}".lower() + correct_answer = next((opt.text.lower() for opt in item.options if opt.is_correct), "") + + # Also check unit name + unit_text = item.unit.replace('-', ' ').replace('_', ' ').lower() + + # Priority matching based on concept keywords + match_scores = [] + + for concept_key, snippet_data in CURATED_SNIPPETS.items(): + score = 0 + concept_words = concept_key.split() + + # Check if concept appears in title, stem, or correct answer + for word in concept_words: + if word in item.title.lower(): + score += 10 + if word in item_text: + score += 5 + if word in correct_answer: + score += 8 + if word in unit_text: + score += 12 # Strong signal from unit name + + # Bonus for multi-word concept matches + if concept_key in item_text or concept_key in correct_answer: + score += 15 + if concept_key in unit_text: + score += 20 + + if score > 0: + match_scores.append((score, concept_key, snippet_data)) + + if not match_scores: + return None + + # Return highest scoring match + match_scores.sort(key=lambda x: x[0], reverse=True) + best = match_scores[0] + + return CourseReference( + source=f"πŸ“– {best[1].title()} β€” {best[2]['source']}", + excerpt=best[2]['excerpt'] + ) + + +# ============================================================================ +# MATCHING ENGINE +# 
# ============================================================================

# Vocabulary recognised by extract_keywords(). Built once at import time rather
# than on every call; frozen because nothing mutates it.
_CONCEPT_TERMS = frozenset({
    # Shell/Unix commands and concepts
    'command', 'shell', 'bash', 'terminal', 'console',
    'directory', 'folder', 'file', 'path', 'filename',
    'absolute', 'relative', 'root', 'home', 'parent', 'current',
    'copy', 'move', 'remove', 'delete', 'rename', 'create', 'mkdir', 'rmdir',
    'cd', 'ls', 'pwd', 'cat', 'head', 'tail', 'less', 'more',
    'cut', 'grep', 'sort', 'uniq', 'wc', 'echo', 'history',
    'pipe', 'redirect', 'redirection', 'output', 'input', 'stdin', 'stdout',
    'wildcard', 'glob', 'pattern', 'match',
    'loop', 'for', 'variable', 'script', 'argument', 'parameter',
    'flag', 'option', 'delimiter', 'separator', 'column', 'field', 'line',
    'duplicate', 'unique', 'count', 'filter', 'select', 'extract',

    # General tech/programming
    'container', 'docker', 'kubernetes', 'microservice', 'api', 'database',
    'function', 'class', 'method', 'iteration', 'recursion', 'algorithm',

    # Data science
    'model', 'training', 'prediction', 'accuracy', 'precision', 'recall',
    'dataset', 'feature', 'label', 'classification', 'regression',
    'neural', 'network', 'layer', 'optimizer', 'gradient',

    # ML/AI specific
    'agent', 'agentic', 'llm', 'prompt', 'token', 'embedding',
    'transformer', 'attention', 'inference',
    'pipeline', 'workflow', 'orchestration', 'automation',

    # Software engineering
    'modularity', 'scalability', 'maintainability', 'testing',
    'deployment', 'infrastructure', 'logging', 'monitoring',
    'version', 'control', 'git', 'branch', 'merge',
})

# Regexes whose presence suggests a passage is explaining something. They are
# applied to lower-cased text, hence the lower-case literals.
_TEACHING_PATTERNS = tuple(re.compile(pattern) for pattern in (
    r'`([^`]+)`\s+(is|means|does|removes|creates|shows|displays|prints|selects|extracts)',
    r'the\s+`([^`]+)`\s+(command|operator|symbol|flag)',
    r'to\s+\w+[^.]*,?\s+you\s+(must|can|should|use|need)',
    r'this\s+(is|means|allows|enables|lets)',
    r'because\s+',
    r'the\s+reason\s+is',
    r'in\s+order\s+to',
    r'\*\*[^*]+\*\*',  # Bold text often indicates definitions
))

# Characters of context inspected on each side of a teaching-pattern match.
_KEYWORD_WINDOW = 50

# Phrases that earn a paragraph a bonus in extract_teaching_excerpt().
_TEACHING_INDICATORS = (
    'is used to', 'allows you to', 'means', 'will print', 'will display',
    'tells the shell', 'removes', 'creates', 'selects', 'extracts',
    'the command', 'the operator', 'the symbol', 'for example',
    'you can use', 'you must', 'notice that', 'this is because',
)


def _item_search_text(item: "MCItem") -> str:
    """Join an item's title, assignment and option texts into one string."""
    return f"{item.title} {item.assignment}" + "".join(f" {opt.text}" for opt in item.options)


def extract_keywords(text: str) -> set:
    """Return the concept terms that occur as whole words in *text*.

    The text is lower-cased and split into runs of ``[a-z_]``; only words found
    in the fixed concept vocabulary are kept. Matching is exact, so inflected
    forms such as "files" or "selects" do not match "file" or "select".

    Args:
        text: Arbitrary text (item wording or a video script).

    Returns:
        A new ``set`` of matching vocabulary terms (possibly empty).
    """
    words = set(re.findall(r'\b[a-z_]+\b', text.lower()))
    return words & _CONCEPT_TERMS


def find_relevant_video_content(item: "MCItem", scripts: Dict[str, str], max_refs: int = 2) -> "List[CourseReference]":
    """Find relevant video script excerpts for an item.

    Uses multiple strategies:
    1. Explicit course_section field (highest priority)
    2. Unit name matching to section titles
    3. Teaching pattern extraction
    4. Keyword overlap (fallback)

    Args:
        item: The item to match; ``title``, ``assignment``, ``options[].text``,
            ``unit`` and ``course_section`` are read.
        scripts: Mapping of video/section key to its script text.
        max_refs: Maximum number of references to return.

    Returns:
        Up to ``max_refs`` references. An explicit ``course_section`` hit is
        returned on its own as soon as it yields an excerpt.
    """
    refs = []
    # Exact keys already referenced. Searching the formatted ``source`` string
    # instead would also skip any key that is a substring of a matched key.
    used_keys = set()

    # Strategy 1: If course_section is specified, find that exact section
    if item.course_section:
        wanted_section = item.course_section.lower()
        for video_key, video_content in scripts.items():
            if wanted_section not in video_key.lower():
                continue
            excerpt = extract_teaching_excerpt(video_content, item)
            if excerpt:
                refs.append(CourseReference(
                    source=f"πŸ“ {video_key} (explicit match)",
                    excerpt=excerpt
                ))
                return refs  # Return immediately - this is the definitive source

    # Strategy 2: Match unit name to section titles
    unit_words = set(item.unit.replace('-', ' ').replace('_', ' ').lower().split())
    for video_key, video_content in scripts.items():
        key_words = set(video_key.lower().split())
        if len(unit_words & key_words) < 2:  # At least 2 words must match
            continue
        excerpt = extract_teaching_excerpt(video_content, item)
        if excerpt:
            refs.append(CourseReference(
                source=f"πŸ“ {video_key} (unit match)",
                excerpt=excerpt
            ))
            used_keys.add(video_key)
            if len(refs) >= max_refs:
                return refs

    # Strategy 3 & 4: Keyword matching with teaching pattern bonus
    item_keywords = extract_keywords(_item_search_text(item))

    scored_refs = []
    for video_key, video_content in scripts.items():
        # Skip if already added via unit match
        if video_key in used_keys:
            continue

        # Score based on whole-word keyword overlap
        overlap = len(item_keywords & extract_keywords(video_content))

        # Bonus for direct (substring) term matches
        video_lower = video_content.lower()
        direct_matches = sum(1 for kw in item_keywords if kw in video_lower)

        # Bonus for teaching patterns
        teaching_bonus = score_teaching_patterns(video_content, item_keywords)

        score = overlap * 2 + direct_matches + teaching_bonus * 3

        if score > 5:  # Minimum threshold
            excerpt = extract_teaching_excerpt(video_content, item)
            if excerpt:
                scored_refs.append((score, video_key, excerpt))

    # Sort by score and take top refs. Clamp at zero so a negative max_refs
    # cannot turn the slice into "everything except the last k".
    scored_refs.sort(key=lambda entry: entry[0], reverse=True)
    remaining_slots = max(0, max_refs - len(refs))
    for _score, video_key, excerpt in scored_refs[:remaining_slots]:
        refs.append(CourseReference(source=f"πŸ” {video_key}", excerpt=excerpt))

    return refs


def score_teaching_patterns(content: str, keywords: set) -> int:
    """Score content based on presence of teaching patterns near keywords.

    Every teaching-pattern match is worth 1 point, or 2 points when one of
    *keywords* appears inside the match or within ``_KEYWORD_WINDOW``
    characters on either side of it.

    Args:
        content: Text to scan; it is lower-cased before matching.
        keywords: Lower-case terms to look for around each match.

    Returns:
        The accumulated score, capped at 10.
    """
    score = 0
    content_lower = content.lower()

    for pattern in _TEACHING_PATTERNS:
        # finditer exposes the real span of each match. Re-locating a match
        # from its capture groups (what findall returns) points at the wrong
        # place, or nowhere, whenever the pattern contains groups.
        for match in pattern.finditer(content_lower):
            window_start = max(0, match.start() - _KEYWORD_WINDOW)
            window_end = match.end() + _KEYWORD_WINDOW
            nearby_text = content_lower[window_start:window_end]
            # The window contains the match itself, so one test covers both
            # "in the match" and "next to the match".
            if any(kw in nearby_text for kw in keywords):
                score += 2
            else:
                score += 1

    return min(score, 10)  # Cap the bonus


def extract_teaching_excerpt(content: str, item: "MCItem") -> str:
    """Extract the most relevant teaching passage from content.

    Prioritizes:
    1. Sentences containing teaching patterns
    2. Sentences containing item keywords
    3. Sentences with command definitions

    Args:
        content: Script text; paragraphs are separated by blank lines.
        item: The item whose title, assignment and option texts supply the
            keywords and back-ticked command names.

    Returns:
        The best-scoring paragraph of 30 to 500 characters, with code-fence
        markers removed and truncated to 400 characters plus "...". Returns an
        empty string when no paragraph scores above zero.
    """
    item_text = _item_search_text(item)
    keywords = extract_keywords(item_text)

    # Also include command names from item (backtick content)
    commands = set(re.findall(r'`([^`]+)`', item_text))

    best_score = 0
    best_excerpt = ""

    for para in re.split(r'\n\n+', content):
        if len(para) < 30 or len(para) > 500:  # Skip too short or too long
            continue

        para_lower = para.lower()

        # Keyword presence (2 each) and command mentions (5 each)
        score = 2 * sum(1 for kw in keywords if kw in para_lower)
        score += 5 * sum(1 for cmd in commands if cmd.lower() in para_lower)

        # Bonus for teaching language
        score += 3 * sum(1 for phrase in _TEACHING_INDICATORS if phrase in para_lower)

        # Bonus for bold/emphasis (definitions)
        if '**' in para:
            score += 4

        # Bonus for code examples
        if '`' in para:
            score += 2

        # Strict comparison keeps the earliest paragraph on ties.
        if score > best_score:
            best_score = score
            best_excerpt = para

    # Clean up and truncate
    if best_excerpt:
        # Remove markdown code block markers
        best_excerpt = re.sub(r'```\w*\n?', '', best_excerpt).strip()

        if len(best_excerpt) > 400:
            best_excerpt = best_excerpt[:400] + "..."

    return best_excerpt


def validate_option_lengths(options: "List[MCOption]") -> Tuple[bool, List[str]]:
    """Validate option lengths and return warnings.

    Args:
        options: Answer options; each one's ``length`` and ``is_correct`` are
            read.

    Returns:
        ``(valid, warnings)``. ``valid`` is False when any two options differ
        by more than 8 characters. A correct answer longer than every
        distractor adds a warning but does not affect ``valid``.
    """
    warnings = []
    valid = True

    # Check ±8 rule for every unordered pair, reported with 1-based positions
    for i, first in enumerate(options):
        for j in range(i + 1, len(options)):
            gap = abs(first.length - options[j].length)
            if gap > 8:
                warnings.append(f"Options {i+1} and {j+1} differ by {gap} chars (>8)")
                valid = False

    # Check correct not longest
    correct_length = next((opt.length for opt in options if opt.is_correct), 0)
    max_distractor = max((opt.length for opt in options if not opt.is_correct), default=0)
    if correct_length > max_distractor:
        warnings.append(f"Correct answer ({correct_length} chars) is longer than all distractors")

    return valid, warnings


def enrich_items(items: "List[MCItem]", scripts: Dict[str, str]) -> "List[EnrichedItem]":
    """Add course references and validation to items.

    Args:
        items: Parsed items to enrich.
        scripts: Mapping of video/section key to script text, passed to
            ``find_relevant_video_content``.

    Returns:
        One ``EnrichedItem`` per input item, in the same order.
    """
    enriched = []

    for item in items:
        video_refs = find_relevant_video_content(item, scripts)
        length_valid, length_warnings = validate_option_lengths(item.options)

        # Parse embedded course content reference (highest priority)
        embedded_refs = []
        if item.course_content_reference:
            embedded_refs.append(CourseReference(
                source=f"πŸ“ AI-Identified ({item.course_section})" if item.course_section else "πŸ“ AI-Identified Content",
                excerpt=item.course_content_reference
            ))

        # Find curated definitional snippet (fallback, only without an
        # embedded reference)
        curated_refs = []
        if not embedded_refs:
            curated = find_curated_snippet(item)
            if curated:
                curated_refs.append(curated)

        enriched.append(EnrichedItem(
            item=item,
            embedded_refs=embedded_refs,
            video_refs=video_refs,
            curated_refs=curated_refs,
            length_valid=length_valid,
            length_warnings=length_warnings
        ))

    return enriched


# ============================================================================
# HTML GENERATOR
+# ============================================================================ + +def generate_html(doc_title: str, enriched_items: List[EnrichedItem]) -> str: + """Generate HTML preview.""" + + items_html = "" + for ei in enriched_items: + item = ei.item + + # Format options + options_html = "" + for i, opt in enumerate(item.options): + correct_class = "correct" if opt.is_correct else "" + length_class = "" + + # Check if this option violates length rule + for warning in ei.length_warnings: + if f"Options {i+1}" in warning or f"and {i+1}" in warning: + length_class = "length-warning" + + marker = "βœ“" if opt.is_correct else "" + options_html += f''' +
+ {i+1} + {html.escape(opt.text)} + {opt.length} chars + {marker} +
''' + + # Length validation status + length_status = "βœ… Lengths OK" if ei.length_valid else "⚠️ Length issues" + length_warnings_html = "" + if ei.length_warnings: + for w in ei.length_warnings: + length_warnings_html += f'
⚠️ {html.escape(w)}
' + + # Format embedded course content (highest priority - AI-identified) + embedded_refs_html = "" + for ref in ei.embedded_refs: + excerpt_html = html.escape(ref.excerpt) + excerpt_html = re.sub(r'\*\*([^*]+)\*\*', r'\1', excerpt_html) + excerpt_html = excerpt_html.replace('\n', '
') + embedded_refs_html += f''' +
+
{html.escape(ref.source)}
+
{excerpt_html}
+
''' + + # Format curated definitional snippets (fallback if no embedded) + curated_refs_html = "" + if not embedded_refs_html: + for ref in ei.curated_refs: + excerpt_html = html.escape(ref.excerpt) + excerpt_html = re.sub(r'\*\*([^*]+)\*\*', r'\1', excerpt_html) + curated_refs_html += f''' +
+
{html.escape(ref.source)}
+
{excerpt_html}
+
''' + + # Format video references (keyword-matched) + video_refs_html = "" + for ref in ei.video_refs: + video_refs_html += f''' +
+
πŸ” Keyword Match: {html.escape(ref.source)}
+
{html.escape(ref.excerpt)}
+
''' + + items_html += f''' +
+
+ Item {item.item_number} +

{html.escape(item.title)}

+
+ +
type: MultipleChoiceChallenge +unit: {html.escape(item.unit)} +subskill: {html.escape(item.subskill)}
+ + +
{html.escape(item.assignment)}
+ + +
{options_html}
+ {length_warnings_html} + + +
+ {embedded_refs_html if embedded_refs_html else curated_refs_html if curated_refs_html else '
No course content reference found - add course_content_reference field
'} +
+ + +
+ {video_refs_html if video_refs_html else '
No keyword matches found
'} +
+
''' + + # Calculate rotation info + positions = [ei.item.correct_position for ei in enriched_items] + rotation_html = " β†’ ".join(str(p) for p in positions) + + return f''' + + + + + MC Preview - {html.escape(doc_title)} + + + +
+

🎯 MultipleChoiceChallenge Preview

+ +
+ Document: {html.escape(doc_title)}
+ Subskill: {html.escape(enriched_items[0].item.subskill if enriched_items else "")} +
+ πŸ“ {len(enriched_items)} items + πŸ”˜ 4 options each +
+
+ Correct answer positions: {rotation_html} +
+
+ + {items_html} +
+ +''' + + +# ============================================================================ +# CLI +# ============================================================================ + +def main(): + parser = argparse.ArgumentParser(description="Generate MultipleChoiceChallenge preview with course references") + parser.add_argument("items_file", type=Path, help="Path to items markdown file") + parser.add_argument("--scripts", type=Path, help="Directory containing video script files") + parser.add_argument("--output", "-o", type=Path, default=Path(".cursor/tmp_items/mc_preview.html"), help="Output HTML file") + + args = parser.parse_args() + + if not args.items_file.exists(): + print(f"❌ Items file not found: {args.items_file}") + sys.exit(1) + + # Parse items + print(f"πŸ“„ Parsing items from {args.items_file}...") + content = args.items_file.read_text() + doc_title, items = parse_items_file(content) + print(f" Found {len(items)} items") + + # Parse course content + scripts = {} + + if args.scripts: + print(f"πŸ“Ή Parsing video scripts from {args.scripts}...") + scripts = parse_video_scripts(args.scripts) + print(f" Found {len(scripts)} video sections") + + # Enrich items with course references + print("πŸ”— Matching items to course content...") + enriched = enrich_items(items, scripts) + + # Generate HTML + print("🎨 Generating HTML preview...") + html_content = generate_html(doc_title, enriched) + + # Write output + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(html_content) + print(f"βœ… Preview generated: {args.output}") + + # Open in browser + import subprocess + subprocess.run(["open", str(args.output)], check=False) + + +if __name__ == "__main__": + main() diff --git a/.cursor/preview/generate_preview.py b/.cursor/preview/generate_preview.py new file mode 100644 index 0000000..b12399d --- /dev/null +++ b/.cursor/preview/generate_preview.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Generate Exercise Preview + +Reads the HTML 
template and injects exercise YAML to create a preview. + +Usage: + python generate_preview.py + python generate_preview.py .cursor/tmp_items/exercise_to_validate.md +""" + +import sys +import re +from pathlib import Path + + +def extract_title_and_yaml(markdown_content: str) -> tuple[str, str]: + """Extract the exercise title and YAML content from markdown.""" + # Extract title from heading + title_match = re.search(r'^##\s+(.+?)$', markdown_content, re.MULTILINE) + title = title_match.group(1).strip() if title_match else "Exercise" + + # Extract YAML from code block + yaml_match = re.search(r'```yaml\s*\n(.*?)```', markdown_content, re.DOTALL) + yaml_content = yaml_match.group(1).strip() if yaml_match else "" + + return title, yaml_content + + +def generate_preview(exercise_file: Path, output_file: Path = None) -> Path: + """ + Generate an HTML preview from an exercise markdown file. + + Args: + exercise_file: Path to the exercise markdown file + output_file: Path for output HTML (default: .cursor/tmp_items/exercise_preview.html) + + Returns: + Path to the generated preview file + """ + if output_file is None: + output_file = Path(".cursor/tmp_items/exercise_preview.html") + + # Find the template + script_dir = Path(__file__).parent + template_file = script_dir / "drag_drop_classify_preview.html" + + if not template_file.exists(): + raise FileNotFoundError(f"Template not found: {template_file}") + + if not exercise_file.exists(): + raise FileNotFoundError(f"Exercise file not found: {exercise_file}") + + # Read template and exercise + template = template_file.read_text() + exercise_content = exercise_file.read_text() + + # Extract title and YAML + title, yaml_content = extract_title_and_yaml(exercise_content) + + # Add title to YAML + yaml_with_title = f"title: {title}\n{yaml_content}" + + # Inject into template + preview_html = template.replace("__EXERCISE_YAML__", yaml_with_title) + + # Write output + output_file.parent.mkdir(parents=True, exist_ok=True) + 
output_file.write_text(preview_html) + + return output_file + + +def main(): + if len(sys.argv) < 2: + print("Generate Exercise Preview") + print("") + print("Usage:") + print(" python generate_preview.py ") + print("") + print("Example:") + print(" python generate_preview.py .cursor/tmp_items/exercise_to_validate.md") + sys.exit(1) + + exercise_file = Path(sys.argv[1]) + + try: + output = generate_preview(exercise_file) + print(f"βœ… Preview generated: {output}") + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/.cursor/preview/generate_r_preview.py b/.cursor/preview/generate_r_preview.py new file mode 100644 index 0000000..5d1e928 --- /dev/null +++ b/.cursor/preview/generate_r_preview.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +Generate R Coding Exercise Preview + +Reads the HTML template and injects exercise data to create a preview. + +Usage: + python generate_r_preview.py + python generate_r_preview.py .cursor/tmp_items/exercise_to_validate.md +""" + +import sys +import re +import json +from pathlib import Path + + +def extract_exercise_data(markdown_content: str) -> dict: + """Extract exercise components from markdown.""" + content = markdown_content.strip() + + # Remove leading --- separator if present + content = re.sub(r'^---\s*\n', '', content).strip() + + data = { + "title": "Exercise", + "xp": 100, + "context": "", + "instructions": "", + "hint": "", + "pre_exercise_code": "", + "sample_code": "", + "solution": "", + "sct": "", + } + + # Extract title from heading + title_match = re.match(r'^##\s+(.+?)(?:\n|$)', content) + if title_match: + data["title"] = title_match.group(1).strip() + + # Extract XP from YAML block + xp_match = re.search(r'xp:\s*(\d+)', content) + if xp_match: + data["xp"] = int(xp_match.group(1)) + + # Extract context (text between yaml block closing and @instructions) + context_match = re.search(r'```\s*\n\n(.*?)(?=`@instructions`)', content, re.DOTALL) + 
if context_match: + data["context"] = context_match.group(1).strip() + + # Extract sections (R uses ```r code blocks) + sections = { + "instructions": r'`@instructions`\s*\n(.*?)(?=`@|\Z)', + "hint": r'`@hint`\s*\n(.*?)(?=`@|\Z)', + "pre_exercise_code": r'`@pre_exercise_code`\s*\n```r\s*\n(.*?)```', + "sample_code": r'`@sample_code`\s*\n```r\s*\n(.*?)```', + "solution": r'`@solution`\s*\n```r\s*\n(.*?)```', + "sct": r'`@sct`\s*\n```r\s*\n(.*?)```', + } + + for section_name, pattern in sections.items(): + match = re.search(pattern, content, re.DOTALL) + if match: + data[section_name] = match.group(1).strip() + + return data + + +def generate_preview(exercise_file: Path, output_file: Path = None) -> Path: + """ + Generate an HTML preview from an exercise markdown file. + + Args: + exercise_file: Path to the exercise markdown file + output_file: Path for output HTML (default: .cursor/tmp_items/exercise_preview.html) + + Returns: + Path to the generated preview file + """ + if output_file is None: + output_file = Path(".cursor/tmp_items/exercise_preview.html") + + # Find the template + script_dir = Path(__file__).parent + template_file = script_dir / "r_coding_preview.html" + + if not template_file.exists(): + raise FileNotFoundError(f"Template not found: {template_file}") + + if not exercise_file.exists(): + raise FileNotFoundError(f"Exercise file not found: {exercise_file}") + + # Read template and exercise + template = template_file.read_text() + exercise_content = exercise_file.read_text() + + # Extract exercise data + exercise_data = extract_exercise_data(exercise_content) + + # Convert to JSON for injection + exercise_json = json.dumps(exercise_data, indent=2) + + # Inject into template + preview_html = template.replace("__EXERCISE_DATA__", exercise_json) + + # Write output + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text(preview_html) + + return output_file + + +def main(): + if len(sys.argv) < 2: + print("Generate R Coding 
Exercise Preview") + print("") + print("Usage:") + print(" python generate_r_preview.py ") + print("") + print("Example:") + print(" python generate_r_preview.py .cursor/tmp_items/exercise_to_validate.md") + sys.exit(1) + + exercise_file = Path(sys.argv[1]) + + try: + output = generate_preview(exercise_file) + print(f"βœ… Preview generated: {output}") + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.cursor/preview/generate_sql_preview.py b/.cursor/preview/generate_sql_preview.py new file mode 100644 index 0000000..5131233 --- /dev/null +++ b/.cursor/preview/generate_sql_preview.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Generate SQL Coding Exercise Preview + +Reads the HTML template and injects exercise data to create a preview. + +Usage: + python generate_sql_preview.py + python generate_sql_preview.py .cursor/tmp_items/exercise_to_validate.md +""" + +import sys +import re +import json +from pathlib import Path + + +def extract_exercise_data(markdown_content: str) -> dict: + """Extract exercise components from markdown.""" + content = markdown_content.strip() + + # Remove leading --- separator if present + content = re.sub(r'^---\s*\n', '', content).strip() + + data = { + "title": "Exercise", + "xp": 100, + "context": "", + "instructions": "", + "hint": "", + "pre_exercise_code": "", + "sample_code": "", + "solution": "", + "sct": "", + } + + # Extract title from heading + title_match = re.match(r'^##\s+(.+?)(?:\n|$)', content) + if title_match: + data["title"] = title_match.group(1).strip() + + # Extract XP from YAML block + xp_match = re.search(r'xp:\s*(\d+)', content) + if xp_match: + data["xp"] = int(xp_match.group(1)) + + # Extract context (text between yaml block closing and @instructions) + context_match = re.search(r'```\s*\n\n(.*?)(?=`@instructions`)', content, re.DOTALL) + if context_match: + data["context"] = context_match.group(1).strip() + + # Extract sections (SQL uses 
{python} for pre_exercise/sct, {sql} for sample/solution) + sections = { + "instructions": r'`@instructions`\s*\n(.*?)(?=`@|\Z)', + "hint": r'`@hint`\s*\n(.*?)(?=`@|\Z)', + "pre_exercise_code": r'`@pre_exercise_code`\s*\n```\{python\}\s*\n(.*?)```', + "sample_code": r'`@sample_code`\s*\n```\{sql\}\s*\n(.*?)```', + "solution": r'`@solution`\s*\n```\{sql\}\s*\n(.*?)```', + "sct": r'`@sct`\s*\n```\{python\}\s*\n(.*?)```', + } + + for section_name, pattern in sections.items(): + match = re.search(pattern, content, re.DOTALL) + if match: + data[section_name] = match.group(1).strip() + + return data + + +def generate_preview(exercise_file: Path, output_file: Path = None) -> Path: + """ + Generate an HTML preview from an exercise markdown file. + + Args: + exercise_file: Path to the exercise markdown file + output_file: Path for output HTML (default: .cursor/tmp_items/exercise_preview.html) + + Returns: + Path to the generated preview file + """ + if output_file is None: + output_file = Path(".cursor/tmp_items/exercise_preview.html") + + # Find the template + script_dir = Path(__file__).parent + template_file = script_dir / "sql_coding_preview.html" + + if not template_file.exists(): + raise FileNotFoundError(f"Template not found: {template_file}") + + if not exercise_file.exists(): + raise FileNotFoundError(f"Exercise file not found: {exercise_file}") + + # Read template and exercise + template = template_file.read_text() + exercise_content = exercise_file.read_text() + + # Extract exercise data + exercise_data = extract_exercise_data(exercise_content) + + # Convert to JSON for injection + exercise_json = json.dumps(exercise_data, indent=2) + + # Inject into template + preview_html = template.replace("__EXERCISE_DATA__", exercise_json) + + # Write output + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.write_text(preview_html) + + return output_file + + +def main(): + if len(sys.argv) < 2: + print("Generate SQL Coding Exercise Preview") + print("") + 
print("Usage:") + print(" python generate_sql_preview.py ") + print("") + print("Example:") + print(" python generate_sql_preview.py .cursor/tmp_items/exercise_to_validate.md") + sys.exit(1) + + exercise_file = Path(sys.argv[1]) + + try: + output = generate_preview(exercise_file) + print(f"βœ… Preview generated: {output}") + except Exception as e: + print(f"❌ Error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() + diff --git a/.cursor/preview/python_coding_preview.html b/.cursor/preview/python_coding_preview.html new file mode 100644 index 0000000..ca70126 --- /dev/null +++ b/.cursor/preview/python_coding_preview.html @@ -0,0 +1,594 @@ + + + + + + Python Exercise Preview + + + + + + +
+ Preview Mode β€” This is a visual preview of how the exercise will appear on DataCamp +
+ +
+
+
+ + + + Exercise +
+ +

Loading...

+ +
+ Loading... +
+ +
+
+ + + + Instructions + 0 XP +
+
    +
  • Loading...
  • +
+
+ +
+

Hint

+
    +
  • Loading...
  • +
+
+
+ +
+
+
script.py
+
solution.py
+
+ +
+
+ +
+
Loading...
+
+ + + +
+
+
IPython Shell
+
Slides
+
+
+ In [1]: | +
+ +
+
+
+ + + + + diff --git a/.cursor/preview/python_iterative_preview.html b/.cursor/preview/python_iterative_preview.html new file mode 100644 index 0000000..8b78de5 --- /dev/null +++ b/.cursor/preview/python_iterative_preview.html @@ -0,0 +1,703 @@ + + + + + + Python Iterative Exercise Preview + + + + + + +
+ Preview Mode β€” Iterative Exercise (BulletExercise) +
+ +
+
+
+ + + + Exercise +
+ +

Loading...

+ +
+ Loading... +
+ +
+ Instructions +
+ 1/2 + 50 XP +
+
+ +
+ +
+ +
+

Hint

+
    +
  • Loading...
  • +
+
+
+ +
+
+
script.py
+
solution.py
+
+ +
+
+ +
+
Loading...
+
+ + + +
+
+
IPython Shell
+
+
+ In [1]: | +
+ +
+
+
+ + + + diff --git a/.cursor/preview/python_sequential_preview.html b/.cursor/preview/python_sequential_preview.html new file mode 100644 index 0000000..86093d7 --- /dev/null +++ b/.cursor/preview/python_sequential_preview.html @@ -0,0 +1,778 @@ + + + + + + Python Sequential Exercise Preview + + + + + + +
+ Preview Mode β€” Sequential Exercise (TabExercise) β€” Code accumulates across steps +
+ +
+
+
+ + + + Sequential Exercise +
+ +

Loading...

+ +
+ Loading... +
+ +
+ Instructions +
+ 1/3 + 35 XP +
+
+ +
+ +
+ +
+

Hint

+
    +
  • Loading...
  • +
+
+
+ +
+
+
script.py
+
solution.py
+
+ +
+
+ +
+
Loading...
+
+ + + +
+
+
IPython Shell
+
+
+ In [1]: | +
+ +
+
+
+ + + + diff --git a/.cursor/preview/r_coding_preview.html b/.cursor/preview/r_coding_preview.html new file mode 100644 index 0000000..8dcc8c8 --- /dev/null +++ b/.cursor/preview/r_coding_preview.html @@ -0,0 +1,586 @@ + + + + + + R Exercise Preview + + + +
+ R Exercise Preview β€” This is a visual preview of how the exercise will appear on DataCamp +
+ +
+
+
+ + + + R Exercise +
+ +

Loading...

+ +
+ Loading... +
+ +
+
+ + + + Instructions + 0 XP +
+
    +
  • Loading...
  • +
+
+ +
+

Hint

+
    +
  • Loading...
  • +
+
+
+ +
+
+
script.R
+
solution.R
+
+ +
+
Loading...
+
+ + + +
+
+
R Console
+
Slides
+
+
+ > | +
+
+
+
+ + + + diff --git a/.cursor/preview/sql_coding_preview.html b/.cursor/preview/sql_coding_preview.html new file mode 100644 index 0000000..7d017c7 --- /dev/null +++ b/.cursor/preview/sql_coding_preview.html @@ -0,0 +1,598 @@ + + + + + + SQL Exercise Preview + + + +
+ SQL Exercise Preview β€” This is a visual preview of how the exercise will appear on DataCamp +
+ +
+
+
+ + + + SQL Exercise +
+ +

Loading...

+ +
+ Loading... +
+ +
+
+ + + + Instructions + 0 XP +
+
    +
  • Loading...
  • +
+
+ +
+

Hint

+
    +
  • Loading...
  • +
+
+
+ +
+
+
script.sql
+
solution.sql
+
+ +
+
Loading...
+
+ + + +
+
+
SQL Console
+
Query Results
+
+
+ SQL> | +
+
+
+
+ + + + + diff --git a/.cursor/preview/sql_iterative_preview.html b/.cursor/preview/sql_iterative_preview.html new file mode 100644 index 0000000..0417742 --- /dev/null +++ b/.cursor/preview/sql_iterative_preview.html @@ -0,0 +1,271 @@ + + + + + + SQL Iterative Exercise Preview + + + + + + +
+ Preview Mode β€” SQL Iterative Exercise (BulletExercise) β€” Each step is independent +
+ +
+
+
+ + + + SQL Iterative Exercise +
+ +

Loading...

+
Loading...
+ +
+ + + + Each step starts fresh β€” code doesn't carry over between steps +
+ +
+ Instructions +
+ 1/2 + 50 XP +
+
+ +
+ +
+

Hint

+
  • Loading...
+
+
+ +
+
+
script.sql
+
solution.sql
+
+ +
+
+ +
+
Loading...
+
+ + + +
+
+
SQL Console
+
+
+ postgresql> | +
+ +
+
+
+ + + + diff --git a/.cursor/requirements.txt b/.cursor/requirements.txt new file mode 100644 index 0000000..da7700a --- /dev/null +++ b/.cursor/requirements.txt @@ -0,0 +1,18 @@ +# Content to Markdown Conversion Dependencies +# Install with: pip install -r .cursor/requirements.txt + +# PDF to Markdown (Datalab API) +datalab-python-sdk>=0.2.0 +brotli>=1.2.0 # Required for aiohttp compatibility + +# HTML to Markdown +docling>=2.0.0 + +# YouTube transcript extraction +youtube-transcript-api>=1.0.0 + +# Web page content extraction +trafilatura>=1.6.0 + +# Environment variable loading +python-dotenv>=1.0.0 diff --git a/.cursor/rules/coding-exercise.md b/.cursor/rules/coding-exercise.md new file mode 100644 index 0000000..b246bc6 --- /dev/null +++ b/.cursor/rules/coding-exercise.md @@ -0,0 +1,743 @@ +# BlanksChallenge Exercises - Complete Reference + +Everything you need to generate BlanksChallenge coding items for **Python, R, and SQL**. + +For language-specific style guides and additional examples, see: +- `python-blanks-challenge.md` +- `r-blanks-challenge.md` +- `sql-blanks-challenge.md` + +--- + +## Type Identifier + +**Type Name:** `BlanksChallenge` (fill-in-the-blank) + +**Used For:** +- Fill-in-the-blank coding challenges +- Testing structural understanding (syntax, indexing, operators) +- Application and reasoning over rote recall +- Automarker-safe, deterministic assessments + +--- + +## POOL.YML REQUIREMENT (MANDATORY) + +Before generating any BlanksChallenge items, the user **MUST** provide a `pool.yml` file. This file contains pool-level metadata that populates the YAML header of each item. + +### pool.yml (Pool Level) + +The pool.yml file sits at the pool level and defines the assessment configuration. 
Example: + +```yaml +title: Programming for Data Engineering +programming_language: python +from: 'python-base-prod:v2.0.0' +type: skill-assessment +stage: launched +subskills: + - name: "repeatable-code" + full: "Writing Repeatable Code" + courses: [799, 15876] + description: "Use common programming constructs to write repeatable production quality code for data processing." + - name: "best-practices" + full: "Demonstrate coding best practices" + courses: [25708, 5355] + description: "Demonstrate best practices in production code including version control, testing and package development." +``` + +### Item Header (Item Level) + +Each BlanksChallenge item has a YAML header. The `subskill` value comes from a `subskills[].name` in the pool.yml. Example: + +```yaml +type: BlanksChallenge +key: +unit: llm-metrics +subskill: repeatable-code +initial_difficulty: 0 +item_writer_id: '999999999' +# OPTIONAL - Development fields (remove before finalizing): +# course_section: "Chapter 1, Video 2" +# teaching_point: "The log_metric function records a single metric value..." 
+``` + +### Development Fields (Temporary) + +During item creation, you may include these **optional** fields to improve preview accuracy: + +| Field | Purpose | Example | +|-------|---------|---------| +| `course_section` | Explicit reference to course section | `"Chapter 1, Video 2"` | +| `teaching_point` | Key concept from course that item tests | `"The log_metric function records a single metric value..."` | + +**These fields help the preview generator show accurate course alignment but MUST be removed before finalizing items.** + +To strip development fields from finalized items: +```bash +sed -i '' '/^# course_section:/d; /^# teaching_point:/d' /tmp/blanks_items.md +``` + +### How pool.yml Maps to Item Header + +| pool.yml | Item Header | Notes | +|----------|-------------|-------| +| `subskills[].name` | `subskill` | Select ONE per item | +| (generate) | `unit` | 2-3 word kebab-case phrase describing the topic (e.g., `llm-metrics`, `data-aggregation`) | +| `999999999` | `item_writer_id` | Always use this value | +| (leave blank) | `key` | Auto-generated by system | +| `0` | `initial_difficulty` | Default starting difficulty | + +**Subskill Selection:** +- Each item tests ONE subskill from the pool.yml `subskills` array +- Use the `name` field (e.g., `repeatable-code`) as the `subskill` value +- Match the subskill to what the item actually assesses + +### Workflow + +1. **Upload pool.yml** β€” Must be provided before item generation begins +2. **Upload course content** β€” Markdown/doc/rtf/txt with learning material +3. **Specify item count** β€” How many BlanksChallenge items to create +4. **Specify subskill(s)** β€” Which subskill(s) from pool.yml to assess (by `name`) +5. **Generate items** β€” Pool metadata auto-populates each item's YAML + +--- + +## BLANKSCHALLENGE RULES + +### Placeholder Syntax +- Use `{{_expr1}}`, `{{_expr2}}`, etc. 
for blanks (NOT `____`) +- Each blank maps to a variable in the `@variables` section +- Each blank must have **exactly one correct answer** +- Each `{{_expr}}` should target one specific testing point + +### What Blanks Should Test +✅ **Structure**: syntax, indexing, operators, method calls +✅ **Application**: applying concepts to new scenarios +✅ **Reasoning**: understanding why code works + +### What Blanks Should NOT Test +❌ **Field names/labels**: Don't test recall of keys like `'loss'`, `'accuracy'` +❌ **String values**: Blanks should NEVER be text strings +❌ **Memorization**: No testing values not visible in `@code1` + +- **Scaffold:** function calls, method calls, parameters, variable assignments, expressions +- **DON'T scaffold:** imports, basic syntax, comments, print statements (unless that's the objective) +- Scale scaffolding amount based on difficulty and learning objectives + +### Blank Token Requirements +- Each blank: **1-3 correct tokens** (e.g., `'len'`, `'0'`, `'.fit'`) +- Each blank appears **only once** across all items in a set +- Expressions should NEVER be text strings + +### When to Use 1 vs 2 Blanks + +**Default to 1 blank** and only use 2 blanks when you're truly testing two distinct "units" of knowledge that are both part of the learning objective. + +#### Use 1 blank when: +- You can test the objective by checking one key token/unit (a function name, a column name, a single argument, a single operator, etc.)
+- Adding another blank would mostly test reading/bookkeeping rather than the target skill +- You want to reduce the risk of "partial credit by luck" and keep distractors tight +- This aligns with DataCamp guidance to "only leave blanks that test the learning objective" + +#### Use 2 blanks when: +- The objective genuinely requires two separate units to be supplied (e.g., choose the right logical operator AND choose the right threshold value) +- You can still guarantee that only one option (or combination of options) is correct +- Each blank is meaningful and not just scaffolding — DataCamp explicitly allows more than one blank in an item, but the code should stay short and focused on what's being tested + +#### A Practical "Unit" Rule (especially helpful in SQL) +DataCamp's SQL formatting guidance treats blanks as units of information. Example: selecting a column uses one blank unit (`___`), but a WHERE clause might need two units (e.g., an operator and a number), so it uses two sets of blanks. + +#### Quick Decision Checklist +**Use 2 blanks only if you can answer "yes" to ALL:** +1. Are there two separate units you want to measure? +2. Would leaving only one blank fail to measure the objective? +3. Can you write distractors so there is exactly one correct combination? +4. Does the code remain short and directly related to the point being tested?
+ +**If any answer is "no", stick to 1 blank and prefill the rest.** + +--- + +## STRUCTURE GUIDELINES + +- **Pre-exercise code**: imports, data loading, setup variables, background work unrelated to objectives +- **code1**: comments guide students, `{{_expr}}` tests key concepts +- **Context**: provide a short scenario that situates candidates in the task and gives them an instruction + +--- + +## CODE FORMATTING + +- Wrap technical terms in backticks: `pandas`, `OpenAI`, `.set_index()`, `d_model` +- Format variable names, function names, method names, parameters in instructions and hints + +--- + +## SUPPORTED LANGUAGES + +BlanksChallenge items support the following languages: + +| Language | Code Block Tag | Scaffolding | Style Guide | +|----------|----------------|-------------|-------------| +| Python | `{python}` | `{{_expr}}` | PEP 8 | +| R | `{r}` | `{{_expr}}` | tidyverse style | +| SQL | `{sql}` | `{{_expr}}` | Holywell's SQL Style | + +**Special Case — SQL Exercises:** +- `@pre_challenge_code`: Use `{python}` (for setup) +- `@code1`: Use `{sql}` (the actual SQL query) + +**Language-Specific Files:** +For detailed syntax rules, style guides, and additional examples, see: +- `python-blanks-challenge.md` — Python-specific guidance +- `r-blanks-challenge.md` — R-specific guidance +- `sql-blanks-challenge.md` — SQL-specific guidance + +--- + +## DETERMINISTIC OUTPUT REQUIREMENTS (BlanksChallenge) + +All BlanksChallenge code must produce **reproducible, identical output**: + +- ⛔ No randomness or model inference outputs +- ⛔ No IPython magics (`%timeit`, `%lprun`, etc.)
+- β›” No raw `set`, `Counter`, or unsorted dictionary output +- βœ… Use standard Python only +- βœ… Sort any collection output for determinism +- βœ… Use fixed seeds if randomness is unavoidable + +--- + +## SCENARIO REQUIREMENTS (BlanksChallenge) + +### Fresh Scenarios (MANDATORY) +- **Use NEW scenarios**β€”do not reuse or paraphrase course content verbatim +- Treat course code as **structural templates** only +- Each item must reference a concept from course content + +### Context Guidelines + +**Length:** 1-4 lines describing a real-world data task + +**Cognitive Level:** Access higher cognitive functions by placing candidates in rich, immersive scenariosβ€”not just describing a task. + +**No Explicit Instructions:** Users always see "fill in the blank" automatically, so: +- ❌ Don't write "you need to find" or "your task is to" +- βœ… Use imperative verbs: "Find", "Compute", "Extract", "Load" + +**Role-Based Framing:** Put the user IN the scenario as a team member, not describing what they're building. + +| ❌ Bad | βœ… Good | +|--------|---------| +| "You're building a content moderation system for a chatbot. The toxicity metric scores text from 0 to 1, and you need to find the highest toxicity score." | "You're a member of a team building a chatbot, and need to create a content moderation system. The toxicity metric scores text from 0 to 1. Find the highest toxicity score across all responses." | +| "You're evaluating a model. You need to compute the BLEU score." | "You're on a translation quality team reviewing model outputs. Compute the BLEU score to compare against human references." | + +**Pattern:** +``` +❌ "You're building [thing]. You need to [action]." +βœ… "You're a [role] on a team [doing thing]. [Imperative action]." 
+``` + +**Additional Guidelines:** +- Cue authentic tasks: filter, aggregate, reshape, transform, evaluate +- Motivate the code without giving away the answer +- Don't repeat information visible in the code + +--- + +## BLANKSCHALLENGE EXPORT FORMAT + +**Prerequisite:** A valid `pool.yml` must be loaded by the user before generating items. + +~~~markdown +--- +title: {{pool.title}} +output: html_document +description: {{pool.description}} +--- + +## [<Learning Objective Title>] + +```yaml +type: BlanksChallenge +key: +unit: <2-3 word kebab-case phrase> +subskill: {{pool.subskill}} +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +<1–4 line real-world scenario that cues the concept> + +`@code1` +```{language} +# Minimal, runnable code with placeholder(s) +# Replace {language} with: python, r, or sql +``` + +`@pre_challenge_code` +```{language} +# Optional setup code +# For DataFrames: ≤10 rows +``` + +`@variables` +```yaml +expr1: + - 'groupby' +expr2: + - 'mean' +``` + +`@distractors` +```yaml +# Optional - leave empty if unused +``` +~~~ + +### BlanksChallenge Section Requirements + +| Section | Required | Description | +|---------|----------|-------------| +| `@context` | Yes | 1-4 lines; role-based scenario with imperative action; do NOT copy course text | +| `@code1` | Yes | Minimal, runnable code (Python, R, or SQL) with `{{_exprN}}` placeholders | +| `@pre_challenge_code` | No | Setup code; DataFrames ≤10 rows | +| `@variables` | Yes | Maps each `exprN` to solution (single-element list) | +| `@distractors` | No | Optional incorrect choices; leave empty if unused | + +--- + +## COMPLETE EXAMPLES + +Examples are provided for each supported language. For language-specific style guides and additional examples, see the corresponding language file (e.g., `python-blanks-challenge.md`). + +--- + +### Example 1: Python — Aggregating Data + +**Given pool.yml:** +```yaml +title: "Data Manipulation with pandas" +description: "Practice essential pandas operations for data analysis."
+unit: data-aggregation +subskill: groupby-operations +item_writer_id: 999999999 +``` + +**Generated item:** + +~~~markdown +--- +title: Data Manipulation with pandas +output: html_document +description: Practice essential pandas operations for data analysis. +--- + +## [Aggregating Data by Groups] + +```yaml +type: BlanksChallenge +key: +unit: data-aggregation +subskill: groupby-operations +initial_difficulty: 0 +item_writer_id: 999999999 +``` + +`@context` +You're a data analyst on an e-commerce marketing team reviewing quarterly sales. Calculate the average order value for each customer segment to identify high-value groups. + +`@code1` +```{python} +segment_avg = orders.{{_expr1}}("segment")["order_value"].{{_expr2}}() +print(segment_avg.sort_index()) +``` + +`@pre_challenge_code` +```{python} +import pandas as pd +orders = pd.DataFrame({ + "segment": ["retail", "wholesale", "retail", "wholesale"], + "order_value": [150, 500, 200, 450] +}) +``` + +`@variables` +```yaml +expr1: + - 'groupby' +expr2: + - 'mean' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +### Example 2: R β€” Filtering Data + +**Given pool.yml:** +```yaml +title: "Data Wrangling with dplyr" +description: "Practice essential tidyverse operations for data manipulation." +unit: data-filtering +subskill: filter-operations +item_writer_id: 999999999 +``` + +**Generated item:** + +~~~markdown +--- +title: Data Wrangling with dplyr +output: html_document +description: Practice essential tidyverse operations for data manipulation. +--- + +## [Filtering Rows by Condition] + +```yaml +type: BlanksChallenge +key: +unit: data-filtering +subskill: filter-operations +initial_difficulty: 0 +item_writer_id: 999999999 +``` + +`@context` +You're a research assistant preparing survey data for analysis. Filter the dataset to include only respondents who completed the full survey. 
+ +`@code1` +```{r} +complete_responses <- survey_data %>% + {{_expr1}}(response_complete {{_expr2}} TRUE) +print(complete_responses) +``` + +`@pre_challenge_code` +```{r} +library(dplyr) +survey_data <- data.frame( + respondent_id = c(1, 2, 3, 4), + response_complete = c(TRUE, FALSE, TRUE, TRUE) +) +``` + +`@variables` +```yaml +expr1: + - 'filter' +expr2: + - '==' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +### Example 3: SQL β€” Joining Tables + +**Given pool.yml:** +```yaml +title: "SQL Fundamentals" +description: "Practice essential SQL operations for data retrieval." +unit: table-joins +subskill: inner-join-operations +item_writer_id: 999999999 +``` + +**Generated item:** + +~~~markdown +--- +title: SQL Fundamentals +output: html_document +description: Practice essential SQL operations for data retrieval. +--- + +## [Joining Customer and Order Tables] + +```yaml +type: BlanksChallenge +key: +unit: table-joins +subskill: inner-join-operations +initial_difficulty: 0 +item_writer_id: 999999999 +``` + +`@context` +You're a database analyst creating a sales report for the executive team. Combine customer details with their order history from two separate tables. + +`@code1` +```{sql} +SELECT customers.name, orders.order_date, orders.amount +FROM customers +{{_expr1}} orders +{{_expr2}} customers.customer_id = orders.customer_id +``` + +`@pre_challenge_code` +```{python} +# SQL exercises use Python for setup +import pandas as pd +customers = pd.DataFrame({"customer_id": [1, 2], "name": ["Alice", "Bob"]}) +orders = pd.DataFrame({"order_id": [101, 102], "customer_id": [1, 2], "order_date": ["2024-01-15", "2024-01-16"], "amount": [150, 200]}) +``` + +`@variables` +```yaml +expr1: + - 'INNER JOIN' +expr2: + - 'ON' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +## YAML OUTPUT FORMAT (Template) + +Use this template for generating BlanksChallenge items. Replace `{language}` with `python`, `r`, or `sql`. 
+ +~~~markdown +## [{Learning Objective Title}] + +```yaml +type: BlanksChallenge +key: +unit: <2-3 word kebab-case phrase> +subskill: {{pool.subskill}} +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +{Role-based scenario: "You're a [role] on a team [doing X]. [Imperative action]."} + +`@code1` +```{language} +{code with {{_expr}} placeholders} +``` + +`@pre_challenge_code` +```{language} +{setup code} +``` + +`@variables` +```yaml +expr1: + - '{solution_token}' +``` + +`@distractors` +```yaml +{optional distractors} +``` +~~~ + +--- + +## SPECIAL FORMATTING RULES + +### Language-Specific Code Blocks + +| Language | `@code1` tag | `@pre_challenge_code` tag | +|----------|--------------|---------------------------| +| Python | `{python}` | `{python}` | +| R | `{r}` | `{r}` | +| SQL | `{sql}` | `{python}` ← special case | + +### Field Requirements + +| Field | Value | Notes | +|-------|-------|-------| +| `type` | `BlanksChallenge` | Always this value | +| `key` | (leave blank) | Auto-generated by system | +| `unit` | (generate) | 2-3 word kebab-case phrase (e.g., `llm-metrics`) | +| `subskill` | From pool.yml | Kebab-case learning objective | +| `initial_difficulty` | `0` | Default starting difficulty | +| `item_writer_id` | `'999999999'` | Always this value | + +--- + +## BEST PRACTICES + +### 1. Scaffolding Strategy +- Focus scaffolding on concepts from learning objectives, not busy work +- Don't scaffold every line β€” be strategic +- Each `{{_expr}}` should require thought, not just typing +- Test structure and reasoning, not memorization + +### 2. Context-to-Code Alignment +- Context should cue the concept being tested +- If context mentions "calculate the average," there should be `{{_expr}}` for that +- Don't have blanks that aren't guided by context + +### 3. 
Code Clarity +- Comments are acceptable in `@pre_challenge_code` +- Comments should **never** appear in `@code1` +- Keep code minimal and focused on the tested concept + + +### 4. Pre-challenge Code Setup +- Handle all imports and data loading here +- Set up any helper functions or background variables +- Keep it clean β€” candidates don't see this code +- Don't put learning objectives in pre-challenge code +- For DataFrames: keep to ≀10 rows + +### 5. Blank Design +- Test structure and reasoning, not memorization +- Each blank tests ONE concept with ONE correct answer +- Avoid blanks that could have multiple valid solutions +- Never use string literals as blank answers +- 1-3 tokens per blank + +### 6. Fresh Scenarios +- Create NEW scenarios; do not reuse course content verbatim +- Treat course code as structural templates only +- Test application of concepts in novel contexts + +--- + +## BLANKSCHALLENGE VALIDATION CHECKLIST + +Before finalizing a BlanksChallenge item, verify: + +### Pool.yml Compliance +- βœ… `pool.yml` was provided and loaded before generation +- βœ… `unit:` is a 2-3 word kebab-case phrase (e.g., `llm-metrics`) +- βœ… `subskill:` matches value from pool.yml +- βœ… `item_writer_id:` is `'999999999'` + +### Item Structure +- βœ… `key:` field is present but blank +- βœ… No duplicate concepts across items in set +- βœ… Tests only visible structure (no field/key recall) +- βœ… Each blank has exactly one valid solution +- βœ… No IPython magics used +- βœ… Standard Python only +- βœ… Output is deterministic and ordered +- βœ… No raw sets/Counter/dict output (unless sorted) +- βœ… Blanks are NOT text strings +- βœ… Each blank has 1-3 tokens +- βœ… Scenario is NEW (not copied from course) +- βœ… Context cues the concept without giving answer + +--- + +## COMMON PITFALLS TO AVOID + +1. **Over-scaffolding**: Every line has `{{_expr}}` β€” no strategic focus +2. **Under-scaffolding**: Key concepts not tested +3. 
**Context mismatch**: Context doesn't align with where `{{_expr}}` appears +4. **Scaffolding imports**: Never scaffold import statements unless that's the objective +5. **Incomplete code**: Solution doesn't actually run or has syntax errors +6. **Testing memorization**: Asking for field names like `'loss'` not shown in code +7. **String blanks**: Making the answer a text string value +8. **Multiple valid answers**: Blank could be filled multiple ways +9. **Non-deterministic output**: Using randomness or unsorted collections +10. **Copying course content**: Reusing scenarios verbatim +11. **Overly simple items**: One-liners requiring no reasoning +12. **Too many blanks**: Keep items focused on 1-2 key concepts +13. **Wrong language tag**: Using `{python}` when item is for R or SQL +14. **Missing pool.yml values**: Forgetting to populate from pool.yml + +--- + +## QUALITY CHECKLIST + +Before finalizing a BlanksChallenge item, verify: + +### Structure +- βœ… All required sections present (`@context`, `@code1`, `@variables`) +- βœ… Code block tags match the target language +- βœ… Pre-challenge code handles all setup work +- βœ… Code actually runs and produces expected output + +### Style +- βœ… Code follows language-specific style guide (PEP 8, tidyverse, Holywell) +- βœ… Technical terms in context use backticks +- βœ… No comments in `@code1` + +### Content +- βœ… Scenario is fresh (not copied from course) +- βœ… Context cues the concept without giving away answer +- βœ… Each blank has exactly one valid solution + +--- + +## EDUCATIONAL DESIGN NOTES + +### Difficulty Progression +- **Beginner**: More context clues, simpler code structures +- **Intermediate**: Strategic blanks on key concepts, multi-step operations +- **Advanced**: Minimal context clues, complex expressions + +### Cognitive Load +- Don't test too many concepts in one item +- Focus on 1-2 key testing points per item +- Keep code minimal and readable + +### Real-World Relevance +- Use realistic scenarios 
in context +- Connect to practical applications +- Use real package/library names and realistic data + +### Core Design Principles + +| Principle | Description | +|-----------|-------------| +| **Application over recall** | Test applying concepts, not memorizing syntax | +| **Structure reasoning** | Focus on why code works, not what to type | +| **Authentic contexts** | Use real data tasks (filter, aggregate, reshape, join) | +| **Deterministic always** | Same input → same output, every time | +| **Fresh scenarios** | New examples inspired by (not copied from) course | +| **Language-agnostic thinking** | Concepts transfer across Python, R, SQL | + +--- + +## 🔧 VALIDATION & PREVIEW + +### Validate Structure +```bash +python3 .cursor/validators/python_coding_validator.py /tmp/blanks_items.md +``` + +### Generate Preview +```bash +python3 .cursor/preview/generate_blanks_preview.py /tmp/blanks_items.md --scripts <scripts_dir> --exercises <exercises_dir> +``` + +**Important: Course Content Paths** +For previews to show course references, you MUST provide the course content directories: +- `--scripts <scripts_dir>` — Directory containing video script files (e.g., `chapter_1_scripts.txt`) +- `--exercises <exercises_dir>` — Directory containing exercise markdown files (e.g., `chapter1.md`) + +Course content may include code snippets in both `.txt` files (video scripts) and `.md` files (exercise chapters). The `.md` files typically contain more structured code examples with `@solution` blocks. + +**Example with course content:** +```bash +python3 .cursor/preview/generate_blanks_preview.py /tmp/blanks_items.md \ + --scripts ~/Downloads/scripts \ + --exercises ~/Downloads +``` + +--- + +This is your complete reference for generating high-quality BlanksChallenge items across all supported languages!
diff --git a/.cursor/rules/learning-objective-discovery.md b/.cursor/rules/learning-objective-discovery.md new file mode 100644 index 0000000..924ecd0 --- /dev/null +++ b/.cursor/rules/learning-objective-discovery.md @@ -0,0 +1,278 @@ +# Learning Objective Discovery - Complete Reference + +Systematically discover and structure learning objectives from course content before generating assessment items. + +--- + +## Type Identifier + +**Skill Name:** `learning_objective_discovery` + +**Used For:** +- Identifying main learning objectives from video scripts and slides +- Breaking down main LOs into assessable sub-LOs (one per item) +- Determining appropriate item type for each sub-LO +- Ensuring LO wording matches what the item type can actually test +- Validating alignment between LOs and course content + +--- + +## WHEN TO USE THIS SKILL + +Trigger LO discovery: +- Before generating multiple assessment items for a chapter +- When starting work on a new course section +- When explicitly requested ("discover learning objectives", "what are the LOs") +- Before batch item generation + +--- + +## LO DISCOVERY PROCESS + +``` +Course Content β†’ Main LOs β†’ Sub-LOs β†’ [Per sub-LO: analyze content β†’ determine item type β†’ word appropriately] β†’ Validate +``` + +### Step 1: Main LO Identification (by Chapter) + +Analyze video scripts and slides for each chapter to extract main learning objectives. + +**How to identify main LOs:** +- Look for explicit statements: "By the end of this lesson, you will be able to..." 
+- Identify the core skills being taught in each video/lesson +- Focus on what learners should be able to DO (not just know) + +**Main LO format:** Action verb + concept + context + +**Examples:** +- "Use `.groupby()` to aggregate data by categories" +- "Apply machine learning models to predict customer churn" +- "Write SQL queries to join multiple tables" + +**Guidelines:** +- Extract 3-5 main LOs per chapter +- Each main LO should represent a significant capability +- Main LOs can be broad (they will be broken into sub-LOs) + +--- + +### Step 2: Sub-LO Decomposition (per Item) + +Break each main LO into granular, assessable sub-LOs. Each sub-LO becomes ONE assessment item. + +**Sub-LO requirements:** +- Specific enough to test in a single item +- References specific course content (video timestamp, slide number) +- Distinct from other sub-LOs (no overlap) +- One sub-LO = one assessment item + +**Example decomposition:** + +| Main LO | Sub-LOs | +|---------|---------| +| Use `.groupby()` to aggregate data | 1. Apply `.groupby()` with a single column
2. Chain `.groupby()` with aggregation methods
3. Identify when to use `.groupby()` vs `.pivot_table()` | + +--- + +### Step 3: Item Type Analysis (per Sub-LO) + +**For EACH sub-LO**, analyze the specific course content it references to determine the appropriate item type. + +| Content Type | Indicators | Recommended Item Type | +|--------------|------------|----------------------| +| **Conceptual** | Definitions, explanations, comparisons, "why" discussions, decision criteria, trade-offs | **MCQ** | +| **Coding/Procedural** | Code examples, syntax demonstrations, step-by-step procedures, "how to" content | **BlanksChallenge** | + +**Important:** A single main LO can have sub-LOs of different types. + +**Example:** +``` +Main LO: "Use `.groupby()` for data aggregation" + +Sub-LO 1: "Apply `.groupby()` syntax" + β†’ References: Code example at Video 1.2, 03:45 + β†’ Content type: Coding (syntax demo) + β†’ Item type: BlanksChallenge + +Sub-LO 2: "Identify when `.groupby()` is more appropriate than `.pivot_table()`" + β†’ References: Comparison discussion at Video 1.2, 05:30 + β†’ Content type: Conceptual (comparison) + β†’ Item type: MCQ +``` + +#### Analysis Signals for MCQ (Conceptual Content) + +Look for these patterns in the referenced content: +- "X is defined as..." +- "The difference between X and Y is..." +- "You should use X when..." +- "X is important because..." +- Comparison tables +- Decision trees or flowcharts +- Best practices discussions +- Trade-off explanations +- "Why" questions answered + +#### Analysis Signals for BlanksChallenge (Coding Content) + +Look for these patterns in the referenced content: +- Code blocks with syntax examples +- "To do X, use the following code..." 
+- Method/function demonstrations +- Step-by-step coding workflows +- "How to" instructions with code +- Syntax patterns being taught +- Parameter usage examples + +--- + +### Step 4: LO Wording by Item Type (per Sub-LO) + +**Critical: The action verb must be testable by the chosen item type.** + +| Item Type | Cognitive Level | Recommended Action Verbs | Verbs to AVOID | +|-----------|-----------------|--------------------------|----------------| +| **BlanksChallenge** | Application/Structure | Apply, Use, Complete, Implement, Construct, Write, Execute | Explain, Describe, Compare, Identify (not testable by filling in code) | +| **MCQ** | Understanding/Reasoning | Explain, Identify, Select, Determine, Predict, Distinguish, Compare, Recognize | Apply, Implement, Write, Execute, Complete (requires actual coding) | + +#### Why This Matters + +- **BlanksChallenge** tests whether learners can write/complete code correctly +- **MCQ** tests whether learners understand concepts and can make decisions +- Using the wrong verb creates an untestable LO + +#### Examples of Properly Matched LO Wording + +| Sub-LO Content Reference | Item Type | Good LO Wording | Bad LO Wording | +|--------------------------|-----------|-----------------|----------------| +| Code example of `.groupby()` syntax | BlanksChallenge | "Apply `.groupby()` to aggregate DataFrame columns" | "Explain what `.groupby()` does" | +| Discussion of when to use inner vs outer join | MCQ | "Identify when to use an inner join versus an outer join" | "Implement an inner join query" | +| Comparison of `fit()` vs `fit_transform()` | MCQ | "Distinguish between `fit()` and `fit_transform()` methods" | "Use `fit_transform()` on training data" | +| Code walkthrough of method chaining | BlanksChallenge | "Complete a pandas method chain for data transformation" | "Describe the purpose of method chaining" | +| Explanation of why normalization matters | MCQ | "Explain why feature normalization improves model performance" | 
"Apply normalization to a dataset" | + +--- + +### Step 5: Content Alignment Validation + +Before finalizing the LO table, validate alignment: + +**For each sub-LO, verify:** +- [ ] Cites specific course content (video timestamp or slide reference) +- [ ] The cited content actually teaches what the sub-LO claims +- [ ] Item type matches the nature of the cited content +- [ ] Action verb is testable by the chosen item type +- [ ] Sub-LO is distinct from other sub-LOs (no duplication) + +**Red flags:** +- Sub-LO references content that doesn't exist β†’ Remove or reassign +- Sub-LO tests something not explicitly taught β†’ Flag as out of scope +- Multiple sub-LOs test the same concept β†’ Merge or differentiate + +--- + +## OUTPUT FORMAT + +The LO discovery skill produces a structured table: + +```markdown +## Chapter X: [Chapter Title] + +| Main LO | Sub-LO | Content Type | Item Type | Action Verb | Course Reference | +|---------|--------|--------------|-----------|-------------|------------------| +| [Main objective] | [Specific sub-objective] | [Conceptual/Coding] | [MCQ/BlanksChallenge] | [Verb] | [Video X.Y, MM:SS] | +``` + +### Complete Example + +```markdown +## Chapter 1: Introduction to pandas + +| Main LO | Sub-LO | Content Type | Item Type | Action Verb | Course Reference | +|---------|--------|--------------|-----------|-------------|------------------| +| Use `.groupby()` for aggregation | Apply `.groupby()` with a single column | Coding (syntax demo) | BlanksChallenge | Apply | Video 1.2, 03:45 | +| Use `.groupby()` for aggregation | Identify when to use `.groupby()` vs `.pivot_table()` | Conceptual (comparison) | MCQ | Identify | Video 1.2, 05:30 | +| Understand aggregation functions | Distinguish between `sum()`, `mean()`, and `count()` | Conceptual (definitions) | MCQ | Distinguish | Video 1.3, 01:20 | +| Chain pandas methods | Complete a method chain for data transformation | Coding (procedure) | BlanksChallenge | Complete | Video 1.4, 02:15 | +| 
Chain pandas methods | Predict the output of a method chain | Conceptual (reasoning) | MCQ | Predict | Video 1.4, 04:00 | +``` + +--- + +## QUICK REFERENCE: ACTION VERBS + +### For BlanksChallenge (Coding Items) +- **Apply** - Use a method/function in context +- **Use** - Employ a specific syntax or pattern +- **Complete** - Fill in missing code to achieve a result +- **Implement** - Write code that accomplishes a task +- **Construct** - Build a data structure or query +- **Write** - Create code from requirements +- **Execute** - Run a sequence of operations + +### For MCQ (Conceptual Items) +- **Identify** - Recognize the correct option +- **Select** - Choose the appropriate approach +- **Distinguish** - Tell apart similar concepts +- **Compare** - Evaluate differences between options +- **Explain** - Understand why something works +- **Determine** - Figure out the correct answer +- **Predict** - Anticipate an outcome +- **Recognize** - Spot a pattern or concept + +--- + +## COMMON PITFALLS TO AVOID + +1. **Verb mismatch**: Using "Explain" for a BlanksChallenge or "Implement" for an MCQ +2. **Missing citations**: Sub-LOs without specific course content references +3. **Overlapping sub-LOs**: Multiple items testing the same concept +4. **Out-of-scope LOs**: Testing concepts not covered in the referenced content +5. **Wrong item type**: Assigning MCQ to code syntax or BlanksChallenge to conceptual content +6. **Too broad sub-LOs**: Sub-LOs that can't be tested in a single item +7. **Assuming item type from main LO**: Always analyze at the sub-LO level + +--- + +## WORKFLOW INTEGRATION + +After discovering LOs, the workflow continues: + +1. **LO Discovery** (this skill) β†’ Produces LO table with item types +2. **Read Item Type Rules** β†’ Load rules for the determined item type (MCQ or BlanksChallenge) +3. **Generate Item** β†’ Create item aligned to the sub-LO +4. **Validate** β†’ Check structure +5. **Preview** β†’ Visual review +6. 
**Iterate** → Refine as needed + +--- + +## EXAMPLE SESSION + +**User:** "Discover learning objectives for Chapter 2 based on @slides/chapter_2.md" + +**Assistant:** +1. Reads chapter content +2. Extracts main LOs +3. Breaks into sub-LOs +4. For each sub-LO: analyzes referenced content → determines item type → words appropriately +5. Validates alignment +6. Outputs structured LO table + +**Output:** +```markdown +## Chapter 2: Data Cleaning with pandas + +| Main LO | Sub-LO | Content Type | Item Type | Action Verb | Course Reference | +|---------|--------|--------------|-----------|-------------|------------------| +| Handle missing data | Apply `.fillna()` to replace missing values | Coding (syntax) | BlanksChallenge | Apply | Video 2.1, 02:30 | +| Handle missing data | Identify appropriate strategies for different missing data patterns | Conceptual (decision) | MCQ | Identify | Video 2.1, 05:15 | +| Clean string data | Use `.str` accessor methods for text cleaning | Coding (methods) | BlanksChallenge | Use | Video 2.2, 03:00 | +| Clean string data | Distinguish between `.strip()`, `.lower()`, and `.replace()` | Conceptual (comparison) | MCQ | Distinguish | Video 2.2, 06:45 | +``` + +**User can then say:** "Generate a BlanksChallenge item for the first sub-LO" + +--- + +This is your complete reference for discovering and structuring learning objectives before assessment item generation. diff --git a/.cursor/rules/python-blanks-challenge.md b/.cursor/rules/python-blanks-challenge.md new file mode 100644 index 0000000..efa8ff4 --- /dev/null +++ b/.cursor/rules/python-blanks-challenge.md @@ -0,0 +1,436 @@ +# Python BlanksChallenge - Language-Specific Reference + +Python-specific guidance for generating BlanksChallenge items. This supplements the generic `coding-exercise.md` with Python-focused rules.
+ +--- + +## Type Identifier + +**Type Name:** `BlanksChallenge` (Python) + +**Used For:** +- Python programming assessments +- Data manipulation with pandas, numpy +- API integrations (OpenAI, requests) +- Machine learning workflows +- General Python scripting + +--- + +## PYTHON PLACEHOLDER SYNTAX + +Use `{{_expr1}}`, `{{_expr2}}`, etc. for blanks in Python code. + +### What to Test with Blanks +- Method calls: `data.{{_expr1}}("column")` +- Parameters: `model={{_expr1}}` +- Operators: `x {{_expr1}} y` +- Function names: `{{_expr1}}(data)` +- Indexing: `data[{{_expr1}}]` +- Method chains: `df.{{_expr1}}("col").{{_expr2}}()` +- List comprehension parts: `[x {{_expr1}} 2 for x in numbers]` + +### What NOT to Test +- Import statements (unless that's the learning objective) +- Basic syntax keywords (`if`, `for`, `def`, `class`) +- String literals or field names not visible in code +- Variable names that could have multiple valid options +- `if __name__ == "__main__":` + +--- + +## PYTHON CODE STYLE + +### PEP 8 Compliance +- 4-space indentation +- `snake_case` for variables and functions +- `PascalCase` for classes +- Line length under 79 characters (soft limit) +- Two blank lines before top-level definitions + +### Pythonic Patterns +- List comprehensions over explicit loops when clearer +- F-strings over `.format()` or `%` formatting +- Context managers (`with` statements) for file operations +- Unpacking: `a, b = get_values()` +- Ternary expressions when simple: `x = a if condition else b` + +### Method Chaining (pandas) + +```python +# Good - clear chain +result = (df + .groupby("category") + .agg({"value": "mean"}) + .reset_index()) + +# Also good - single line for short chains +result = df.groupby("category").mean() +``` + +--- + +## CODE BLOCK TAGS + +All Python BlanksChallenge items use: + +| Section | Tag | +|---------|-----| +| `@code1` | `{python}` | +| `@pre_challenge_code` | `{python}` | + +--- + +## COMPLETE EXAMPLES + +### Example 1: pandas groupby + 
+**Context:** +You're analyzing customer orders for an e-commerce platform. Calculate the average order value by customer segment. + +**@code1:** +```python +segment_avg = orders.{{_expr1}}("segment")["order_value"].{{_expr2}}() +print(segment_avg.sort_index()) +``` + +**@pre_challenge_code:** +```python +import pandas as pd +orders = pd.DataFrame({ + "segment": ["retail", "wholesale", "retail", "wholesale"], + "order_value": [150, 500, 200, 450] +}) +``` + +**@variables:** +```yaml +expr1: + - 'groupby' +expr2: + - 'mean' +``` + +--- + +### Example 2: List comprehension + +**Context:** +You're processing a list of temperatures in Celsius. Convert each temperature to Fahrenheit using the formula: F = C × 9/5 + 32. + +**@code1:** +```python +fahrenheit = [c {{_expr1}} 9/5 {{_expr2}} 32 for c in celsius] +print(fahrenheit) +``` + +**@pre_challenge_code:** +```python +celsius = [0, 10, 20, 30] +``` + +**@variables:** +```yaml +expr1: + - '*' +expr2: + - '+' +``` + +--- + +### Example 3: Dictionary operations + +**Context:** +You're building a configuration system. Access the nested 'timeout' value from the settings dictionary. + +**@code1:** +```python +timeout = settings{{_expr1}}"network"{{_expr2}}{{_expr1}}"timeout"{{_expr2}} +print(timeout) +``` + +**@pre_challenge_code:** +```python +settings = { + "network": {"timeout": 30, "retries": 3}, + "display": {"theme": "dark"} +} +``` + +**@variables:** +```yaml +expr1: + - '[' +expr2: + - ']' +``` + +--- + +### Example 4: OpenAI API + +**Context:** +You're building a chatbot. Send a completion request to OpenAI's API using the client. + +**@code1:** +```python +response = client.chat.completions.{{_expr1}}( + model="gpt-4", + messages=[{"role": "user", "content": prompt}] +) +print(response.choices[0].message.{{_expr2}}) +``` + +**@pre_challenge_code:** +```python +from openai import OpenAI +client = OpenAI(api_key="test-key") +prompt = "Hello, world!"
+``` + +**@variables:** +```yaml +expr1: + - 'create' +expr2: + - 'content' +``` + +--- + +### Example 5: pandas filtering + +**Context:** +You're cleaning survey data. Filter the DataFrame to keep only rows where the response is complete. + +**@code1:** +```python +complete = survey[survey["status"] {{_expr1}} "complete"] +print(complete) +``` + +**@pre_challenge_code:** +```python +import pandas as pd +survey = pd.DataFrame({ + "respondent_id": [1, 2, 3, 4], + "status": ["complete", "partial", "complete", "complete"] +}) +``` + +**@variables:** +```yaml +expr1: + - '==' +``` + +--- + +## SKILL ASSESSMENT ITEM DESIGN + +When creating multiple BlanksChallenge items for a skill assessment: + +### Avoid Duplicate Blanks +- Each item should test a **unique** concept across the assessment +- Track blanks used: don't test `fit` in 5 different items +- Create a blank distribution table before generating items + +### Avoid Arbitrary Values +❌ **Bad**: `test_size={{_expr1}}` where answer is `0.2` (arbitrary, not conceptual) +✅ **Good**: `knn.{{_expr1}}(X_test, y_test)` where answer is `score` (tests method knowledge) + +### Avoid Wordspotting +The context should NOT contain words that directly cue the answer: +❌ **Bad**: "Perform cross-validated grid search..." → `GridSearchCV` +✅ **Good**: "Systematically test combinations of hyperparameter values..." → `GridSearchCV` + +### Fresh Examples Required +- Never lift code verbatim from course exercises +- Use different: + - Dataset names (`employee_df` not `music_df`) + - Variable names + - Context scenarios +- Same concept, fresh implementation + +### Verify Course Coverage +Before creating a blank, verify: +1. The exact function/method name is taught in the course +2. The specific syntax matches what's taught +3. The concept appears in video scripts AND/OR exercises + +--- + +## ENSURING UNAMBIGUOUS ANSWERS + +### Single Valid Answer Rule +Each blank must have exactly ONE correct answer.
Check for: + +| Potential Issue | Example | Solution | +|-----------------|---------|----------| +| Similar methods | `fit` vs `fit_transform` | Add context showing return value usage | +| Similar classes | `GridSearchCV` vs `RandomizedSearchCV` | Use distinguishing parameters (`param_grid=` vs `param_distributions=`) | +| Multiple valid approaches | `.sum()` vs `.agg("sum")` | Restructure to test unambiguous method | + +### Distinguishing Multiple Blanks +When an item has 2+ blanks, candidates must be able to tell them apart: + +❌ **Bad** (ambiguous): +```python +knn.{{_expr1}}(X_train, y_train) +accuracy = knn.{{_expr2}}(X_test, y_test) +``` +Both `fit` and `score` take similar arguments - candidate may guess wrong positions. + +✅ **Good** (distinguishable): +```python +knn.{{_expr1}}(X_train, y_train) +predictions = knn.{{_expr2}}(X_test) # Returns predictions array +``` +`fit` takes X and y; `predict` takes only X - clear distinction. + +### Import Placement for Class Names +When testing class names, move imports to `@pre_challenge_code`: + +❌ **Bad** (two identical blanks): +```python +from sklearn.linear_model import {{_expr1}} +reg = {{_expr1}}() +``` + +✅ **Good** (single blank): +```python +# In @pre_challenge_code: +from sklearn.linear_model import LinearRegression + +# In @code1: +reg = {{_expr1}}() +``` + +--- + +## PREPROCESSING ITEMS + +### Show Data Context +When testing preprocessing operations, display the data so candidates understand WHY the operation is needed: + +✅ **Good** (shows categorical column): +```python +print(employee_df.head()) +employee_dummies = pd.get_dummies(employee_df, {{_expr1}}=True) +``` + +❌ **Bad** (no data visibility): +```python +employee_dummies = pd.get_dummies(employee_df, {{_expr1}}=True) +``` + +### Common Preprocessing Blanks +| Concept | Recommended Blank | Avoid | +|---------|-------------------|-------| +| Dummy variables | `drop_first` parameter | `get_dummies` (too easy) | +| Imputation |
`strategy="mean"` | `fit_transform` (generic) | +| Scaling | `StandardScaler` class | `fit` (duplicated elsewhere) | + +--- + +## COMMON PYTHON PATTERNS TO TEST + +### pandas Operations + +| Pattern | Blank Example | +|---------|---------------| +| Grouping | `df.{{_expr}}("column")` | +| Aggregation | `grouped.{{_expr}}()` | +| Filtering | `df[df["col"] {{_expr}} value]` | +| Sorting | `df.{{_expr}}("column")` | +| Merging | `pd.{{_expr}}(df1, df2, on="key")` | +| Selecting | `df{{_expr}}"column"{{_expr}}` | + +### String Operations + +| Pattern | Blank Example | +|---------|---------------| +| F-string | `f"Hello, {{_expr}}name{{_expr}}"` | +| Method | `text.{{_expr}}()` | +| Join | `" ".{{_expr}}(words)` | + +### Control Flow + +| Pattern | Blank Example | +|---------|---------------| +| Comparison | `x {{_expr}} y` | +| Membership | `item {{_expr}} collection` | +| Boolean | `a {{_expr}} b` | + +### Functions + +| Pattern | Blank Example | +|---------|---------------| +| Built-in | `{{_expr}}(iterable)` | +| Method call | `obj.{{_expr}}()` | +| Chained | `obj.{{_expr1}}().{{_expr2}}()` | + +--- + +## DETERMINISTIC OUTPUT (Python-Specific) + +Ensure reproducible output: + +```python +# ✅ Good - sorted output +print(sorted(my_dict.items())) + +# ❌ Bad - dict order may vary in older Python +print(my_dict) + +# ✅ Good - fixed seed +import random +random.seed(42) + +# ❌ Bad - random output +print(random.random()) + +# ✅ Good - sorted set +print(sorted(my_set)) + +# ❌ Bad - set order undefined +print(my_set) +``` + +--- + +## QUALITY CHECKLIST (Python) + +Before finalizing a Python BlanksChallenge item: + +- ✅ Code blocks tagged as `{python}` +- ✅ Follows PEP 8 style +- ✅ Uses Pythonic patterns +- ✅ No comments in `@code1` +- ✅ Deterministic output (sorted collections, fixed seeds) +- ✅ Each blank has exactly one valid Python token +- ✅ Blanks test structure/syntax, not string memorization +- ✅ Pre-challenge code handles all imports and setup +-
✅ DataFrames limited to ≤10 rows + +--- + +## COMMON PITFALLS (Python) + +1. **Testing string values**: `name = {{_expr}}` where answer is `"Alice"` — avoid this +2. **Multiple valid methods**: Both `.sum()` and `.aggregate("sum")` would work +3. **Indentation issues**: Blanks inside indented blocks must preserve structure +4. **Import scaffolding**: Never make imports a blank (unless testing class name - then move import to pre_challenge_code) +5. **Operator ambiguity**: `+` could be addition or concatenation — ensure context is clear +6. **Dict/set output**: Always sort before printing +7. **Course verbatim**: Copying exact code/datasets from course exercises — always use fresh examples +8. **Wordspotting**: Context that directly cues the answer (e.g., "grid search" → `GridSearchCV`) +9. **Duplicate blanks**: Same concept tested multiple times across assessment items +10. **Arbitrary values**: Testing memorization of numbers (e.g., `0.2` for test_size) rather than concepts +11. **Ambiguous positioning**: Multiple blanks with similar arguments that could be swapped +12. **Untaught concepts**: Testing functions/syntax not explicitly covered in the course materials + +--- + +This supplements the generic `coding-exercise.md` with Python-specific guidance. diff --git a/.cursor/rules/r-coding-exercise.md b/.cursor/rules/r-coding-exercise.md new file mode 100644 index 0000000..806d68c --- /dev/null +++ b/.cursor/rules/r-coding-exercise.md @@ -0,0 +1,348 @@ +# R BlanksChallenge Exercises - Complete Reference + +R-specific guidance for generating BlanksChallenge coding items. This supplements the generic `coding-exercise.md` with R-focused rules.
+ +--- + +## Type Identifier + +**Type Name:** `BlanksChallenge` + +**Used For:** +- Fill-in-the-blank R coding challenges +- Data manipulation with tidyverse (dplyr, tidyr) +- Data visualization with ggplot2 +- Statistical analysis +- General R scripting + +--- + +## R SCAFFOLDING RULES + +**Use `{{_exprN}}` placeholders for R BlanksChallenge items.** + +### What to Scaffold +- Function calls: `result <- {{_expr1}}(data)` +- Parameters: `filter(data, {{_expr1}})` +- Variable assignments: `filtered_data <- {{_expr1}}` +- Pipe chains: `data %>% {{_expr1}}()` +- ggplot layers: `ggplot(data, aes({{_expr1}})) + {{_expr2}}()` +- Column references: `select(data, {{_expr1}})` + +### What NOT to Scaffold +- Library calls (unless that's the learning objective) +- Basic syntax (`if`, `for`, `function` keywords) +- Comments +- Print statements (unless that's the objective) + +--- + +## R CODE STYLE + +### Assignment Operator +- **Always use `<-` for assignment** (tidyverse convention) +- Good: `result <- mean(x)` +- Bad: `result = mean(x)` + +### Pipe Operators (Default) +- **Use pipe operators (`%>%` or `|>`) unless otherwise specified** +- Break long chains across multiple lines +- Indent continuation lines + +```r +# Good - piped chain +result <- data %>% + filter(year > 2020) %>% + group_by(region) %>% + summarize(mean_value = mean(value)) + +# Also good - single line for short chains +result <- data %>% filter(year > 2020) +``` + +### tidyverse Style Guide +- snake_case for variables and functions +- Spaces around operators: `x <- 5`, not `x<-5` +- Spaces after commas: `c(1, 2, 3)`, not `c(1,2,3)` +- Line length under 80 characters + +### Data Frame Terminology +- Use "data frame" (lowercase, two words) when referring to R data frames +- Good: "The data frame `sales` contains..." +- Bad: "The DataFrame `sales` contains..." 
+ +--- + +## BLANKSCHALLENGE EXPORT FORMAT + +~~~markdown +--- +title: {{pool.title}} +output: html_document +description: {{pool.description}} +--- + +## [Learning Objective Title] + +```yaml +type: BlanksChallenge +key: +unit: <2-3 word kebab-case phrase> +subskill: {{pool.subskill}} +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +{Role-based scenario: "You're a [role] on a team [doing X]. [Imperative action]."} + +`@code1` +```{r} +result <- data %>% + {{_expr1}}(column > value) %>% + {{_expr2}}(category) +``` + +`@pre_challenge_code` +```{r} +library(dplyr) +data <- data.frame( + category = c("A", "B", "A", "B"), + value = c(10, 20, 15, 25) +) +``` + +`@variables` +```yaml +expr1: + - 'filter' +expr2: + - 'group_by' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +## CONTEXT GUIDELINES + +**Length:** 1-4 lines describing a real-world data task + +**Cognitive Level:** Access higher cognitive functions by placing candidates in rich, immersive scenarios—not just describing a task. + +**No Explicit Instructions:** Users always see "fill in the blank" automatically, so: +- ❌ Don't write "you need to find" or "your task is to" +- ✅ Use imperative verbs: "Filter", "Summarize", "Calculate", "Create" + +**Role-Based Framing:** Put the user IN the scenario as a team member. + +| ❌ Bad | ✅ Good | +|--------|---------| +| "You're analyzing data. You need to filter the data frame." | "You're a data analyst on a marketing team reviewing campaign results. Filter the data frame to include only successful campaigns." | + +**Pattern:** +``` +❌ "You're analyzing [thing]. You need to [action]." +✅ "You're a [role] on a team [doing X]. [Imperative action]."
+``` + +--- + +## SECTION REQUIREMENTS + +| Section | Required | Description | +|---------|----------|-------------| +| `@context` | Yes | 1-4 lines; role-based scenario with imperative action | +| `@code1` | Yes | Minimal, runnable R with `{{_exprN}}` placeholders | +| `@pre_challenge_code` | No | Setup code; data frames ≤10 rows | +| `@variables` | Yes | Maps each `exprN` to solution (single-element list) | +| `@distractors` | No | Optional incorrect choices | + +--- + +## COMPLETE EXAMPLE + +### Example: Filtering and Grouping with dplyr + +**Given pool.yml:** +```yaml +title: "Data Wrangling with tidyverse" +subskill: dplyr-operations +``` + +**Generated item:** + +~~~markdown +--- +title: Data Wrangling with tidyverse +output: html_document +--- + +## [Filtering and Summarizing Sales Data] + +```yaml +type: BlanksChallenge +key: +unit: dplyr-wrangling +subskill: dplyr-operations +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +You're a data analyst on a retail analytics team investigating regional performance. Filter the sales data for high-value transactions and calculate the average revenue by region.
+ +`@code1` +```{r} +high_value <- sales %>% + {{_expr1}}(revenue > 1000) + +regional_avg <- high_value %>% + group_by(region) %>% + {{_expr2}}(avg_revenue = mean(revenue)) +``` + +`@pre_challenge_code` +```{r} +library(dplyr) +sales <- data.frame( + region = c("North", "South", "North", "East", "South"), + revenue = c(1200, 800, 1500, 2000, 900) +) +``` + +`@variables` +```yaml +expr1: + - 'filter' +expr2: + - 'summarize' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +## COMMON TIDYVERSE PATTERNS + +### dplyr Verbs +```r +# Filtering rows +data %>% filter(column > value) + +# Selecting columns +data %>% select(col1, col2, col3) + +# Creating new columns +data %>% mutate(new_col = col1 + col2) + +# Grouping and summarizing +data %>% + group_by(category) %>% + summarize( + mean_val = mean(value), + count = n() + ) + +# Arranging rows +data %>% arrange(desc(column)) +``` + +### ggplot2 Layers +```r +# Basic plot structure +ggplot(data, aes(x = x_col, y = y_col)) + + geom_point() + + labs(title = "Title", x = "X Label", y = "Y Label") + + theme_minimal() + +# Common geoms +geom_point() # Scatter plot +geom_line() # Line plot +geom_bar() # Bar chart +geom_histogram() # Histogram +geom_boxplot() # Box plot +``` + +--- + +## BEST PRACTICES + +### 1. Default to Pipes +- Always use `%>%` or `|>` for chaining operations +- Makes code more readable and matches modern R style + +### 2. Strategic Scaffolding +- Focus on learning objectives, not busywork +- Don't scaffold every single line +- Each `{{_expr}}` should require understanding, not just typing + +### 3. Context Sets the Stage +- Always mention what packages are loaded in pre_challenge_code +- Use "data frame" (not DataFrame) for R +- Use realistic scenarios with role-based framing + +### 4. No Comments in @code1 +- Comments should NOT appear in the `@code1` section +- Keep code minimal and focused on the tested concept + +--- + +## COMMON PITFALLS TO AVOID + +1. 
**Wrong placeholder format**: Using `___` instead of `{{_exprN}}` +2. **Wrong assignment**: Using `=` instead of `<-` +3. **Wrong terminology**: "DataFrame" instead of "data frame" +4. **Missing pipes**: Not using `%>%` when appropriate +5. **Comments in @code1**: Code section should have no comments +6. **Over-scaffolding**: Every line has `{{_expr}}` +7. **Under-scaffolding**: Key concepts not tested +8. **String blanks**: Making the answer a text string value +9. **Non-deterministic output**: Using randomness without seeds + +--- + +## QUALITY CHECKLIST + +Before finalizing an R BlanksChallenge item, verify: + +- ✅ Placeholders use `{{_exprN}}` format +- ✅ Assignment uses `<-` operator +- ✅ Pipes (`%>%`) used for chaining +- ✅ Each `{{_expr}}` has matching entry in `@variables` +- ✅ No comments in `@code1` +- ✅ Context uses role-based framing with imperative action +- ✅ Pre-challenge code handles all setup (library calls, data loading) +- ✅ Code follows tidyverse style guide +- ✅ Uses "data frame" terminology (not DataFrame) +- ✅ `type: BlanksChallenge` in YAML +- ✅ `item_writer_id: '999999999'` +- ✅ All code blocks tagged as `{r}` + +--- + +## AUTOMATIC VALIDATION + +After generating the exercise, validate it: + +```bash +python .cursor/validators/r_coding_validator.py /tmp/exercise_to_validate.md +``` + +--- + +## PREVIEW + +Generate a visual preview with course content matching: + +```bash +python .cursor/preview/generate_blanks_preview.py /tmp/items.md \ + --scripts \ + --exercises +``` diff --git a/.cursor/rules/single-mcq-exercise.md b/.cursor/rules/single-mcq-exercise.md new file mode 100644 index 0000000..a87072f --- /dev/null +++ b/.cursor/rules/single-mcq-exercise.md @@ -0,0 +1,605 @@ +# Multiple Choice Challenge - Item Writing Guide + +Generate high-quality multiple-choice items that test genuine understanding through application.
+ +--- + +## ⚑ QUICK START: 6 Non-Negotiable Rules + +Before writing ANY item, internalize these: + +| Rule | What It Means | +|------|---------------| +| **1. Stem stands alone** | Question makes sense and is answerable WITHOUT seeing options | +| **2. No answer cueing** | Stem doesn't telegraph the correct answer through keywords or opposites | +| **3. Sufficient context** | Candidate can REASON to the answer from information in the stem | +| **4. Test application** | Ask what to DO with knowledge, not just classify or label | +| **5. Plausible distractors** | Each wrong answer tempts someone with a real misconception | +| **6. Balanced options** | Same length (±8 chars), same structure, same technical register | + +--- + +## 🎯 CORE PRINCIPLES + +### 1. Standalone Clarity + +**Every question must make complete sense without the options.** + +A knowledgeable test-taker should begin formulating an answer before seeing choices. + +| ✅ Good Stem | ❌ Bad Stem | +|--------------|-------------| +| "Why is containerization suitable for deploying microservices?" | "Which of the following is true about containerization?" | +| "What change should the company make to increase its MLOps maturity?" | "Which option best describes how to improve this company?" | + +### 2. No Answer Cueing + +**The stem must NOT telegraph the correct answer.** + +| Cueing Problem | Example | Fix | +|----------------|---------|-----| +| **Opposite mapping** | Stem lists anti-patterns → Answer is their opposite | Describe symptoms/outcomes instead | +| **Keyword matching** | Stem uses words that appear only in correct answer | Use keyword in 3+ options or none | +| **Single-factor decision** | Only one factor mentioned → obviously the trigger | Present multiple factors | + +### 3.
Application Over Recall + +**Questions should test what to DO with knowledge, not just classify.** + +| ❌ Recall/Classification | ✅ Application | +|--------------------------|----------------| +| "Which maturity level is this?" | "What change would increase the maturity level?" | +| "What type of drift is this?" | "What action would address this drift?" | +| "Which role is responsible?" | "What should this role do next?" | + +**When classification is acceptable:** When the scenario requires genuine interpretation, not just label-matching. + +### 4. Sufficient Context + +**The candidate must be able to reason to the answer from information provided in the stem.** + +| ❌ Insufficient | ✅ Sufficient | +|-----------------|---------------| +| "What does `$@` represent?" (pure recall) | "When you run the script with two files, both are processed. What happens with one file?" (can reason from behavior) | +| "Which command navigates directories?" | "You are in `/home/user/projects` and need to reach the `data` subdirectory." | + +**Test:** Before finalizing, ask: "Could a learner who understands the concept but hasn't memorized definitions answer this from the information given?" + +### 5. Code-Related Items in MCQ Format + +**Where possible, test code using Blanks/Coding items, not MCQ.** + +When MCQ is required for code-related concepts, follow these guidelines: + +#### ✅ DO: Test Understanding of What Commands DO + +**Approach A — Predict the outcome:** +> **Stem:** You run the command `mv report.txt summary.txt archive` from your current directory. The `archive` directory already exists. +> +> **Question:** What is the result of this command?
+> +> **Options:** +> - Both files are copied to archive and remain in the current directory +> - [Both files are moved to archive and removed from current directory] +> - Only the first file is moved; the second overwrites it in archive +> - The command fails because mv requires files to be moved one at a time + +**Approach B — Choose the right tool for a goal:** +> **Stem:** You need to organize project files. You want `data.csv` and `config.txt` placed into a `backup` folder while keeping the originals in your current directory for ongoing work. +> +> **Question:** Which command accomplishes this goal? +> +> **Options:** +> - `mv data.csv config.txt backup` +> - [`cp data.csv config.txt backup`] +> - `mv backup data.csv config.txt` +> - `rm data.csv config.txt backup` + +**Approach C — Explain why something fails:** +> **Stem:** A script contains `cat data/sales.csv` and runs correctly from `/home/user`. When run from `/home/user/projects`, it fails with "No such file or directory." The file still exists and has not changed. +> +> **Question:** Why does the script fail when run from the different directory? + +*(Tests understanding of underlying concepts, not syntax)* + +**Approach D — Identify the error/problem:** +> **Stem:** A colleague runs `rm -r projects` expecting to delete only empty directories, but all files inside are also deleted. +> +> **Question:** What caused this unexpected result? + +*(Tests understanding of what flags/options actually do)* + +**Approach E — Single-token completion:** +> **Stem:** You want to copy a file while keeping the original in place. Complete the command: +> +> ``` +> ___ report.txt backup/report.txt +> ``` +> +> **Question:** Which command completes this correctly?
+> +> **Options:** +> - `cp` +> - `mv` +> - `rm` +> - `cat` + +*(Tests command selection with minimal syntax noise — options are single tokens)* + +**Approach F — Compare two approaches:** +> **Stem:** You need to process all `.csv` files in a directory. +> +> **Question:** What is the difference between using `*.csv` and listing each file individually? + +*(Tests understanding of wildcards/patterns and when to use them)* + +**Approach G — Single blank in code line (in stem):** +> **Stem:** You need to count how many lines in `server.log` contain the word "error". You want to do this without creating intermediate files. +> +> Complete the command: `grep error server.log ___ wc -l` +> +> **Options:** +> - `|` +> - `>` +> - `+` +> - `&` + +*(The full command is in the stem; options are ONLY the single token to fill the blank. This avoids spot-the-difference by testing just the key decision point.)* + +#### ❌ DON'T: Test Spot-the-Difference Syntax + +| Approach | Problem | +|----------|---------| +| **Minor syntax variations** | `./archive` vs `archive` vs `/archive` — tests typo-spotting, not understanding | +| **Flag memorization** | `-n 5` vs `-5` vs `--lines=5` — tests syntax recall, not concept | +| **Path permutations** | Options differ only by `/`, `./`, `~`, `..` — wordspot-prone | +| **Full command variations** | Options are full commands differing by one symbol — use Approach G instead | +| **Hidden differences** | Showing two similar commands without highlighting what differs — make it explicit | + +#### When Showing Code Differences: Make Them Explicit + +If the stem compares two commands or shows an error, **state the difference explicitly** rather than requiring candidates to spot it: + +| ❌ Hidden Difference | ✅ Explicit Difference | +|---------------------|------------------------| +| "You type `head -n 5 \| tail -n 3 data.csv` instead of `head -n 5 data.csv \| tail -n 3`" | "You type `head -n 5 \| tail -n 3 data.csv` — notice that
`head` has no filename" | +| "Compare `for f in files` vs `for f in $files`" | "The loop uses `files` without a `$` prefix" | + +#### Why This Matters + +Syntax precision is better tested in interactive coding environments where: +- The learner types the command themselves +- Error messages provide feedback +- Partial credit can be given + +MCQ should test whether the learner **understands what tools do and when to use them**, not whether they can spot a missing character. + +--- + +## ✍️ WRITING THE STEM + +### Structure +1. **Context** (1-3 sentences): Real-world scenario +2. **Question** (1 sentence): Clear, direct question + +### Requirements +- State the central idea in the stem, not hidden in options +- Use positive phrasing (avoid "not" or "except") +- Keep wording concise (grade-8 readability) +- Test ONE concept aligned to ONE learning objective + +### Forbidden Language + +| ❌ Avoid | Why | ✅ Use Instead | +|----------|-----|----------------| +| "best" | Implies multiple partial answers | "What change would..." | +| "most appropriate" | Implies degrees of correctness | "Which role should..." | +| "most likely" | Ambiguous | "What does X indicate?" | +| "Which of the following" | Option-dependent | Direct question | + +### Stem-Option Grammatical Parallelism + +**The stem's framing must work grammatically with ALL options.** + +Certain verbs in the stem cue certain types of answers: + +| Cue Word | Cues Answer Type | Risk | +|----------|------------------|------| +| "represent" | Symbolic/variable interpretations | Distractors describing effects sound wrong | +| "cause" | Effects/outcomes | Distractors describing states sound wrong | +| "prevent" | Blocking actions | Distractors describing enablers sound wrong | + +**Rule:** Read each option as a completion of the stem.
If some options sound grammatically awkward: +- Reword the stem to be neutral, OR +- Reword all options to match the stem's framing + +### Multi-Factor Decision Scenarios + +**For decision-based items, present multiple factors where only ONE indicates the correct action.** + +**Structure:** +1. Describe 3-4 relevant factors from the course +2. Make 2-3 factors neutral (don't suggest action) +3. Make exactly ONE factor clearly indicate the correct action +4. Distractors reference the other factors + +| ❌ Single-Factor | ✅ Multi-Factor | +|------------------|-----------------| +| "Accuracy dropped below 90%." | "Business is stable, costs are high, but accuracy dropped below threshold." | + +--- + +## 🧠 WRITING THE OPTIONS + +### Structure Requirements +- **Exactly 4 options** per item +- **Exactly 1 correct answer** +- **3 plausible distractors** + +### Length Rule (CRITICAL) +- All options **within ±8 characters** of each other +- Correct answer **NOT longer** than any distractor + +**Exception — Terminology-only options:** When all options are standard technical terms (e.g., "Concept drift", "Covariate shift", "Batch prediction"), length imbalance is acceptable. These are fixed terms that cannot be adjusted, and the imbalance does not create cueing. + +### Parallel Structure +All options must share: +- Same grammatical form (all actions, all principles, all roles) +- Same structural complexity (if one has a list, all do) +- Same technical register (all technical OR all conceptual) + +**Default:** Simplify the correct answer to match distractor complexity.
+ +### Simple Label Rule + +When testing **known, finite categories** (phases, roles, strategies), use labels only: + +| ❌ With Description | βœ… Label Only | +|--------------------|---------------| +| "The design phase, where requirements are gathered" | "The design phase" | +| "The data engineer, who builds pipelines" | "The data engineer" | + +### Keyword Consistency (Anti-Wordspotting) + +If a distinctive keyword appears in the stem/context: + +| Distribution | Status | +|--------------|--------| +| ALL 4 options | βœ… Ideal | +| 3 of 4 options | βœ… Acceptable | +| 1-2 options only | ❌ Creates elimination cue | +| NO options | βœ… Acceptable (rephrase all) | + +--- + +## 🎭 DISTRACTOR QUALITY + +### Definition + +> A **plausible distractor** is an incorrect option that a learner with incomplete understanding would reasonably select because it appears relevant and defensible. + +### The Plausibility Test + +> "Would a learner who doesn't fully understand the concept reasonably choose this?" 
+ +### Types of Plausible Distractors + +| Type | Description | Example | +|------|-------------|---------| +| **Common misconception** | What learners often wrongly believe | "Feature stores replace data pipelines" | +| **Partial truth** | Correct elsewhere, wrong here | "Add more training data" | +| **Related concept confusion** | Mixing up similar concepts | Confusing data drift with concept drift | +| **Reasonable but insufficient** | Addresses symptom, not cause | "Add staging testing" when CI/CD is needed | + +### Distractor Rules + +| Rule | Requirement | +|------|-------------| +| **Technical Register** | Match the complexity of the correct answer | +| **Course-Aligned Vocabulary** | Use terms the learner has seen in the course | +| **Context-Rooted** | Reference information explicitly stated in the stem | +| **Scenario-Relevant** | Address the problem described | + +### Red Flags: Implausible Distractors + +| ❌ Red Flag | Why It Fails | +|-------------|--------------| +| Obviously absurd | "Delete all code and start over" | +| Unrelated to scenario | Problem is slow deploys / Option is "hire more analysts" | +| Wrong technical register | Key is technical / Distractor is generic | +| Unrooted | References info not in the stem | + +### Distractor Evaluation Checklist + +For each distractor, verify: + +- [ ] A learner could actually believe this (common misconception) +- [ ] It's clearly wrong based on how the command/concept WORKS (not data-dependent) +- [ ] It doesn't require seeing specific data to evaluate +- [ ] It's not a potential double key (could also be correct) +- [ ] It matches the stem's grammatical framing + +### Distractor Iteration Process + +When distractors aren't working: + +1. **Generate 5+ alternatives** with rationales +2. **Present to reviewer:** + ``` + **1. "[Distractor text]"** (XX chars) + - *Why plausible:* [Misconception it represents] + - *Why wrong:* [Why it doesn't address the concept] + ``` +3. **Reviewer selects 3** +4. 
**Balance lengths** after selection +5. **Check keyword consistency** + +--- + +## πŸ”„ ANSWER POSITION ROTATION + +- Distribute correct answers across positions 1, 2, 3, 4 +- Never same position more than 2x in a row +- Over 8+ items, each position appears at least once + +--- + +## πŸ“ MARKDOWN FORMAT + +~~~markdown +--- +title: +output: html_document +description: <1–2 line description> +--- + +## <3–4 Word Item Title> + +```yaml +type: MultipleChoiceChallenge +key: +unit: +subskill: +initial_difficulty: 0 +item_writer_id: '999999999' +# DEVELOPMENT FIELDS (remove before finalizing): +# course_section: "Video 1.1" +# course_content_reference: | +# **From Video 1.1:** +# "Verbatim passage from video script that teaches this concept..." +# +# **From chapter1.md:** +# "Verbatim passage from chapter file if relevant..." +``` + +`@assignment1` + + + + +`@options1` +- +- [] +- +- +~~~ + +### Development Fields (Temporary) + +During item creation, you MUST include these fields to ensure accurate course alignment in previews: + +| Field | Purpose | Example | +|-------|---------|---------| +| `course_section` | Source location (Video number or Chapter) | `"Video 1.1"` or `"Chapter 2 - Data Quality"` | +| `course_content_reference` | Verbatim passage(s) from course that teach the concept (1-2 paragraphs) | See example below | + +**These fields help verify course alignment but MUST be removed before finalizing items.** + +#### Course Content Reference Format + +When creating items, extract the EXACT verbatim passage(s) from course materials that teach the concept being tested. Include content from **both** `.txt` (video scripts) and `.md` (chapter files) when relevant: + +```yaml +# course_section: "Video 1.1" +# course_content_reference: | +# **From Video 1.1 (video script):** +# "Of course, most organizations start playing with ML without the Ops, +# manually executing all workflows and monitoring models only ad hoc. 
+# Many, unfortunately, don't evolve much further than that, paying dearly +# down the line. This causes the accumulation of so-called technical debt +# which Wikipedia defines as: the implied cost of additional rework caused +# by choosing an easy (limited) solution now instead of using a better +# approach that would take longer." +# +# "Implementing MLOps tools and practices will, on the other hand, make +# your processes automated, fast, reproducible, and explainable – producing +# the highest quality of service and earning the trust of your customers." +# +# **From chapter1.md:** +# "Technical debt accumulates when teams skip proper MLOps practices, +# leading to models that are difficult to update, monitor, or reproduce." +``` + +**Requirements:** +- Include 1-2 paragraphs that directly teach the concept +- Label each passage with its source (video script vs chapter file) +- Use verbatim quotes β€” do not paraphrase +- Include content from multiple sources when both cover the concept + +#### Stripping Development Fields + +To remove development fields before finalizing, use this Python snippet: +```bash +python3 -c " +import re +with open('/tmp/mc_items.md', 'r') as f: + content = f.read() +# Remove course_content_reference block (commented multi-line YAML) +content = re.sub(r'# course_content_reference:.*?(?=\n[^#\n]|\n\\\`\\\`\\\`|\Z)', '', content, flags=re.DOTALL) +# Remove other development fields +content = re.sub(r'# course_section:.*\n', '', content) +content = re.sub(r'# teaching_point:.*\n', '', content) +content = re.sub(r'# DEVELOPMENT FIELDS.*\n', '', content) +with open('/tmp/mc_items.md', 'w') as f: + f.write(content) +" +``` + +--- + +## βœ… PRE-GENERATION CHECKLIST + +Before writing each item: + +- [ ] I know the ONE concept being tested +- [ ] I have the course extract that teaches this concept +- [ ] I can write a scenario requiring APPLICATION (not recall) +- [ ] The candidate can REASON to the answer from information I'll provide +- [ ] 
The question type matches what I'll put in options + +--- + +## βœ… POST-GENERATION CHECKLIST + +### Stem Quality +- [ ] Question stands alone without options +- [ ] No comparative language ("best," "most") +- [ ] Does not cue the answer (no opposite mapping, no keyword matching) +- [ ] Provides sufficient context to reason to the answer +- [ ] Tests application, not just classification +- [ ] Decision scenarios have multiple factors (only one decisive) +- [ ] Code differences are explicit (not hidden for candidate to spot) + +### Option Quality +- [ ] All 4 options within Β±8 characters +- [ ] Correct answer NOT longer than distractors +- [ ] All distractors are plausible misconceptions +- [ ] Options are parallel (same type, structure, register) +- [ ] Keywords appear in 3+ options or none +- [ ] Each distractor references something in the stem +- [ ] Classification uses simple labels only + +### Format +- [ ] Correct YAML with all required fields +- [ ] Correct answer in brackets `[...]` +- [ ] Position rotation maintained + +--- + +## ❗ COMMON ERRORS + +### Stem Errors + +| Error | Example | Fix | +|-------|---------|-----| +| Vague stem | "Which is true about X?" | Ask specific question | +| Comparative language | "What is the best approach?" | "What approach addresses this?" | +| Opposite mapping | Lists anti-patterns β†’ answer is opposite | Describe symptoms instead | +| Single-factor decision | Only triggering factor mentioned | Present multiple factors | +| Recall-only question | "Which level is this?" | "What change would increase the level?" | +| Insufficient context | "What does `$@` do?" | Add scenario showing behavior to reason from | +| Hidden code differences | Two commands differ subtly | State the difference explicitly in the stem | + +### Option Errors + +| Error | Example | Fix | +|-------|---------|-----| +| Semantic mismatch | Q: "What problem?" 
/ Options: solutions | Align question to options | +| Implausible distractor | "Delete all code" | Use common misconceptions | +| Unrooted distractor | References info not in stem | Add context OR change distractor | +| Length giveaway | Correct: 90 chars / Distractors: 40 | Balance within ±8 chars | +| Wordspotting | Keyword in only 1-2 options | Use in 3+ or none | +| Register mismatch | Key: technical / Distractor: generic | Match complexity | +| Non-course vocabulary | Unfamiliar jargon | Use course terms | + +--- + +## 🔧 VALIDATION & PREVIEW + +### Validate Structure +```bash +python3 .cursor/validators/mc_validator.py /tmp/mc_items.md +``` + +### Generate Preview +```bash +python3 .cursor/preview/generate_mc_preview.py /tmp/mc_items.md --scripts path/to/video_scripts/ +``` + +**Note:** The `--scripts` argument is required for course reference matching. Point it to the directory containing video script files (e.g., `chapter_1_scripts.txt`). Course content may include code snippets in both `.txt` and `.md` files. + +--- + +## 📚 COURSE ALIGNMENT + +### Required: Extract Course Content Reference + +When creating items, you MUST search both `.txt` (video scripts) and `.md` (chapter files) to find and extract the relevant teaching passages. + +**Step 1: Identify the concept being tested** +- What specific knowledge or skill does this item assess?
+ +**Step 2: Search course materials for teaching content** +- Search `.txt` files (video scripts) for passages that teach this concept +- Search `.md` files (chapter content) for related explanations +- Look for definitions, explanations, examples, and key principles + +**Step 3: Extract verbatim passages** +- Copy the EXACT text (1-2 paragraphs) that teaches the concept +- Include source attribution (e.g., "Video 1.1" or "chapter2.md") +- If both file types contain relevant content, include passages from each + +**Step 4: Add to `course_content_reference` field** +- Place in the YAML block as a commented multi-line field +- Label each passage with its source + +### Alignment Principles + +**Good alignment:** Tests whether learner can APPLY what was taught +**Bad alignment:** Tests whether learner REMEMBERS exact wording + +1. **Create a NEW scenario** that applies (not restates) the concept +2. **Verify the item tests what the passage TEACHES** (not adjacent content) +3. **If the item tests something from a different section:** + - Reassign to the correct subskill, OR + - Revise to test the intended content + +| ❌ Misalignment | βœ… Alignment | +|-----------------|-------------| +| Course section teaches wildcards; item tests loop syntax | Course section teaches loop syntax; item tests loop syntax | +| Course section is Chapter 4; concept is from Chapter 5 | Concept matches the chapter specified in pool.yml | + +--- + +## EXAMPLE: Well-Constructed Item + +~~~markdown +## MLOps Core Purpose + +```yaml +type: MultipleChoiceChallenge +key: +unit: mlops-fundamentals +subskill: chapter1 +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@assignment1` +A data science team has developed a promising customer churn model. However, models often break when moved to production, updates take weeks to deploy, and no one monitors whether predictions remain accurate over time. + +What is the primary purpose of MLOps in addressing these challenges? 
+ +`@options1` +- To optimize training pipelines so that new models are developed more quickly +- [To enable reliable, continuous deployment and monitoring of ML systems] +- To create documentation and checklists that speed up deployment approvals +- To establish accuracy thresholds that models must pass before any release +~~~ + +**Why this works:** +- βœ… Stem describes symptoms (not anti-patterns) +- βœ… Question type matches options (purpose β†’ purposes) +- βœ… All distractors are plausible improvements +- βœ… Balanced lengths and structure +- βœ… Tests application to new scenario diff --git a/.cursor/rules/sql-assessment.md b/.cursor/rules/sql-assessment.md new file mode 100644 index 0000000..821ecde --- /dev/null +++ b/.cursor/rules/sql-assessment.md @@ -0,0 +1,367 @@ +# SQL BlanksChallenge Exercises - Complete Reference + +SQL-specific guidance for generating BlanksChallenge coding items. This supplements the generic `coding-exercise.md` with SQL-focused rules. + +--- + +## Type Identifier + +**Type Name:** `BlanksChallenge` + +**Used For:** +- Fill-in-the-blank SQL query challenges +- Database querying and manipulation +- Data analysis with SQL +- Joins, aggregations, subqueries + +--- + +## SQL SCAFFOLDING RULES + +**Use `{{_exprN}}` placeholders for SQL BlanksChallenge items.** + +### What to Scaffold +- Entire clauses: `SELECT {{_expr1}}` +- Column selections: `SELECT {{_expr1}}, {{_expr2}} FROM table` +- Table names: `FROM {{_expr1}}` +- Conditions: `WHERE {{_expr1}}` +- Aggregations: `{{_expr1}}(column)` +- Join conditions: `ON {{_expr1}}` +- Keywords: `{{_expr1}} JOIN table ON ...` + +### What NOT to Scaffold +- Semicolons +- Comments +- Basic punctuation (commas, parentheses) + +--- + +## SQL CODE STYLE (Holywell + DataCamp) + +### Capitalization (CRITICAL) +- **ALL SQL keywords UPPERCASE:** `SELECT`, `FROM`, `WHERE`, `GROUP BY`, `ORDER BY`, `HAVING`, `JOIN`, `ON`, `AS`, `AND`, `OR`, `NOT`, `IN`, `BETWEEN`, `LIKE`, `IS NULL`, `DISTINCT`, `UNION`, 
`LIMIT` +- **ALL functions UPPERCASE:** `SUM()`, `COUNT()`, `AVG()`, `MAX()`, `MIN()`, `ROUND()`, `COALESCE()`, `CASE` +- **Table and column names lowercase:** `cities`, `country_name`, `population` + +### Commas (DataCamp Standard) +- **Commas at END of columns** (not beginning) +- Include space after comma when on same line + +```sql +-- Good (DataCamp standard) +SELECT + name, + population, + country +FROM cities; +``` + +### Indentation +- **4 spaces** for indentation (not tabs) +- Indent columns under `SELECT` +- Indent conditions under `WHERE` +- Indent subqueries + +### Aliasing +- **Always use `AS` keyword** +- Good: `SELECT name AS city_name` +- Bad: `SELECT name city_name` + +### Comments +- **Use `--` for SQL comments** (not `#`) +- Comments should NOT appear in `@code1` + +--- + +## CODE BLOCK SYNTAX (IMPORTANT) + +SQL exercises use **mixed language blocks**: + +| Section | Language Tag | Purpose | +|---------|--------------|---------| +| `@code1` | `` ```{sql} `` | Student code with blanks | +| `@pre_challenge_code` | `` ```{python} `` | Database setup | + +**Note:** Uses curly braces `{sql}` and `{python}`, not plain `sql` or `python`. + +--- + +## BLANKSCHALLENGE EXPORT FORMAT + +~~~markdown +--- +title: {{pool.title}} +output: html_document +description: {{pool.description}} +--- + +## [Learning Objective Title] + +```yaml +type: BlanksChallenge +key: +unit: <2-3 word kebab-case phrase> +subskill: {{pool.subskill}} +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +{Role-based scenario: "You're a [role] on a team [doing X]. 
[Imperative action]."} + +`@code1` +```{sql} +SELECT + {{_expr1}}, + COUNT(*) AS total +FROM orders +{{_expr2}} category +ORDER BY total DESC; +``` + +`@pre_challenge_code` +```{python} +# Database connection pre-configured +# Tables available: orders (id, category, amount, date) +``` + +`@variables` +```yaml +expr1: + - 'category' +expr2: + - 'GROUP BY' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +## CONTEXT GUIDELINES + +**Length:** 1-4 lines describing a real-world data task + +**Cognitive Level:** Access higher cognitive functions by placing candidates in rich, immersive scenariosβ€”not just describing a task. + +**No Explicit Instructions:** Users always see "fill in the blank" automatically, so: +- ❌ Don't write "you need to find" or "your task is to" +- βœ… Use imperative verbs: "Query", "Join", "Aggregate", "Filter" + +**Role-Based Framing:** Put the user IN the scenario as a team member. + +| ❌ Bad | βœ… Good | +|--------|---------| +| "You're querying a database. You need to join two tables." | "You're a database analyst on a finance team preparing quarterly reports. Join the transactions and accounts tables to calculate total balances." | + +**Pattern:** +``` +❌ "You're querying [thing]. You need to [action]." +βœ… "You're a [role] on a team [doing X]. [Imperative action]." 
+``` + +--- + +## SECTION REQUIREMENTS + +| Section | Required | Description | +|---------|----------|-------------| +| `@context` | Yes | 1-4 lines; role-based scenario with imperative action | +| `@code1` | Yes | SQL query with `{{_exprN}}` placeholders; use `{sql}` tag | +| `@pre_challenge_code` | No | Setup code; use `{python}` tag | +| `@variables` | Yes | Maps each `exprN` to solution (single-element list) | +| `@distractors` | No | Optional incorrect choices | + +--- + +## COMPLETE EXAMPLE + +### Example: Aggregating with GROUP BY + +**Given pool.yml:** +```yaml +title: "SQL Fundamentals" +subskill: sql-aggregations +``` + +**Generated item:** + +~~~markdown +--- +title: SQL Fundamentals +output: html_document +--- + +## Counting Orders by Category + +```yaml +type: BlanksChallenge +key: +unit: sql-aggregations +subskill: sql-aggregations +initial_difficulty: 0 +item_writer_id: '999999999' +``` + +`@context` +You're a data analyst on an e-commerce team reviewing product performance. Count the number of orders in each category and sort by the most popular categories first.
+ +`@code1` +```{sql} +SELECT + category, + {{_expr1}} AS order_count +FROM orders +GROUP BY {{_expr2}} +ORDER BY order_count DESC; +``` + +`@pre_challenge_code` +```{python} +# Database connection pre-configured +# Table: orders (id, category, amount, order_date) +``` + +`@variables` +```yaml +expr1: + - 'COUNT(*)' +expr2: + - 'category' +``` + +`@distractors` +```yaml +``` +~~~ + +--- + +## COMMON SQL PATTERNS + +### Basic SELECT +```sql +SELECT + name, + population, + country +FROM cities; +``` + +### Filtering with WHERE +```sql +SELECT + name, + population +FROM cities +WHERE population > 1000000 + AND country = 'USA'; +``` + +### Aggregations +```sql +SELECT + country, + COUNT(*) AS city_count, + AVG(population) AS avg_population +FROM cities +GROUP BY country +HAVING COUNT(*) > 5 +ORDER BY avg_population DESC; +``` + +### JOINs +```sql +SELECT + c.name AS city_name, + co.name AS country_name +FROM cities AS c +INNER JOIN countries AS co + ON c.country_id = co.id +WHERE co.continent = 'Europe'; +``` + +--- + +## BEST PRACTICES + +### 1. Uppercase Keywords +- Always capitalize SQL keywords and functions +- This is DataCamp standard and improves readability + +### 2. Strategic Scaffolding +- Focus on the learning objective +- Don't scaffold keywords that are obvious from context +- Each `{{_expr}}` should test understanding + +### 3. Readable Formatting +- One column per line for 3+ columns +- Indent subqueries and CASE statements +- Keep lines under 60 characters + +### 4. Always Use AS +- Every alias must include `AS` +- Tables: `FROM cities AS c` +- Columns: `COUNT(*) AS city_count` + +### 5. No Comments in @code1 +- Comments should NOT appear in the `@code1` section +- Keep SQL minimal and focused on the tested concept + +--- + +## COMMON PITFALLS TO AVOID + +1. **Wrong placeholder format**: Using `___` instead of `{{_exprN}}` +2. **Lowercase keywords**: `select` instead of `SELECT` +3. 
**Missing AS**: `SELECT name city_name` instead of `SELECT name AS city_name` +4. **Wrong code block tags**: `sql` instead of `{sql}` +5. **Wrong comment style**: `#` instead of `--` +6. **GROUP BY numbers**: `GROUP BY 1` instead of `GROUP BY country` +7. **Comments in @code1**: Code section should have no comments +8. **String blanks**: Making the answer a text string value +9. **Missing semicolon**: SQL queries should end with `;` + +--- + +## QUALITY CHECKLIST + +Before finalizing a SQL BlanksChallenge item, verify: + +- ✅ Placeholders use `{{_exprN}}` format +- ✅ All SQL keywords are UPPERCASE +- ✅ All functions are UPPERCASE +- ✅ Table/column names are lowercase +- ✅ Aliases always use `AS` +- ✅ Each `{{_expr}}` has matching entry in `@variables` +- ✅ No comments in `@code1` +- ✅ Context uses role-based framing with imperative action +- ✅ `@code1` uses `` ```{sql} `` tag +- ✅ `@pre_challenge_code` uses `` ```{python} `` tag +- ✅ `type: BlanksChallenge` in YAML +- ✅ `item_writer_id: '999999999'` +- ✅ Query ends with semicolon + +--- + +## AUTOMATIC VALIDATION + +After generating the exercise, validate it: + +```bash +python .cursor/validators/sql_coding_validator.py /tmp/exercise_to_validate.md +``` + +--- + +## PREVIEW + +Generate a visual preview with course content matching: + +```bash +python .cursor/preview/generate_blanks_preview.py /tmp/items.md \ + --scripts path/to/video_scripts/ \ + --exercises path/to/chapter_exercises/ +``` diff --git a/.cursor/templates/shiny_app_template.R b/.cursor/templates/shiny_app_template.R new file mode 100644 index 0000000..fd8174e --- /dev/null +++ b/.cursor/templates/shiny_app_template.R @@ -0,0 +1,593 @@ +# ============================================================================== +# DataCamp Shiny App Template +# ============================================================================== +# +# This template provides the foundation for building explorable exercise Shiny apps +# with DataCamp theming and required localization.
+# +# REQUIREMENTS: +# - Poppins font (Google Fonts) +# - DataCamp brand colors +# - Localization with at least 2 languages +# - Language selector dropdown +# +# USAGE: +# 1. Copy this template to your app folder (e.g., apps/ch1_ex9/app.R) +# 2. Customize TRANSLATIONS for your content +# 3. Build your UI and server logic +# 4. Run with: shiny::runApp('apps/ch1_ex9') +# +# ============================================================================== + +library(shiny) +library(shinyjs) +library(markdown) + +# ============================================================================== +# TRANSLATIONS (REQUIRED) +# ============================================================================== +# +# All UI text MUST be in the TRANSLATIONS list. +# Minimum: 2 languages (English + 1 other) +# Recommended: EN, ES, DE, FR +# + +TRANSLATIONS <- list( + en = list( + flag = "πŸ‡ΊπŸ‡Έ", + ui = list( + title = "Application Title", + subtitle = "Application description goes here", + # Header elements + header_welcome = "Welcome", + header_status = "Status:", + status_ready = "Ready", + status_loading = "Loading...", + # Buttons + btn_submit = "Submit", + btn_reset = "Reset", + btn_next = "Next", + # Common labels + label_select = "Select an option", + label_result = "Result", + # Messages + msg_success = "Great job!", + msg_error = "Something went wrong", + msg_loading = "Please wait..." 
+ ), + content = list( + # Add your content-specific translations here + # e.g., questions, options, feedback text + item_1 = "First item", + item_2 = "Second item", + item_3 = "Third item" + ) + ), + es = list( + flag = "πŸ‡ͺπŸ‡Έ", + ui = list( + title = "TΓ­tulo de la AplicaciΓ³n", + subtitle = "La descripciΓ³n de la aplicaciΓ³n va aquΓ­", + header_welcome = "Bienvenido", + header_status = "Estado:", + status_ready = "Listo", + status_loading = "Cargando...", + btn_submit = "Enviar", + btn_reset = "Reiniciar", + btn_next = "Siguiente", + label_select = "Seleccione una opciΓ³n", + label_result = "Resultado", + msg_success = "Β‘Muy bien!", + msg_error = "Algo saliΓ³ mal", + msg_loading = "Por favor espere..." + ), + content = list( + item_1 = "Primer elemento", + item_2 = "Segundo elemento", + item_3 = "Tercer elemento" + ) + ), + de = list( + flag = "πŸ‡©πŸ‡ͺ", + ui = list( + title = "Anwendungstitel", + subtitle = "Anwendungsbeschreibung hier", + header_welcome = "Willkommen", + header_status = "Status:", + status_ready = "Bereit", + status_loading = "Laden...", + btn_submit = "Absenden", + btn_reset = "ZurΓΌcksetzen", + btn_next = "Weiter", + label_select = "Option auswΓ€hlen", + label_result = "Ergebnis", + msg_success = "Gut gemacht!", + msg_error = "Etwas ist schief gelaufen", + msg_loading = "Bitte warten..." + ), + content = list( + item_1 = "Erstes Element", + item_2 = "Zweites Element", + item_3 = "Drittes Element" + ) + ), + fr = list( + flag = "πŸ‡«πŸ‡·", + ui = list( + title = "Titre de l'Application", + subtitle = "Description de l'application ici", + header_welcome = "Bienvenue", + header_status = "Statut:", + status_ready = "PrΓͺt", + status_loading = "Chargement...", + btn_submit = "Soumettre", + btn_reset = "RΓ©initialiser", + btn_next = "Suivant", + label_select = "SΓ©lectionnez une option", + label_result = "RΓ©sultat", + msg_success = "Bien jouΓ©!", + msg_error = "Quelque chose s'est mal passΓ©", + msg_loading = "Veuillez patienter..." 
+ ), + content = list( + item_1 = "Premier Γ©lΓ©ment", + item_2 = "DeuxiΓ¨me Γ©lΓ©ment", + item_3 = "TroisiΓ¨me Γ©lΓ©ment" + ) + ) +) + +# ============================================================================== +# HELPER FUNCTIONS (REQUIRED) +# ============================================================================== + +#' Get available language choices for dropdown +#' @return Named vector of language codes with flag emojis +get_language_choices <- function() { + c("πŸ‡ΊπŸ‡Έ" = "en", "πŸ‡ͺπŸ‡Έ" = "es", "πŸ‡©πŸ‡ͺ" = "de", "πŸ‡«πŸ‡·" = "fr") +} + +#' Retrieve translation by language and path +#' @param lang Language code (e.g., "en", "es") +#' @param ... Path to translation (e.g., "ui", "title") +#' @return Translated string or NULL if not found +t <- function(lang, ...) { + keys <- list(...) + result <- TRANSLATIONS[[lang]] + for (key in keys) { + result <- result[[key]] + if (is.null(result)) return(NULL) + } + result +} + +# ============================================================================== +# DATACAMP THEME CSS +# ============================================================================== + +DATACAMP_CSS <- " +@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;500;600;700&display=swap'); + +/* CSS Variables - DataCamp Brand Colors */ +:root { + --dc-navy: #05192d; + --dc-navy-light: #0a2240; + --dc-green: #03ef62; + --dc-green-light: #65ff8f; + --dc-green-dark: #00c74e; + --dc-orange: #ff931e; + --dc-red: #ff5400; + --dc-gray-light: #f7f7fc; + --dc-gray-border: #e8e8ea; + --dc-text: #05192d; + --dc-text-light: #ffffff; + --dc-text-subtle: rgba(48, 57, 105, 0.6); +} + +/* Base Styles */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +body { + font-family: 'Poppins', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; + background: var(--dc-gray-light); + color: var(--dc-text); + line-height: 1.6; + min-height: 100vh; +} + +.container-fluid { + max-width: 1200px; + margin: 0 auto; + 
padding: 20px; +} + +/* Header */ +.app-header { + background: var(--dc-navy); + color: var(--dc-text-light); + padding: 24px 30px; + border-radius: 12px 12px 0 0; + display: flex; + justify-content: space-between; + align-items: center; +} + +.header-content { + flex: 1; +} + +.app-title { + font-size: 1.75rem; + font-weight: 700; + margin: 0 0 4px 0; +} + +.app-subtitle { + font-size: 1rem; + opacity: 0.85; + margin: 0; + font-weight: 400; +} + +/* Language Selector */ +.language-selector { + margin-left: 20px; +} + +.language-selector select, +.language-selector .selectize-input { + background: rgba(255, 255, 255, 0.1); + border: 1px solid rgba(255, 255, 255, 0.2); + color: white; + padding: 8px 16px; + border-radius: 8px; + cursor: pointer; + font-size: 0.95rem; + font-family: 'Poppins', sans-serif; + transition: all 0.2s ease; +} + +.language-selector select:hover, +.language-selector .selectize-input:hover { + background: rgba(255, 255, 255, 0.15); + border-color: var(--dc-green); +} + +.language-selector select:focus { + outline: none; + border-color: var(--dc-green); +} + +.language-selector select option { + background: var(--dc-navy); + color: white; +} + +/* Main Content Area */ +.main-content { + background: white; + border-radius: 0 0 12px 12px; + padding: 30px; + box-shadow: 0 4px 20px rgba(5, 25, 45, 0.08); +} + +/* Cards */ +.card { + background: white; + border: 1px solid var(--dc-gray-border); + border-radius: 12px; + padding: 24px; + margin-bottom: 20px; + transition: box-shadow 0.2s ease; +} + +.card:hover { + box-shadow: 0 4px 12px rgba(5, 25, 45, 0.1); +} + +.card-title { + font-size: 1.125rem; + font-weight: 600; + color: var(--dc-navy); + margin-bottom: 16px; +} + +/* Buttons */ +.btn-dc-primary { + background: var(--dc-green); + color: var(--dc-navy); + border: none; + padding: 12px 24px; + border-radius: 8px; + font-weight: 600; + font-size: 0.95rem; + cursor: pointer; + transition: all 0.2s ease; + font-family: 'Poppins', sans-serif; +} + 
+.btn-dc-primary:hover { + background: var(--dc-green-dark); + transform: translateY(-1px); +} + +.btn-dc-secondary { + background: white; + color: var(--dc-red); + border: 2px solid var(--dc-red); + padding: 10px 22px; + border-radius: 8px; + font-weight: 600; + font-size: 0.95rem; + cursor: pointer; + transition: all 0.2s ease; + font-family: 'Poppins', sans-serif; +} + +.btn-dc-secondary:hover { + background: var(--dc-red); + color: white; +} + +/* Status Indicator */ +.status-indicator { + display: flex; + align-items: center; + gap: 8px; + font-size: 0.9rem; +} + +.status-dot { + width: 10px; + height: 10px; + border-radius: 50%; +} + +.status-dot.ready { + background: var(--dc-green); +} + +.status-dot.loading { + background: var(--dc-orange); + animation: pulse 1.5s infinite; +} + +.status-dot.error { + background: var(--dc-red); +} + +@keyframes pulse { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.5; } +} + +/* Form Elements */ +.form-group { + margin-bottom: 20px; +} + +.form-label { + display: block; + font-weight: 500; + margin-bottom: 8px; + color: var(--dc-navy); +} + +.form-control { + width: 100%; + padding: 12px 16px; + border: 1px solid var(--dc-gray-border); + border-radius: 8px; + font-size: 1rem; + font-family: 'Poppins', sans-serif; + transition: border-color 0.2s ease; +} + +.form-control:focus { + outline: none; + border-color: var(--dc-green); +} + +/* Warning Banner */ +.warning-banner { + background: #fffbf3; + border-left: 4px solid var(--dc-orange); + padding: 16px 20px; + margin-bottom: 20px; + border-radius: 0 8px 8px 0; +} + +.warning-banner strong { + color: var(--dc-navy); + font-weight: 600; +} + +/* Success Message */ +.success-message { + background: rgba(3, 239, 98, 0.1); + border: 1px solid rgba(3, 239, 98, 0.3); + border-radius: 8px; + padding: 16px 20px; + color: var(--dc-navy); +} + +.success-message strong { + color: var(--dc-green-dark); +} + +/* Animations */ +@keyframes fadeIn { + from { opacity: 0; transform: 
# Page layout for the template app.
#
# Static structure only: every localized string is injected through a
# uiOutput() placeholder ("title_ui", "subtitle_ui", "content_ui", "btn_text",
# "btn_reset_text") that the server function fills in from input$language.
# DATACAMP_CSS and get_language_choices() are defined elsewhere in this file /
# project.
ui <- fluidPage(
  # Enables shinyjs helpers (the server uses shinyjs::delay)
  useShinyjs(),

  # Head - Styles and Meta
  tags$head(
    tags$meta(charset = "UTF-8"),
    tags$meta(name = "viewport", content = "width=device-width, initial-scale=1.0"),
    # Inline the DataCamp stylesheet held in the DATACAMP_CSS string
    tags$style(HTML(DATACAMP_CSS))
  ),

  # Main Container
  div(class = "container-fluid",

    # Header with Language Selector
    div(class = "app-header",
      div(class = "header-content",
        # Title and subtitle are rendered server-side so they follow the language
        uiOutput("title_ui"),
        uiOutput("subtitle_ui")
      ),
      div(class = "language-selector",
        # input$language drives every localized output; starts on English
        selectInput(
          "language",
          label = NULL,
          choices = get_language_choices(),
          selected = "en"
        )
      )
    ),

    # Main Content
    div(class = "main-content",

      # Warning Banner (optional - remove if not needed)
      div(class = "warning-banner",
        tags$strong("Training Mode: "),
        "This is a demonstration application for educational purposes."
      ),

      # Your content goes here
      div(class = "card",
        div(class = "card-title", "Getting Started"),
        p("Replace this content with your application logic."),
        p("Use the language selector in the header to test translations.")
      ),

      # Example: Dynamic content based on language
      uiOutput("content_ui"),

      # Example: Action buttons
      # Button labels are inline uiOutput()s so they can be re-rendered per language
      div(style = "margin-top: 20px;",
        actionButton("btn_action",
                     uiOutput("btn_text", inline = TRUE),
                     class = "btn-dc-primary"),
        actionButton("btn_reset",
                     uiOutput("btn_reset_text", inline = TRUE),
                     class = "btn-dc-secondary",
                     style = "margin-left: 10px;")
      )
    )
  )
)
def convert_html_to_markdown(input_path: str, output_path: str = None) -> str:
    """Convert an HTML file to Markdown using Docling and write it to disk.

    Args:
        input_path: Path to the HTML file to convert.
        output_path: Destination Markdown path. When omitted, the input path
            with its suffix replaced by ``.md`` is used.

    Returns:
        The generated Markdown text (also written to ``output_path``).

    Raises:
        SystemExit: With exit code 1 when Docling is not installed, the input
            is not an existing regular file, or the conversion fails.
    """
    # Import here to allow script to show help even if not installed
    try:
        from docling.document_converter import DocumentConverter
    except ImportError:
        print("Error: docling not installed.")
        print("Run: pip install docling")
        sys.exit(1)

    # is_file() rather than exists(): a directory such as "." would pass an
    # exists() check and then crash in with_suffix() with an uncaught
    # ValueError instead of producing this error message.
    input_file = Path(input_path)
    if not input_file.is_file():
        print(f"Error: File not found: {input_path}")
        sys.exit(1)

    # Default output sits next to the input, with a .md extension
    if output_path is None:
        output_path = input_file.with_suffix(".md")

    output_file = Path(output_path)

    # Create output directory if needed
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Converting: {input_path}")
    print(f"Output: {output_path}")

    try:
        converter = DocumentConverter()
        result = converter.convert(str(input_file))
        markdown_content = result.document.export_to_markdown()
        output_file.write_text(markdown_content, encoding="utf-8")
    except Exception as e:
        # CLI boundary: report any converter/IO failure and exit non-zero
        print(f"Error during conversion: {e}")
        sys.exit(1)

    print(f"✓ Conversion complete: {output_path}")
    return markdown_content
+ + # Check for API key + api_key = os.getenv("DATALAB_API_KEY") + if not api_key: + print("Error: DATALAB_API_KEY not found.") + print(f"Please add your API key to {env_path}") + sys.exit(1) + + # Set the environment variable for the SDK + os.environ["DATALAB_API_KEY"] = api_key + + # Verify input file exists + input_file = Path(input_path).resolve() # Use absolute path + if not input_file.exists(): + print(f"Error: File not found: {input_path}") + sys.exit(1) + + if not input_file.suffix.lower() == ".pdf": + print(f"Warning: File may not be a PDF: {input_path}") + + # Set default output path + if output_path is None: + output_path = input_file.with_suffix(".md") + + output_file = Path(output_path) + + # Create output directory if needed + output_file.parent.mkdir(parents=True, exist_ok=True) + + print(f"Converting: {input_file}") + print(f"Output: {output_path}") + + try: + # Initialize client (uses DATALAB_API_KEY env var) + client = DatalabClient() + + # Set conversion options + options = ConvertOptions( + output_format="markdown", + mode="balanced", + ) + + # Convert the PDF + result = client.convert(str(input_file), options=options) + + if not result.success: + print(f"Error: Conversion failed - {result.error}") + sys.exit(1) + + # Write markdown output + with open(output_file, "w", encoding="utf-8") as f: + f.write(result.markdown) + + print(f"βœ“ Conversion complete: {output_path}") + print(f" Pages processed: {result.page_count}") + print(f" Quality score: {result.parse_quality_score}") + return result.markdown + + except Exception as e: + print(f"Error during conversion: {e}") + sys.exit(1) + + +def main(): + parser = argparse.ArgumentParser( + description="Convert PDF to Markdown using Datalab API" + ) + parser.add_argument( + "input", + help="Path to input PDF file" + ) + parser.add_argument( + "-o", "--output", + help="Path to output Markdown file (default: same name with .md extension)" + ) + + args = parser.parse_args() + 
def url_to_filename(url: str) -> str:
    """Generate a Markdown filename from a URL.

    The last path segment (without its extension) is used when it contains at
    least one letter or digit; otherwise the host name is used. Characters
    other than word characters and hyphens become underscores.

    Args:
        url: The page URL.

    Returns:
        A filename ending in ``.md``; ``page.md`` when the URL yields nothing
        usable at all.
    """
    parsed = urlparse(url)
    path = parsed.path.strip('/')

    # Last path segment with any trailing ".ext" removed ('' when no path)
    name = re.sub(r'\.[^.]+$', '', path.split('/')[-1]) if path else ''

    # Segments such as "/.html" or "/.." leave no letters or digits behind;
    # fall back to the domain instead of emitting ".md" or "_.md".
    if not re.search(r'[^\W_]', name):
        name = parsed.netloc.replace('.', '_')

    # Clean up the name
    name = re.sub(r'[^\w\-]', '_', name)
    name = re.sub(r'_+', '_', name)

    return f"{name or 'page'}.md"


def convert_webpage_to_markdown(url: str, output_path: str = None) -> str:
    """Fetch a web page, extract its main content and save it as Markdown.

    Args:
        url: Page URL; ``https://`` is prepended when no scheme is present.
        output_path: Destination Markdown path. When omitted, a name derived
            from the URL by ``url_to_filename`` is used (relative to the
            current working directory).

    Returns:
        The Markdown written to disk: a source header followed by the
        extracted content.

    Raises:
        SystemExit: With exit code 1 when Trafilatura is not installed, the
            page cannot be fetched, no content can be extracted, or any other
            error occurs during extraction.
    """
    # Import here to allow script to show help even if not installed
    try:
        import trafilatura
    except ImportError:
        print("Error: trafilatura not installed.")
        print("Run: pip install trafilatura")
        sys.exit(1)

    # Treat scheme-less input as an HTTPS URL
    parsed = urlparse(url)
    if not parsed.scheme:
        url = "https://" + url

    # Set default output path
    if output_path is None:
        output_path = url_to_filename(url)

    output_file = Path(output_path)

    # Create output directory if needed
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print(f"Fetching: {url}")

    try:
        # Download the page
        downloaded = trafilatura.fetch_url(url)

        if downloaded is None:
            print(f"Error: Could not fetch URL: {url}")
            sys.exit(1)

        # Extract main content as markdown
        markdown_content = trafilatura.extract(
            downloaded,
            output_format='markdown',
            include_links=True,
            include_images=False,  # Skip images for cleaner output
            include_tables=True,
            include_comments=False,
            favor_precision=True,  # Prefer precision over recall
        )

        if markdown_content is None:
            print("Error: Could not extract content from page")
            sys.exit(1)

        # Add source URL header
        header = f"# Web Page Content\n\n**Source:** {url}\n\n---\n\n"
        full_content = header + markdown_content

        # Write output
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(full_content)

        print(f"✓ Content saved: {output_path}")
        return full_content

    except Exception as e:
        # sys.exit() above raises SystemExit, which is not an Exception
        # subclass, so the explicit exits are not swallowed here.
        print(f"Error during extraction: {e}")
        sys.exit(1)
def extract_video_id(url: str) -> "str | None":
    """Extract the 11-character video ID from a YouTube URL or bare ID.

    Recognizes ``watch?v=`` (with ``v`` anywhere in the query string),
    ``youtu.be/``, ``/embed/``, ``/shorts/``, ``/live/`` and ``/v/`` URLs, as
    well as a bare video ID. Surrounding whitespace is ignored.

    Args:
        url: A YouTube URL or an 11-character video ID.

    Returns:
        The video ID, or ``None`` when none can be found.
    """
    if not url:
        return None
    url = url.strip()

    patterns = [
        r'(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})',
        r'(?:youtube\.com/watch\?.*v=)([a-zA-Z0-9_-]{11})',
        r'youtube\.com/(?:shorts|live|v)/([a-zA-Z0-9_-]{11})',
    ]

    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)

    # Maybe it's already just the video ID. fullmatch (not match + "$") so a
    # trailing newline cannot slip into the returned ID.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', url):
        return url

    return None


def convert_youtube_to_markdown(url: str, output_path: str = None) -> str:
    """Fetch a YouTube transcript and save it as plain-text Markdown.

    Transcript segments are joined into paragraphs; a paragraph ends after a
    segment that finishes with ``.``, ``?`` or ``!``. Timestamps are dropped.

    Args:
        url: YouTube video URL or bare video ID.
        output_path: Destination Markdown path. Defaults to
            ``youtube_<video_id>.md`` in the current working directory.

    Returns:
        The Markdown written to disk.

    Raises:
        SystemExit: With exit code 1 when youtube-transcript-api is not
            installed, no video ID can be extracted, or the transcript cannot
            be fetched.
    """
    # Import here to allow script to show help even if not installed
    try:
        from youtube_transcript_api import YouTubeTranscriptApi
    except ImportError:
        print("Error: youtube-transcript-api not installed.")
        print("Run: pip install youtube-transcript-api")
        sys.exit(1)

    # Extract video ID
    video_id = extract_video_id(url)
    if not video_id:
        print(f"Error: Could not extract video ID from URL: {url}")
        sys.exit(1)

    print(f"Video ID: {video_id}")

    # Set default output path
    if output_path is None:
        output_path = f"youtube_{video_id}.md"

    output_file = Path(output_path)

    # Create output directory if needed
    output_file.parent.mkdir(parents=True, exist_ok=True)

    print("Fetching transcript...")

    try:
        # New API (v1.0+): use fetch() method
        ytt_api = YouTubeTranscriptApi()
        transcript_data = ytt_api.fetch(video_id)
    except Exception as e:
        print(f"Error fetching transcript: {e}")
        sys.exit(1)

    # Report the language of the transcript actually returned instead of
    # always claiming English. NOTE(review): relies on the fetched transcript
    # exposing `language_code` (v1.x FetchedTranscript) - falls back to 'en'
    # when the attribute is absent.
    language = getattr(transcript_data, "language_code", None) or 'en'

    # Build markdown content (plain text, no timestamps)
    lines = [
        "# YouTube Video Transcript",
        "",
        f"**Video URL:** https://www.youtube.com/watch?v={video_id}",
        f"**Language:** {language}",
        "",
        "---",
        "",
    ]

    # Combine transcript segments into paragraphs
    current_paragraph = []
    for entry in transcript_data:
        # Entries are objects with .text in the new API, dicts in older ones
        text = entry.text.strip() if hasattr(entry, 'text') else entry.get('text', '').strip()
        if not text:
            continue
        current_paragraph.append(text)
        # Start new paragraph after a segment that ends a sentence
        if text.endswith(('.', '?', '!')):
            lines.append(' '.join(current_paragraph))
            lines.append("")
            current_paragraph = []

    # Add any remaining text
    if current_paragraph:
        lines.append(' '.join(current_paragraph))

    markdown_content = '\n'.join(lines)

    # Write output
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(markdown_content)

    print(f"✓ Transcript saved: {output_path}")
    return markdown_content
/**
 * Parse the command-line arguments of from_script.mjs.
 *
 * The script path is the first argument that is neither a flag nor the value
 * of a value-taking flag, so `--chapter 2 script.md` and `-u script.md`
 * resolve to `script.md` rather than to `2` or `-u`.
 *
 * @param {string[]} argv - Arguments after the node binary and script name
 * @returns {{scriptPath: (string|undefined), chapter: number, lesson: number,
 *            outputDir: (string|null), dryRun: boolean, updateFile: boolean}}
 */
function parseCliArgs(argv) {
  // Flags whose following argument is a value, not a positional
  const valueFlags = new Set(['--chapter', '--lesson', '--output']);

  const valueOf = (flag) => {
    const idx = argv.indexOf(flag);
    return idx !== -1 ? argv[idx + 1] : undefined;
  };

  // Missing or non-numeric values fall back to the default instead of NaN
  const intOr = (raw, fallback) => {
    const parsed = Number.parseInt(raw, 10);
    return Number.isNaN(parsed) ? fallback : parsed;
  };

  const scriptPath = argv.find(
    (arg, i) => !arg.startsWith('-') && !valueFlags.has(argv[i - 1])
  );

  return {
    scriptPath,
    chapter: intOr(valueOf('--chapter'), 1),
    lesson: intOr(valueOf('--lesson'), 1),
    outputDir: valueOf('--output') ?? null,
    dryRun: argv.includes('--dry-run'),
    updateFile: argv.includes('--update') || argv.includes('-u'),
  };
}

const args = process.argv.slice(2);
// Flag positions kept as top-level names in case later code inspects them
const chapterIdx = args.indexOf('--chapter');
const lessonIdx = args.indexOf('--lesson');
const outputIdx = args.indexOf('--output');
const { scriptPath, chapter, lesson, outputDir, dryRun, updateFile } = parseCliArgs(args);
/**
 * Collect every `![excalidraw: TEMPLATE: labels]()` placeholder in a
 * markdown document, in order of appearance.
 *
 * @param {string} content - Markdown content
 * @returns {Array} One `{match, template, labels, raw}` entry per placeholder:
 *   the full matched text, the lower-cased template name, the comma-separated
 *   labels (trimmed, empties dropped) and the untrimmed-per-label label text.
 */
function findPlaceholders(content) {
  // Match: ![excalidraw: TEMPLATE: labels]()
  const pattern = /!\[excalidraw:\s*(\w+):\s*([^\]]+)\]\(\)/g;

  return Array.from(content.matchAll(pattern), ([match, templateName, labelText]) => {
    const raw = labelText.trim();
    const labels = raw
      .split(',')
      .map(part => part.trim())
      .filter(part => part.length > 0);

    return {
      match,
      template: templateName.toLowerCase().trim(),
      labels,
      raw
    };
  });
}
/**
 * Sanitize text for use in filenames.
 *
 * Path separators and other characters that are unsafe in filenames
 * (`/ \ | < > * ?`) are treated as word breaks, so timeline labels such as
 * `1950|Alan Turing // Proposes Test` cannot inject directories or produce
 * names that are invalid on Windows.
 *
 * @param {string} text - Raw label text
 * @param {number} maxLength - Maximum length of output
 * @returns {string} Sanitized filename-safe string
 */
function sanitizeForFilename(text, maxLength = 50) {
  return text
    .toLowerCase()
    .replace(/[\/\\|<>*?]/g, ' ')    // Path-unsafe chars become word breaks
    .replace(/[():,\[\]{}'"]/g, '')  // Remove special chars
    .replace(/\s+/g, '_')            // Spaces to underscores
    .replace(/_+/g, '_')             // Collapse multiple underscores
    .replace(/^_|_$/g, '')           // Trim leading/trailing underscores
    .substring(0, maxLength)         // Truncate if too long
    .replace(/_+$/, '');             // Truncation may expose a trailing underscore
}

// =============================================================================
// DIAGRAM GENERATOR
// =============================================================================

/**
 * Generate diagram from template name and labels.
 *
 * Falls back to a left-to-right flowchart for unknown template names and to a
 * single "Placeholder" box when no labels are given.
 *
 * @param {string} template - Template name
 * @param {string[]} labels - Array of label strings
 * @returns {Object} Excalidraw JSON
 */
function generateDiagram(template, labels) {
  if (labels.length === 0) {
    console.warn(` ⚠️ No labels provided`);
    return templates.flowchartLR(['Placeholder']);
  }

  switch (template) {
    case 'flowchart':
    case 'process':
      return templates.flowchartLR(labels);

    case 'cycle':
      return templates.cycle(labels);

    case 'hierarchy':
      // First label = root, rest = children
      return templates.hierarchy(labels[0], labels.slice(1));

    case 'radial':
      // First label = center, rest = satellites
      return templates.radial(labels[0], labels.slice(1));

    case 'layers':
      return templates.layers(labels);

    case 'timeline':
      return templates.timeline(labels);

    case 'matrix':
      // Need exactly 4 labels for 2x2; warn but still render what we have
      if (labels.length !== 4) {
        console.warn(` ⚠️ Matrix needs exactly 4 labels, got ${labels.length}`);
      }
      return templates.matrix(labels.slice(0, 4));

    case 'comparison': {
      // Braces keep the const declarations scoped to this case.
      // Labels were split on commas upstream; re-join to find the | separator.
      const raw = labels.join(', ');
      if (raw.includes('|')) {
        // Only the first two |-separated groups are used
        const [left, right] = raw.split('|').map(s => s.trim());
        const leftLabels = left.split(',').map(l => l.trim()).filter(l => l);
        const rightLabels = right.split(',').map(l => l.trim()).filter(l => l);
        return templates.comparison(
          { title: leftLabels[0], items: leftLabels.slice(1) },
          { title: rightLabels[0], items: rightLabels.slice(1) }
        );
      }
      // No separator - split in half
      const mid = Math.ceil(labels.length / 2);
      return templates.comparison(
        { title: labels[0], items: labels.slice(1, mid) },
        { title: labels[mid], items: labels.slice(mid + 1) }
      );
    }

    case 'architecture':
      // First = center, rest = surrounding services
      return templates.architecture({
        center: { label: labels[0] },
        services: labels.slice(1).map(l => ({ label: l }))
      });

    case 'funnel':
      return templates.funnel(labels);

    case 'mindmap':
      return templates.mindmap(labels);

    default:
      console.warn(` ⚠️ Unknown template "${template}", using flowchart`);
      return templates.flowchartLR(labels);
  }
}
} + + console.log(` Found ${placeholders.length} placeholder(s):\n`); + + let updatedContent = content; + + // Create lesson-specific folder: images/lesson_N_M/ + const lessonDir = `lesson_${chapter}_${lesson}`; + const imagesDir = outputDir || path.join(process.cwd(), 'images', lessonDir); + + for (let i = 0; i < placeholders.length; i++) { + const { match, template, labels, raw } = placeholders[i]; + const imageNum = i + 1; + const sanitizedLabels = sanitizeForFilename(raw); + const baseName = `lesson_${chapter}_${lesson}_image_${imageNum}_${sanitizedLabels}`; + const imageName = `${baseName}.png`; + const excalidrawName = `${baseName}.excalidraw`; + const imagePath = path.join(imagesDir, imageName); + const excalidrawPath = path.join(imagesDir, excalidrawName); + + console.log(` ${imageNum}. ${template}: [${labels.join(', ')}]`); + + if (!VALID_TEMPLATES.has(template)) { + console.log(` ⚠️ Unknown template "${template}", will use flowchart`); + } + + if (dryRun) { + console.log(` Would create: ${imagePath}`); + console.log(` Would create: ${excalidrawPath}`); + continue; + } + + try { + const diagram = generateDiagram(template, labels); + + // Ensure output directory exists + await fs.mkdir(imagesDir, { recursive: true }); + + // Save editable .excalidraw file for manual editing at excalidraw.com + await fs.writeFile(excalidrawPath, JSON.stringify(diagram, null, 2)); + console.log(` πŸ“ Saved: ${excalidrawPath}`); + + // Convert to PNG + await convertJsonToPng(diagram, imagePath, { silent: true }); + console.log(` βœ… Created: ${imagePath}`); + + // Update markdown with relative image path + const altText = `${template}: ${raw}`; + const relativePath = `images/${lessonDir}/${imageName}`; + updatedContent = updatedContent.replace(match, `![${altText}](${relativePath})`); + } catch (err) { + console.error(` ❌ Error: ${err.message}`); + } + } + + console.log(''); + + if (!dryRun && placeholders.length > 0) { + if (updateFile) { + await fs.writeFile(scriptPath, 
updatedContent, 'utf-8'); + console.log(` πŸ“ Updated: ${scriptPath}`); + } else { + console.log(' πŸ’‘ Use --update flag to automatically update the markdown file'); + } + console.log(`\nβœ… Done! Generated ${placeholders.length} image(s)\n`); + } + + await closeBrowser(); +} + +main().catch(async err => { + console.error('Error:', err.message); + await closeBrowser(); + process.exit(1); +}); diff --git a/.cursor/utilities/excalidraw/templates.mjs b/.cursor/utilities/excalidraw/templates.mjs new file mode 100644 index 0000000..863168b --- /dev/null +++ b/.cursor/utilities/excalidraw/templates.mjs @@ -0,0 +1,1737 @@ +/** + * Excalidraw Diagram Templates + * + * Reusable templates for generating common diagram patterns. + * All diagrams use transparent background and Poppins-compatible font. + */ + +// ============================================================================= +// GLOBAL DEFAULTS +// ============================================================================= + +export const DEFAULTS = { + appState: { + viewBackgroundColor: "transparent", + gridSize: null + }, + // fontFamily: 1 = Virgil (hand-drawn), 2 = Helvetica, 3 = Cascadia (monospace) + fontFamily: 1, + fontSize: 16, + smallFontSize: 12, + strokeWidth: 2, + roughness: 1, + fillStyle: "hachure", + // Text/sizing constraints + maxLabelChars: 30, // Max characters before truncation + charWidth: 9, // Approximate width per character (for Poppins 16px) + minBoxWidth: 80, // Minimum box width + maxBoxWidth: 200, // Maximum box width + boxPadding: 24 // Horizontal padding inside boxes +}; + +/** + * Calculate appropriate box width based on label text + * @param {string} label - The text label + * @returns {number} - Calculated width + */ +export function calcWidth(label) { + if (!label) return DEFAULTS.minBoxWidth; + const textWidth = label.length * DEFAULTS.charWidth + DEFAULTS.boxPadding; + return Math.max(DEFAULTS.minBoxWidth, Math.min(DEFAULTS.maxBoxWidth, textWidth)); +} + +/** + * Truncate 
label if it exceeds max characters (legacy - prefer wrapText) + * @param {string} label - The text label + * @param {number} maxChars - Maximum characters (default from DEFAULTS) + * @returns {string} - Truncated label with ellipsis if needed + */ +export function truncateLabel(label, maxChars = DEFAULTS.maxLabelChars) { + if (!label || label.length <= maxChars) return label; + return label.substring(0, maxChars - 1) + '…'; +} + +/** + * Wrap text into multiple lines instead of truncating + * Splits on word boundaries (spaces, hyphens) when possible + * @param {string} label - The text label + * @param {number} maxCharsPerLine - Max characters per line before wrapping + * @returns {Object} - { text: string with \n, lines: number of lines, maxLineLength: longest line } + */ +export function wrapText(label, maxCharsPerLine = 22) { + if (!label) return { text: '', lines: 1, maxLineLength: 0 }; + if (label.length <= maxCharsPerLine) return { text: label, lines: 1, maxLineLength: label.length }; + + // Split on spaces and hyphens, keeping the delimiters + const words = label.split(/(\s+|-)/); + const lines = []; + let currentLine = ''; + + words.forEach(word => { + const testLine = currentLine + word; + if (testLine.length <= maxCharsPerLine) { + currentLine = testLine; + } else { + // Current line is full, start new line + if (currentLine.trim()) { + lines.push(currentLine.trim()); + } + currentLine = word.trimStart(); + } + }); + + // Don't forget the last line + if (currentLine.trim()) { + lines.push(currentLine.trim()); + } + + // Handle edge case where single word is longer than max + if (lines.length === 0) { + lines.push(label); + } + + const maxLineLength = Math.max(...lines.map(l => l.length)); + + return { + text: lines.join('\n'), + lines: lines.length, + maxLineLength + }; +} + +// Color palette +export const COLORS = { + blue: { stroke: "#1971c2", fill: "#a5d8ff" }, + green: { stroke: "#2f9e44", fill: "#b2f2bb" }, + orange: { stroke: "#e8590c", fill: 
"#ffc078" }, + pink: { stroke: "#9c36b5", fill: "#eebefa" }, + gray: { stroke: "#495057", fill: "#dee2e6" }, + yellow: { stroke: "#e67700", fill: "#ffe066" }, + red: { stroke: "#c92a2a", fill: "#ffc9c9" }, + teal: { stroke: "#0c8599", fill: "#99e9f2" } +}; + +// ============================================================================= +// ELEMENT FACTORIES +// ============================================================================= + +let seedCounter = 1; +function nextSeed() { + return seedCounter++; +} + +/** + * Reset seed counter (call before generating a new diagram) + */ +export function resetSeeds() { + seedCounter = 1; +} + +/** + * Create base element properties + */ +function baseElement(id, type, x, y) { + return { + id, + type, + x, + y, + strokeColor: "#1e1e1e", + backgroundColor: "transparent", + fillStyle: DEFAULTS.fillStyle, + strokeWidth: DEFAULTS.strokeWidth, + roughness: DEFAULTS.roughness, + opacity: 100, + angle: 0, + seed: nextSeed(), + version: 1, + versionNonce: nextSeed(), + isDeleted: false, + boundElements: null, + updated: Date.now(), + link: null, + locked: false + }; +} + +/** + * Create a rectangle shape with adaptive width and height (text wrapping) + * @param {string} id - Element ID + * @param {number} x - X position + * @param {number} y - Y position + * @param {number|null} width - Width (null = auto-calculate from label) + * @param {number|null} height - Height (null = auto-calculate from wrapped text) + * @param {string} color - Color name from palette + * @param {string} label - Text label (will wrap if too long) + * @param {Object} options - {maxCharsPerLine: number} for custom wrapping + * @returns {Object} - { elements: Array, height: number } for downstream calculations + */ +export function rectangle(id, x, y, width, height, color = "blue", label = null, options = {}) { + const elements = []; + const colorScheme = COLORS[color] || COLORS.blue; + const maxCharsPerLine = options.maxCharsPerLine || 18; + const 
lineHeight = 22; + const baseHeight = 50; + + // Wrap label instead of truncating + const wrapped = label ? wrapText(label, maxCharsPerLine) : { text: '', lines: 1, maxLineLength: 0 }; + + // Auto-calculate width based on longest line + const textWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding; + const autoWidth = Math.max(DEFAULTS.minBoxWidth, Math.min(DEFAULTS.maxBoxWidth, textWidth)); + const finalWidth = width === null ? autoWidth : Math.max(width, autoWidth); + + // Auto-calculate height based on number of lines + const autoHeight = baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight; + const finalHeight = height === null ? autoHeight : Math.max(height, autoHeight); + + elements.push({ + ...baseElement(id, "rectangle", x, y), + width: finalWidth, + height: finalHeight, + strokeColor: colorScheme.stroke, + backgroundColor: colorScheme.fill, + roundness: { type: 3 } + }); + + if (wrapped.text) { + const labelWidth = wrapped.maxLineLength * DEFAULTS.charWidth; + const labelHeight = wrapped.lines * (DEFAULTS.fontSize + 4); + elements.push({ + ...baseElement(`${id}-text`, "text", x + finalWidth/2 - labelWidth/2, y + finalHeight/2 - labelHeight/2), + width: labelWidth, + height: labelHeight, + text: wrapped.text, + fontSize: DEFAULTS.fontSize, + fontFamily: DEFAULTS.fontFamily, + textAlign: "center", + verticalAlign: "middle", + strokeColor: "#1e1e1e", + backgroundColor: "transparent" + }); + } + + // Return elements AND computed height for templates that need it + elements.computedHeight = finalHeight; + elements.computedWidth = finalWidth; + return elements; +} + +/** + * Create an ellipse/oval shape with adaptive width and height (text wrapping) + */ +export function ellipse(id, x, y, width, height, color = "green", label = null, options = {}) { + const elements = []; + const colorScheme = COLORS[color] || COLORS.green; + const maxCharsPerLine = options.maxCharsPerLine || 16; + const lineHeight = 22; + const baseHeight = 60; + + // 
Wrap label instead of truncating + const wrapped = label ? wrapText(label, maxCharsPerLine) : { text: '', lines: 1, maxLineLength: 0 }; + + // Auto-calculate width based on longest line (ellipses need more padding) + const textWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding + 20; + const autoWidth = Math.max(DEFAULTS.minBoxWidth + 20, Math.min(DEFAULTS.maxBoxWidth + 40, textWidth)); + const finalWidth = width === null ? autoWidth : Math.max(width, autoWidth); + + // Auto-calculate height based on number of lines + const autoHeight = baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight; + const finalHeight = height === null ? autoHeight : Math.max(height, autoHeight); + + elements.push({ + ...baseElement(id, "ellipse", x, y), + width: finalWidth, + height: finalHeight, + strokeColor: colorScheme.stroke, + backgroundColor: colorScheme.fill + }); + + if (wrapped.text) { + const labelWidth = wrapped.maxLineLength * DEFAULTS.charWidth; + const labelHeight = wrapped.lines * (DEFAULTS.fontSize + 4); + elements.push({ + ...baseElement(`${id}-text`, "text", x + finalWidth/2 - labelWidth/2, y + finalHeight/2 - labelHeight/2), + width: labelWidth, + height: labelHeight, + text: wrapped.text, + fontSize: DEFAULTS.fontSize, + fontFamily: DEFAULTS.fontFamily, + textAlign: "center", + verticalAlign: "middle", + strokeColor: "#1e1e1e", + backgroundColor: "transparent" + }); + } + + elements.computedHeight = finalHeight; + elements.computedWidth = finalWidth; + return elements; +} + +/** + * Create a diamond shape with adaptive width and height (text wrapping) + * Diamonds have less usable space due to their shape, so use shorter lines + */ +export function diamond(id, x, y, width, height, color = "pink", label = null, options = {}) { + const elements = []; + const colorScheme = COLORS[color] || COLORS.pink; + const maxCharsPerLine = options.maxCharsPerLine || 12; // Shorter lines for diamonds + const lineHeight = 18; + const baseHeight = 70; + + // 
Wrap label instead of truncating + const wrapped = label ? wrapText(label, maxCharsPerLine) : { text: '', lines: 1, maxLineLength: 0 }; + + // Auto-calculate width based on longest line (diamonds need more padding due to shape) + const textWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding + 40; + const autoWidth = Math.max(DEFAULTS.minBoxWidth + 20, Math.min(DEFAULTS.maxBoxWidth + 60, textWidth)); + const finalWidth = width === null ? autoWidth : Math.max(width, autoWidth); + + // Auto-calculate height based on number of lines (diamonds need extra height) + const autoHeight = baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight; + const finalHeight = height === null ? autoHeight : Math.max(height, autoHeight); + + elements.push({ + ...baseElement(id, "diamond", x, y), + width: finalWidth, + height: finalHeight, + strokeColor: colorScheme.stroke, + backgroundColor: colorScheme.fill + }); + + if (wrapped.text) { + const labelWidth = wrapped.maxLineLength * DEFAULTS.charWidth; + const labelHeight = wrapped.lines * (DEFAULTS.smallFontSize + 6); + elements.push({ + ...baseElement(`${id}-text`, "text", x + finalWidth/2 - labelWidth/2, y + finalHeight/2 - labelHeight/2), + width: labelWidth, + height: labelHeight, + text: wrapped.text, + fontSize: DEFAULTS.smallFontSize + 2, + fontFamily: DEFAULTS.fontFamily, + textAlign: "center", + verticalAlign: "middle", + strokeColor: "#1e1e1e", + backgroundColor: "transparent" + }); + } + + elements.computedHeight = finalHeight; + elements.computedWidth = finalWidth; + return elements; +} + +/** + * Create an arrow between two points + */ +export function arrow(id, startX, startY, endX, endY, bidirectional = false) { + const width = endX - startX; + const height = endY - startY; + + return [{ + ...baseElement(id, "arrow", startX, startY), + width: Math.abs(width), + height: Math.abs(height), + points: [[0, 0], [width, height]], + lastCommittedPoint: null, + startBinding: null, + endBinding: null, + 
startArrowhead: bidirectional ? "arrow" : null, + endArrowhead: "arrow" + }]; +} + +/** + * Create a text label + */ +export function text(id, x, y, content, fontSize = DEFAULTS.fontSize, color = "#1e1e1e") { + const lines = content.split('\n'); + const maxLineLength = Math.max(...lines.map(l => l.length)); + + return [{ + ...baseElement(id, "text", x, y), + width: maxLineLength * (fontSize * 0.6), + height: lines.length * (fontSize + 4), + text: content, + fontSize, + fontFamily: DEFAULTS.fontFamily, + textAlign: "left", + verticalAlign: "top", + strokeColor: color, + backgroundColor: "transparent" + }]; +} + +// ============================================================================= +// HIGH-LEVEL TEMPLATES +// ============================================================================= + +/** + * Create a linear flowchart (left to right) with adaptive widths + * + * SIMPLE API: Just pass an array of labels! + * @param {Array|Array} nodes - Array of labels OR {id, label, color, shape} objects + * @example flowchartLR(['Input', 'Process', 'Output']) + * @example flowchartLR([{label: 'Start', shape: 'ellipse'}, {label: 'End'}]) + */ +export function flowchartLR(nodes) { + resetSeeds(); + const elements = []; + + // Normalize input - accept simple strings or objects + const normalizedNodes = nodes.map((node, i) => { + if (typeof node === 'string') { + return { id: `node-${i}`, label: node }; + } + return { id: node.id || `node-${i}`, ...node }; + }); + + const nodeHeight = 60; + const gap = 60; + const startX = 50; + const startY = 80; + + // First pass: calculate widths for each node + const nodeWidths = normalizedNodes.map(node => { + const displayLabel = truncateLabel(node.label); + return calcWidth(displayLabel); + }); + + // Second pass: create elements with proper positioning + let currentX = startX; + + normalizedNodes.forEach((node, i) => { + const nodeWidth = nodeWidths[i]; + const shape = node.shape || "rectangle"; + const color = node.color || 
["blue", "green", "orange", "teal"][i % 4]; + + if (shape === "ellipse") { + elements.push(...ellipse(node.id, currentX, startY, nodeWidth, nodeHeight, color, node.label)); + } else if (shape === "diamond") { + elements.push(...diamond(node.id, currentX, startY - 10, nodeWidth, nodeHeight + 20, color, node.label)); + } else { + elements.push(...rectangle(node.id, currentX, startY, nodeWidth, nodeHeight, color, node.label)); + } + + // Add arrow to next node + if (i < normalizedNodes.length - 1) { + const nextNodeX = currentX + nodeWidth + gap; + elements.push(...arrow( + `arrow-${i}`, + currentX + nodeWidth + 5, + startY + nodeHeight / 2, + nextNodeX - 5, + startY + nodeHeight / 2 + )); + } + + currentX += nodeWidth + gap; + }); + + return wrapScene(elements); +} + +/** + * Create a vertical flowchart (top to bottom) + * @param {Array} nodes - Array of {id, label, color, shape} objects + */ +export function flowchartTB(nodes, startX = 150, startY = 50) { + resetSeeds(); + const elements = []; + const nodeWidth = 140; + const nodeHeight = 50; + const gap = 60; + + nodes.forEach((node, i) => { + const y = startY + i * (nodeHeight + gap); + const shape = node.shape || "rectangle"; + const color = node.color || "blue"; + + if (shape === "ellipse") { + elements.push(...ellipse(node.id, startX, y, nodeWidth, nodeHeight, color, node.label)); + } else if (shape === "diamond") { + elements.push(...diamond(node.id, startX + 10, y - 10, nodeWidth - 20, nodeHeight + 20, color, node.label)); + } else { + elements.push(...rectangle(node.id, startX, y, nodeWidth, nodeHeight, color, node.label)); + } + + // Add arrow to next node + if (i < nodes.length - 1) { + elements.push(...arrow( + `arrow-${i}`, + startX + nodeWidth / 2, + y + nodeHeight + 5, + startX + nodeWidth / 2, + y + nodeHeight + gap - 5 + )); + } + }); + + return wrapScene(elements); +} + +/** + * Create an architecture diagram with a central component and surrounding services + * @param {Object} config - {center: 
{label, color}, services: [{label, color}]} + */ +export function architecture(config) { + resetSeeds(); + const elements = []; + const centerX = 250; + const centerY = 150; + const centerWidth = 160; + const centerHeight = 80; + const serviceWidth = 100; + const serviceHeight = 50; + const radius = 180; + + // Central component + const centerColor = config.center?.color || "green"; + elements.push(...ellipse("center", centerX, centerY, centerWidth, centerHeight, centerColor, config.center?.label || "Core")); + + // Surrounding services + const services = config.services || []; + const angleStep = (2 * Math.PI) / services.length; + + services.forEach((service, i) => { + const angle = -Math.PI / 2 + i * angleStep; // Start from top + const x = centerX + centerWidth/2 - serviceWidth/2 + radius * Math.cos(angle); + const y = centerY + centerHeight/2 - serviceHeight/2 + radius * Math.sin(angle); + + elements.push(...rectangle(service.id || `service-${i}`, x, y, serviceWidth, serviceHeight, service.color || "blue", service.label)); + + // Arrow to center + const arrowStartX = x + serviceWidth/2; + const arrowStartY = y + serviceHeight/2; + const arrowEndX = centerX + centerWidth/2; + const arrowEndY = centerY + centerHeight/2; + + // Shorten arrow to not overlap shapes + const dx = arrowEndX - arrowStartX; + const dy = arrowEndY - arrowStartY; + const len = Math.sqrt(dx*dx + dy*dy); + const shortenStart = 35; + const shortenEnd = 50; + + elements.push(...arrow( + `arrow-${i}`, + arrowStartX + (dx/len) * shortenStart, + arrowStartY + (dy/len) * shortenStart, + arrowEndX - (dx/len) * shortenEnd, + arrowEndY - (dy/len) * shortenEnd + )); + }); + + return wrapScene(elements); +} + +/** + * Create a process/pipeline diagram + * @param {Array} steps - Array of {label, color, description} objects + */ +export function process(steps, startX = 50, startY = 100) { + resetSeeds(); + const elements = []; + const stepWidth = 120; + const stepHeight = 60; + const gap = 60; + + 
steps.forEach((step, i) => { + const x = startX + i * (stepWidth + gap); + const color = step.color || ["blue", "green", "orange", "teal"][i % 4]; + + elements.push(...rectangle(step.id || `step-${i}`, x, startY, stepWidth, stepHeight, color, step.label)); + + // Add description below if provided + if (step.description) { + elements.push(...text( + `desc-${i}`, + x, + startY + stepHeight + 10, + step.description, + DEFAULTS.smallFontSize, + "#868e96" + )); + } + + // Add arrow to next step + if (i < steps.length - 1) { + elements.push(...arrow( + `arrow-${i}`, + x + stepWidth + 5, + startY + stepHeight / 2, + x + stepWidth + gap - 5, + startY + stepHeight / 2 + )); + } + }); + + return wrapScene(elements); +} + +/** + * Create a comparison diagram (two columns) + * @param {Object} left - {title, items: [string]} + * @param {Object} right - {title, items: [string]} + */ +export function comparison(left, right) { + resetSeeds(); + const elements = []; + const colWidth = 180; + const titleHeight = 50; + const itemHeight = 35; + const gap = 100; + const startX = 50; + const startY = 50; + + // Left column + elements.push(...rectangle("left-title", startX, startY, colWidth, titleHeight, left.color || "blue", left.title)); + + (left.items || []).forEach((item, i) => { + const y = startY + titleHeight + 20 + i * (itemHeight + 10); + elements.push(...rectangle(`left-item-${i}`, startX, y, colWidth, itemHeight, "gray", item)); + }); + + // Right column + const rightX = startX + colWidth + gap; + elements.push(...rectangle("right-title", rightX, startY, colWidth, titleHeight, right.color || "green", right.title)); + + (right.items || []).forEach((item, i) => { + const y = startY + titleHeight + 20 + i * (itemHeight + 10); + elements.push(...rectangle(`right-item-${i}`, rightX, y, colWidth, itemHeight, "gray", item)); + }); + + // VS text in the middle + elements.push(...text("vs", startX + colWidth + gap/2 - 15, startY + titleHeight/2 - 10, "vs", 20, "#868e96")); + + return 
wrapScene(elements); +} + +/** + * Create a simple box diagram with labeled components + * @param {Array} boxes - Array of {id, label, x, y, width, height, color} objects + * @param {Array} arrows - Array of {from, to, bidirectional} objects + */ +export function custom(boxes, connections = []) { + resetSeeds(); + const elements = []; + const boxMap = {}; + + // Create boxes + boxes.forEach(box => { + const width = box.width || 120; + const height = box.height || 60; + const shape = box.shape || "rectangle"; + + boxMap[box.id] = { x: box.x, y: box.y, width, height }; + + if (shape === "ellipse") { + elements.push(...ellipse(box.id, box.x, box.y, width, height, box.color || "blue", box.label)); + } else if (shape === "diamond") { + elements.push(...diamond(box.id, box.x, box.y, width, height, box.color || "pink", box.label)); + } else { + elements.push(...rectangle(box.id, box.x, box.y, width, height, box.color || "blue", box.label)); + } + }); + + // Create connections + connections.forEach((conn, i) => { + const from = boxMap[conn.from]; + const to = boxMap[conn.to]; + + if (from && to) { + // Calculate connection points (center to center, shortened) + const fromCenterX = from.x + from.width / 2; + const fromCenterY = from.y + from.height / 2; + const toCenterX = to.x + to.width / 2; + const toCenterY = to.y + to.height / 2; + + const dx = toCenterX - fromCenterX; + const dy = toCenterY - fromCenterY; + const len = Math.sqrt(dx*dx + dy*dy); + + const startOffset = Math.min(from.width, from.height) / 2 + 5; + const endOffset = Math.min(to.width, to.height) / 2 + 5; + + elements.push(...arrow( + conn.id || `conn-${i}`, + fromCenterX + (dx/len) * startOffset, + fromCenterY + (dy/len) * startOffset, + toCenterX - (dx/len) * endOffset, + toCenterY - (dy/len) * endOffset, + conn.bidirectional || false + )); + } + }); + + return wrapScene(elements); +} + +// ============================================================================= +// CREATIVE TEMPLATES (NEW) +// 
============================================================================= + +/** + * Create a cycle/loop diagram (circular flow) + * Perfect for feedback loops, ReAct patterns, iterative processes + * + * SIMPLE API: Just pass an array of labels! + * @param {Array|Array} nodes - Array of labels OR {id, label, color, shape} objects + * @example cycle(['Observe', 'Think', 'Act']) + * @example cycle([{label: 'Step 1', color: 'blue'}, {label: 'Step 2', color: 'green'}]) + */ +export function cycle(nodes) { + resetSeeds(); + const elements = []; + + // Normalize input - accept simple strings or objects + const normalizedNodes = nodes.map((node, i) => { + if (typeof node === 'string') { + return { id: `node-${i}`, label: node }; + } + return { id: node.id || `node-${i}`, ...node }; + }); + + const nodeCount = normalizedNodes.length; + + // First pass: pre-calculate actual dimensions using wrapText (matching what rectangle() does) + const nodeDimensions = normalizedNodes.map(node => { + const wrapped = wrapText(node.label, 18); // Same maxCharsPerLine as rectangle() + const textWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding; + const width = Math.max(DEFAULTS.minBoxWidth, Math.min(DEFAULTS.maxBoxWidth, textWidth)); + const baseHeight = 50; + const lineHeight = 22; + const height = baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight; + return { width, height }; + }); + + const maxWidth = Math.max(...nodeDimensions.map(d => d.width)); + const maxHeight = Math.max(...nodeDimensions.map(d => d.height)); + + // Calculate radius based on actual dimensions - tighter spacing for compact layout + const baseRadius = 70 + Math.min(nodeCount, 6) * 14; // Reduced by ~30% + const radius = baseRadius + maxWidth / 2; + + // Center position ensuring all nodes have positive coordinates + const centerX = radius + maxWidth / 2 + 60; + const centerY = radius + maxHeight / 2 + 60; + + // Place nodes in a circle (clockwise from top) + const angleStep = (2 * 
Math.PI) / nodeCount; + const nodePositions = []; + + normalizedNodes.forEach((node, i) => { + // Start from top, go clockwise + const angle = -Math.PI / 2 + i * angleStep; + const dims = nodeDimensions[i]; + + // Position node center on the circle + const nodeCenterX = centerX + radius * Math.cos(angle); + const nodeCenterY = centerY + radius * Math.sin(angle); + + // Top-left corner for shape placement + const x = nodeCenterX - dims.width / 2; + const y = nodeCenterY - dims.height / 2; + + const shape = node.shape || "rectangle"; + const color = node.color || ["blue", "green", "orange", "teal", "pink", "yellow"][i % 6]; + + let shapeElements; + if (shape === "ellipse") { + shapeElements = ellipse(node.id, x, y, dims.width, dims.height, color, node.label); + } else if (shape === "diamond") { + shapeElements = diamond(node.id, x, y, dims.width, dims.height, color, node.label); + } else { + shapeElements = rectangle(node.id, x, y, dims.width, dims.height, color, node.label); + } + + // Use actual computed dimensions from shape factory + const actualWidth = shapeElements.computedWidth || dims.width; + const actualHeight = shapeElements.computedHeight || dims.height; + + // Recalculate center based on actual dimensions + const actualCenterX = x + actualWidth / 2; + const actualCenterY = y + actualHeight / 2; + + nodePositions.push({ + cx: actualCenterX, + cy: actualCenterY, + x, + y, + angle, + width: actualWidth, + height: actualHeight + }); + + elements.push(...shapeElements); + }); + + // Create arrows between consecutive nodes (clockwise: node -> next node) + for (let i = 0; i < nodeCount; i++) { + const from = nodePositions[i]; + const to = nodePositions[(i + 1) % nodeCount]; + + // Calculate direction from this node to next node + const dx = to.cx - from.cx; + const dy = to.cy - from.cy; + const dist = Math.sqrt(dx * dx + dy * dy); + const ux = dx / dist; // unit vector x + const uy = dy / dist; // unit vector y + + // Calculate intersection with box edges 
/**
 * Distance from a node's center to its outline along a unit direction.
 *
 * Lets arrows start/end on the visible outline of a node whatever its shape.
 *
 * @param {string} shape - "ellipse" or "diamond"; any other value is treated as a rectangle
 * @param {number} ux - X component of the unit direction vector
 * @param {number} uy - Y component of the unit direction vector
 * @param {number} halfW - Half of the node's width
 * @param {number} halfH - Half of the node's height
 * @returns {number} t such that (ux * t, uy * t), measured from the center, lies on the outline
 */
function outlineOffset(shape, ux, uy, halfW, halfH) {
  if (shape === "ellipse") {
    // Solve (ux*t/a)^2 + (uy*t/b)^2 = 1 for t.
    return 1 / Math.sqrt((ux * ux) / (halfW * halfW) + (uy * uy) / (halfH * halfH));
  }
  if (shape === "diamond") {
    // Solve |ux*t|/a + |uy*t|/b = 1 for t (vertices sit on the box mid-sides).
    return 1 / (Math.abs(ux) / halfW + Math.abs(uy) / halfH);
  }
  // Ray-box intersection. A near-zero component never reaches that pair of
  // sides, so it is treated as "infinitely far" instead of dividing by ~0.
  const tX = Math.abs(ux) > 0.001 ? halfW / Math.abs(ux) : Infinity;
  const tY = Math.abs(uy) > 0.001 ? halfH / Math.abs(uy) : Infinity;
  return Math.min(tX, tY);
}

/**
 * Create a radial/hub-and-spoke diagram.
 * Perfect for showing a central concept with related components.
 *
 * SIMPLE API: pass a center label and an array of satellite labels.
 *
 * Satellites are spread evenly on a circle around the hub, starting at the top
 * and going clockwise. All satellites share the width of the widest one.
 *
 * @param {string|Object} center - Center label OR {label, color, shape} object.
 *   `shape` "rectangle" draws a box; any other value draws an ellipse.
 * @param {Array<string|Object>} satellites - Labels OR {id, label, color, shape} objects.
 *   `shape` may be "rectangle" (default), "ellipse" or "diamond".
 * @param {Object} [options]
 * @param {"inward"|"outward"|"both"} [options.arrowDirection="inward"] - "inward" points
 *   from satellite to center, "outward" the reverse, "both" draws a bidirectional arrow.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example radial('Core', ['Feature 1', 'Feature 2', 'Feature 3'])
 */
export function radial(center, satellites, options = {}) {
  resetSeeds();
  const elements = [];

  // Accept a bare label or a descriptor object for the hub.
  const hub = typeof center === 'string' ? { label: center } : center;

  // Accept bare labels or descriptor objects for the satellites. The id is
  // written AFTER the spread so that a missing/empty id on the input can never
  // override the `sat-<index>` fallback.
  const spokes = satellites.map((sat, i) =>
    typeof sat === 'string'
      ? { id: `sat-${i}`, label: sat }
      : { ...sat, id: sat.id || `sat-${i}` }
  );

  // DYNAMIC SIZING: widths follow the (truncated) label text.
  const hubLabel = truncateLabel(hub.label);
  const centerWidth = Math.max(120, calcWidth(hubLabel) + 20); // extra padding for the ellipse
  const centerHeight = 70;

  const spokeWidths = spokes.map((sat) => calcWidth(truncateLabel(sat.label)));
  const maxSatelliteWidth = Math.max(...spokeWidths, 100);
  const satelliteHeight = 50;

  // Grow the circle with the content so wide boxes do not overlap the hub.
  const baseRadius = 140;
  const radius =
    baseRadius +
    Math.max(0, (maxSatelliteWidth - 100) / 2) +
    Math.max(0, (centerWidth - 120) / 2);

  // Place the hub far enough from the origin that every satellite has
  // positive coordinates.
  const padding = 60;
  const centerX = radius + maxSatelliteWidth / 2 + padding;
  const centerY = radius + satelliteHeight / 2 + padding;
  const arrowDirection = options.arrowDirection || "inward";

  // Central node (ellipse unless a rectangle is requested).
  const hubColor = hub.color || "green";
  const hubShape = hub.shape === "rectangle" ? "rectangle" : "ellipse";
  const drawHub = hubShape === "rectangle" ? rectangle : ellipse;
  elements.push(...drawHub(
    "center",
    centerX - centerWidth / 2,
    centerY - centerHeight / 2,
    centerWidth,
    centerHeight,
    hubColor,
    hub.label
  ));

  const palette = ["blue", "orange", "teal", "pink", "yellow", "red"];
  const angleStep = (2 * Math.PI) / spokes.length;
  const satGap = 12; // clearance between satellite outline and arrow
  const hubGap = 14; // clearance between hub outline and arrow

  spokes.forEach((sat, i) => {
    const satWidth = maxSatelliteWidth;
    const angle = -Math.PI / 2 + i * angleStep; // start from the top
    const x = centerX + radius * Math.cos(angle) - satWidth / 2;
    const y = centerY + radius * Math.sin(angle) - satelliteHeight / 2;

    const color = sat.color || palette[i % 6];
    const shape = sat.shape || "rectangle";

    if (shape === "ellipse") {
      elements.push(...ellipse(sat.id, x, y, satWidth, satelliteHeight, color, sat.label));
    } else if (shape === "diamond") {
      elements.push(...diamond(sat.id, x, y, satWidth, satelliteHeight, color, sat.label));
    } else {
      elements.push(...rectangle(sat.id, x, y, satWidth, satelliteHeight, color, sat.label));
    }

    // Unit vector from the satellite's center towards the hub's center.
    const satCenterX = x + satWidth / 2;
    const satCenterY = y + satelliteHeight / 2;
    const dx = centerX - satCenterX;
    const dy = centerY - satCenterY;
    const len = Math.sqrt(dx * dx + dy * dy);
    const ux = dx / len;
    const uy = dy / len;

    // Where the center-to-center line crosses each node's actual outline.
    // (Previously the satellite was always treated as a box and the hub as an
    // ellipse, which misplaced arrows for any other shape.)
    const satOffset = outlineOffset(shape, ux, uy, satWidth / 2, satelliteHeight / 2);
    const hubOffset = outlineOffset(hubShape, ux, uy, centerWidth / 2, centerHeight / 2);

    const satEdgeX = satCenterX + ux * satOffset;
    const satEdgeY = satCenterY + uy * satOffset;
    const hubEdgeX = centerX - ux * hubOffset;
    const hubEdgeY = centerY - uy * hubOffset;

    // Pull both ends back a little so arrowheads do not bleed into the shapes.
    const satEndX = satEdgeX + ux * satGap;
    const satEndY = satEdgeY + uy * satGap;
    const hubEndX = hubEdgeX - ux * hubGap;
    const hubEndY = hubEdgeY - uy * hubGap;

    const outward = arrowDirection === "outward";
    const isBidirectional = arrowDirection === "both";

    elements.push(...arrow(
      `arrow-${i}`,
      outward ? hubEndX : satEndX,
      outward ? hubEndY : satEndY,
      outward ? satEndX : hubEndX,
      outward ? satEndY : hubEndY,
      isBidirectional
    ));
  });

  return wrapScene(elements);
}
/**
 * Create a hierarchy/tree diagram (top-down).
 * Perfect for org charts, taxonomies, decision trees.
 *
 * SIMPLE API: pass a root label and an array of child labels.
 *
 * The root is centered above a single row of children; an arrow runs from the
 * bottom-center of the root to the top-center of every child.
 *
 * @param {string|Object} root - Root label OR {label, color, shape} for the root node
 *   (`shape` "ellipse" draws an ellipse, anything else a rectangle)
 * @param {Array<string|Object>} children - Labels OR {label, color, shape} objects
 *   (`shape` may be "ellipse" or "diamond"; anything else is a rectangle).
 *   Only this one level is rendered; a nested `children` property is not read.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example hierarchy('Parent', ['Child 1', 'Child 2', 'Child 3'])
 */
export function hierarchy(root, children) {
  resetSeeds();

  // Layout constants
  const NODE_BASE_HEIGHT = 50;
  const SIBLING_GAP = 60;       // horizontal gap between siblings
  const LEVEL_GAP = 80;         // vertical gap between root and children
  const ORIGIN_X = 50;
  const ORIGIN_Y = 50;
  const WRAP_AT = 18;           // matches rectangle()'s default wrap width
  const EXTRA_LINE_HEIGHT = 22; // added per wrapped line beyond the first
  const CHILD_PALETTE = ["blue", "orange", "teal", "pink"];

  const asNode = (value) => (typeof value === 'string' ? { label: value } : value);

  // Box size for a label, mirroring the wrapping/clamping rectangle() applies.
  const measure = (label) => {
    const wrapped = wrapText(label, WRAP_AT);
    const rawWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding;
    return {
      width: Math.max(DEFAULTS.minBoxWidth, Math.min(DEFAULTS.maxBoxWidth, rawWidth)),
      height: NODE_BASE_HEIGHT + Math.max(0, wrapped.lines - 1) * EXTRA_LINE_HEIGHT,
      wrapped
    };
  };

  const rootNode = asNode(root);
  const childNodes = children.map((child) => asNode(child));

  const rootSize = measure(rootNode.label);
  const childSizes = childNodes.map((child) => measure(child.label));

  // Width of the whole child row: box widths plus the gaps between them.
  const rowWidth =
    childSizes.reduce((sum, size) => sum + size.width, 0) +
    (childSizes.length - 1) * SIBLING_GAP;

  // Root sits centered above the child row.
  const rootX = ORIGIN_X + rowWidth / 2 - rootSize.width / 2;
  const rootY = ORIGIN_Y;
  const drawRoot = rootNode.shape === "ellipse" ? ellipse : rectangle;

  const elements = [
    ...drawRoot("root", rootX, rootY, rootSize.width, rootSize.height, rootNode.color || "green", rootNode.label)
  ];

  const rootCenterX = rootX + rootSize.width / 2;
  const rootBottomY = rootY + rootSize.height + 5;
  const childY = ORIGIN_Y + rootSize.height + LEVEL_GAP;

  let cursorX = ORIGIN_X;
  for (const [index, node] of childNodes.entries()) {
    const size = childSizes[index];
    const nodeId = `root-${index}`;
    const fill = node.color || CHILD_PALETTE[index % 4];

    let drawChild = rectangle;
    if (node.shape === "ellipse") {
      drawChild = ellipse;
    } else if (node.shape === "diamond") {
      drawChild = diamond;
    }
    const childElements = drawChild(nodeId, cursorX, childY, size.width, size.height, fill, node.label);

    // Prefer the width reported by the shape factory, when it reports one.
    const placedWidth = childElements.computedWidth || size.width;

    elements.push(...childElements);
    elements.push(...arrow(
      `arrow-${nodeId}`,
      rootCenterX,
      rootBottomY,
      cursorX + placedWidth / 2,
      childY - 5
    ));

    cursorX += placedWidth + SIBLING_GAP;
  }

  return wrapScene(elements);
}
/**
 * Create a timeline diagram with numbered steps.
 * Perfect for sequences and processes with clear phases.
 *
 * SIMPLE API: just pass an array of labels.
 *
 * String steps support two shorthands:
 *   - "year|description" puts `year` inside the circle and `description` below it
 *   - " // " inside a label forces a line break
 *
 * @param {Array<string|Object>} steps - Labels OR {label, year?, description?, color?} objects.
 *   `year` may be a string or a number; when absent the 1-based step number is shown.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example timeline(['Step 1', 'Step 2', 'Step 3'])
 * @example timeline([{label: 'Start', color: 'green'}, {label: 'End', color: 'blue'}])
 */
export function timeline(steps) {
  resetSeeds();
  const elements = [];

  // Normalize input - accept simple strings or objects.
  const normalizedSteps = steps.map((step) => {
    if (typeof step !== 'string') {
      return step;
    }
    const pipeIndex = step.indexOf('|');
    if (pipeIndex === -1) {
      return { label: step };
    }
    return {
      year: step.substring(0, pipeIndex).trim(),
      label: step.substring(pipeIndex + 1).trim()
    };
  });

  const circleSize = 50;
  const gap = 100;
  const startX = 60;
  const startY = 60;
  const lineY = startY + circleSize / 2;
  const palette = ["blue", "green", "orange", "teal", "pink"];

  // Connecting line first, so it sits behind the circles.
  if (normalizedSteps.length > 1) {
    const lineStartX = startX + circleSize / 2;
    const lineEndX = startX + (normalizedSteps.length - 1) * (circleSize + gap) + circleSize / 2;

    elements.push({
      ...baseElement("timeline-line", "line", lineStartX, lineY),
      width: lineEndX - lineStartX,
      height: 0,
      points: [[0, 0], [lineEndX - lineStartX, 0]],
      strokeColor: "#868e96",
      strokeWidth: 3
    });
  }

  normalizedSteps.forEach((step, i) => {
    const x = startX + i * (circleSize + gap);
    const color = step.color || palette[i % 5];
    const colorScheme = COLORS[color] || COLORS.blue;

    // Circle
    elements.push({
      ...baseElement(`step-circle-${i}`, "ellipse", x, startY),
      width: circleSize,
      height: circleSize,
      strokeColor: colorScheme.stroke,
      backgroundColor: colorScheme.fill
    });

    // Circle content: the year if one was given, otherwise the step number.
    // The year is coerced to a string because its length drives the text width;
    // a numeric year used to yield NaN coordinates.
    const hasYear = step.year !== undefined && step.year !== null && step.year !== '';
    const circleText = hasYear ? String(step.year) : String(i + 1);
    const circleTextWidth = circleText.length * 8; // approximate glyph width
    elements.push({
      ...baseElement(`step-num-${i}`, "text", x + circleSize / 2 - circleTextWidth / 2, startY + circleSize / 2 - 12),
      width: circleTextWidth,
      height: 24,
      text: circleText,
      fontSize: hasYear ? 14 : 20, // smaller font so a year fits in the circle
      fontFamily: DEFAULTS.fontFamily,
      textAlign: "center",
      verticalAlign: "middle",
      strokeColor: "#1e1e1e"
    });

    // Label below the circle: "//" becomes an explicit line break, then each
    // segment is wrapped on its own. A missing label renders as empty text
    // instead of throwing.
    const rawLabel = step.label === undefined || step.label === null ? '' : String(step.label);
    const finalText = rawLabel
      .replace(/\s*\/\/\s*/g, '\n')
      .split('\n')
      .map((segment) => wrapText(segment, 12).text)
      .join('\n');
    const lines = finalText.split('\n');
    const maxLineLength = Math.max(...lines.map((line) => line.length));

    const labelWidth = maxLineLength * DEFAULTS.charWidth;
    const labelHeight = lines.length * (DEFAULTS.fontSize + 4);
    elements.push({
      ...baseElement(`step-label-${i}`, "text", x + circleSize / 2 - labelWidth / 2, startY + circleSize + 15),
      width: labelWidth,
      height: labelHeight,
      text: finalText,
      fontSize: DEFAULTS.fontSize,
      fontFamily: DEFAULTS.fontFamily,
      textAlign: "center",
      verticalAlign: "top",
      strokeColor: "#1e1e1e",
      backgroundColor: "transparent"
    });

    // Optional description, placed under the wrapped label.
    if (step.description) {
      const descWrapped = wrapText(step.description, 15);
      const descWidth = descWrapped.maxLineLength * (DEFAULTS.smallFontSize * 0.6);
      const descHeight = descWrapped.lines * (DEFAULTS.smallFontSize + 4);
      elements.push({
        ...baseElement(`step-desc-${i}`, "text", x + circleSize / 2 - descWidth / 2, startY + circleSize + 15 + labelHeight + 5),
        width: descWidth,
        height: descHeight,
        text: descWrapped.text,
        fontSize: DEFAULTS.smallFontSize,
        fontFamily: DEFAULTS.fontFamily,
        textAlign: "center",
        verticalAlign: "top",
        strokeColor: "#868e96",
        backgroundColor: "transparent"
      });
    }
  });

  return wrapScene(elements);
}

/**
 * Create a 2x2 matrix/quadrant diagram.
 * Perfect for comparisons, priority matrices, categorizations.
 *
 * @param {Object} [config] - {topLeft, topRight, bottomLeft, bottomRight, xAxis?, yAxis?}.
 *   Each quadrant is a label string OR {label?, color?, items?}; at most the
 *   first three `items` are shown. A quadrant without a label shows its
 *   position name. `xAxis` / `yAxis` are optional axis captions.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 */
export function matrix(config = {}) {
  resetSeeds();
  const elements = [];
  const cellWidth = 180;
  const cellHeight = 120;
  const cellGap = 20;
  const startX = 80;
  const startY = 60;

  const quadrants = [
    { pos: "topLeft", x: startX, y: startY },
    { pos: "topRight", x: startX + cellWidth + cellGap, y: startY },
    { pos: "bottomLeft", x: startX, y: startY + cellHeight + cellGap },
    { pos: "bottomRight", x: startX + cellWidth + cellGap, y: startY + cellHeight + cellGap }
  ];

  const defaultColors = ["blue", "green", "orange", "teal"];

  // Quadrant boxes
  quadrants.forEach((q, i) => {
    // A bare string is shorthand for {label}, like in the other templates.
    const raw = config[q.pos];
    const data = typeof raw === 'string' ? { label: raw } : (raw || {});
    const color = data.color || defaultColors[i];

    elements.push(...rectangle(
      `quad-${q.pos}`,
      q.x, q.y,
      cellWidth, cellHeight,
      color,
      data.label || q.pos
    ));

    // Up to three bullet items inside the box.
    if (data.items && data.items.length > 0) {
      const itemsText = data.items.slice(0, 3).join("\n");
      elements.push(...text(
        `items-${q.pos}`,
        q.x + 10,
        q.y + 45,
        itemsText,
        DEFAULTS.smallFontSize,
        "#495057"
      ));
    }
  });

  // X axis: caption plus a left-to-right arrow under the grid.
  if (config.xAxis) {
    elements.push(...text(
      "x-axis",
      startX + cellWidth,
      startY + 2 * cellHeight + 50,
      config.xAxis,
      DEFAULTS.fontSize,
      "#495057"
    ));

    elements.push({
      ...baseElement("x-arrow", "arrow", startX - 20, startY + 2 * cellHeight + 35),
      width: 2 * cellWidth + 60,
      height: 0,
      points: [[0, 0], [2 * cellWidth + 60, 0]],
      strokeColor: "#868e96",
      startArrowhead: null,
      endArrowhead: "arrow"
    });
  }

  // Y axis: caption plus a bottom-to-top arrow left of the grid.
  if (config.yAxis) {
    elements.push(...text(
      "y-axis",
      startX - 60,
      startY + cellHeight - 10,
      config.yAxis,
      DEFAULTS.fontSize,
      "#495057"
    ));

    elements.push({
      ...baseElement("y-arrow", "arrow", startX - 20, startY + 2 * cellHeight + 30),
      width: 0,
      height: 2 * cellHeight + 30,
      // Negative y: the arrow is anchored at the bottom and points upwards.
      points: [[0, 0], [0, -(2 * cellHeight + 30)]],
      strokeColor: "#868e96",
      startArrowhead: null,
      endArrowhead: "arrow"
    });
  }

  return wrapScene(elements);
}

/**
 * Create a layered/stack diagram.
 * Perfect for architectures, tech stacks, abstraction layers.
 *
 * SIMPLE API: just pass an array of labels.
 *
 * @param {Array<string|Object>} layerList - Labels OR {id?, label, color?, description?}
 *   objects, listed top to bottom. Boxes share one width and grow in height
 *   when the label wraps; `description` is printed to the right of the box.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example layers(['Presentation', 'Business Logic', 'Data Access', 'Database'])
 */
export function layers(layerList) {
  resetSeeds();
  const elements = [];
  const layerWidth = 300;
  const baseHeight = 50;
  const lineHeight = 22; // extra height per wrapped line
  const gap = 15;
  const startX = 100;
  const startY = 50;
  const maxCharsPerLine = 28; // characters that fit in layerWidth
  const colors = ["blue", "green", "orange", "teal", "pink", "yellow"];

  // Normalize input and pre-compute wrapped text + height for every layer.
  // The id is written AFTER the spread: with the old order a falsy `id` key on
  // the input overrode the fallback and produced "undefined-text" element ids.
  const layerData = layerList.map((layer, i) => {
    const normalized = typeof layer === 'string'
      ? { id: `layer-${i}`, label: layer }
      : { ...layer, id: layer.id || `layer-${i}` };
    const wrapped = wrapText(normalized.label, maxCharsPerLine);
    const height = baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight;
    return { ...normalized, wrapped, height };
  });

  let currentY = startY;

  layerData.forEach((layer, i) => {
    const color = layer.color || colors[i % colors.length];
    const colorScheme = COLORS[color] || COLORS.blue;

    // Box is built by hand (not via rectangle()) to control multi-line text.
    elements.push({
      ...baseElement(layer.id, "rectangle", startX, currentY),
      width: layerWidth,
      height: layer.height,
      strokeColor: colorScheme.stroke,
      backgroundColor: colorScheme.fill,
      roundness: { type: 3 }
    });

    // Wrapped label, centered in the box.
    const textWidth = layer.wrapped.maxLineLength * DEFAULTS.charWidth;
    const textHeight = layer.wrapped.lines * (DEFAULTS.fontSize + 4);
    elements.push({
      ...baseElement(`${layer.id}-text`, "text", startX + layerWidth / 2 - textWidth / 2, currentY + layer.height / 2 - textHeight / 2),
      width: textWidth,
      height: textHeight,
      text: layer.wrapped.text,
      fontSize: DEFAULTS.fontSize,
      fontFamily: DEFAULTS.fontFamily,
      textAlign: "center",
      verticalAlign: "middle",
      strokeColor: "#1e1e1e",
      backgroundColor: "transparent"
    });

    // Optional description to the right of the box.
    if (layer.description) {
      elements.push(...text(
        `layer-desc-${i}`,
        startX + layerWidth + 20,
        currentY + layer.height / 2 - 8,
        layer.description,
        DEFAULTS.smallFontSize,
        "#868e96"
      ));
    }

    currentY += layer.height + gap;
  });

  return wrapScene(elements);
}
/**
 * Create a funnel diagram (wide to narrow stages).
 * Perfect for marketing funnels, sales pipelines, conversion processes.
 *
 * SIMPLE API: just pass an array of stage labels.
 *
 * @param {Array<string|Object>} stages - Stage labels OR {label, color?} objects,
 *   top to bottom (widest first). Widths shrink linearly from 300 to 120.
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example funnel(['Awareness', 'Interest', 'Decision', 'Action'])
 */
export function funnel(stages) {
  resetSeeds();
  const elements = [];

  const normalizedStages = stages.map((stage) =>
    typeof stage === 'string' ? { label: stage } : stage
  );

  const stageCount = normalizedStages.length;
  const maxWidth = 300;
  const minWidth = 120;
  const baseHeight = 50;
  const gap = 12; // vertical gap between stages
  const startY = 50;
  const leftMargin = 50;
  const maxCharsPerLine = 18;
  const lineHeight = 22;
  const stageColors = ['blue', 'teal', 'green', 'orange', 'pink'];

  // Width lost per stage; max(…, 1) avoids dividing by zero for 0 or 1 stages.
  const widthStep = (maxWidth - minWidth) / Math.max(stageCount - 1, 1);

  const stageDims = normalizedStages.map((stage, i) => {
    const wrapped = wrapText(stage.label, maxCharsPerLine);
    return {
      width: maxWidth - (i * widthStep),
      height: baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight,
      wrapped
    };
  });

  // Horizontal position that centers a box of the given width in the funnel.
  const centeredX = (width) => (maxWidth - width) / 2 + leftMargin;

  let currentY = startY;
  const stagePositions = [];

  normalizedStages.forEach((stage, i) => {
    const dims = stageDims[i];
    const color = stage.color || stageColors[i % stageColors.length];
    const x = centeredX(dims.width);

    const shapeElements = rectangle(
      `stage-${i}`,
      x,
      currentY,
      dims.width,
      dims.height,
      color,
      stage.label
    );

    // The shape factory may report a size different from the one requested.
    const actualWidth = shapeElements.computedWidth || dims.width;
    const actualHeight = shapeElements.computedHeight || dims.height;

    // Re-center if the factory changed the width.
    const actualX = centeredX(actualWidth);
    if (actualX !== x) {
      shapeElements.forEach((el) => {
        if (el.x !== undefined) {
          el.x = el.x - x + actualX;
        }
      });
    }

    stagePositions.push({ x: actualX, y: currentY, width: actualWidth, height: actualHeight });
    elements.push(...shapeElements);

    currentY += actualHeight + gap;
  });

  // Vertical arrows between consecutive stages, on the funnel's center line.
  const arrowX = maxWidth / 2 + leftMargin;
  for (let i = 0; i < stageCount - 1; i++) {
    const current = stagePositions[i];
    const next = stagePositions[i + 1];

    elements.push(...arrow(
      `arrow-${i}`,
      arrowX,
      current.y + current.height + 2,
      arrowX,
      next.y - 2
    ));
  }

  return wrapScene(elements);
}

/**
 * Create a mindmap diagram - a central topic with branches to its right.
 *
 * SIMPLE API: first label = center, rest = branches.
 *
 * Branch boxes all share the size of the largest branch and are stacked in a
 * column that is vertically centered on the central ellipse.
 *
 * @param {Array<string>} labels - [center, branch1, branch2, ...]
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example mindmap(['Main Topic', 'Branch A', 'Branch B', 'Branch C'])
 */
export function mindmap(labels) {
  resetSeeds();
  const elements = [];

  if (labels.length === 0) return wrapScene([]);

  const [centerLabel, ...branches] = labels;

  // Size of a node from its wrapped label, clamped to the default box limits.
  const measure = (label, wrapAt, baseHeight, lineHeight) => {
    const wrapped = wrapText(label, wrapAt);
    const textWidth = wrapped.maxLineLength * DEFAULTS.charWidth + DEFAULTS.boxPadding;
    return {
      width: Math.max(DEFAULTS.minBoxWidth, Math.min(DEFAULTS.maxBoxWidth, textWidth)),
      height: baseHeight + Math.max(0, wrapped.lines - 1) * lineHeight,
      wrapped
    };
  };

  // Central ellipse
  const centerDims = measure(centerLabel, 12, 60, 20);
  const centerWidth = centerDims.width;
  const centerHeight = centerDims.height;
  const centerX = 50;
  const centerY = 150;

  elements.push(...ellipse('center', centerX, centerY, centerWidth, centerHeight, 'blue', centerLabel));

  if (branches.length === 0) return wrapScene(elements);

  const branchColors = ['green', 'orange', 'teal', 'pink', 'yellow', 'red'];
  const horizontalGap = 80;

  // Uniform branch size (largest width and height) for a symmetric column.
  const branchDims = branches.map((branch) => measure(branch, 15, 40, 18));
  const maxBranchWidth = Math.max(...branchDims.map((d) => d.width));
  const maxBranchHeight = Math.max(...branchDims.map((d) => d.height));

  // Vertical gap scales with box height (~40%), never below 20px.
  const verticalGap = Math.max(20, Math.round(maxBranchHeight * 0.4));
  const totalBranchHeight = branches.length * maxBranchHeight + (branches.length - 1) * verticalGap;

  const ellipseCenterX = centerX + centerWidth / 2;
  const ellipseCenterY = centerY + centerHeight / 2;
  const ellipseRx = centerWidth / 2;
  const ellipseRy = centerHeight / 2;

  let currentY = ellipseCenterY - totalBranchHeight / 2;
  const branchX = centerX + centerWidth + horizontalGap;
  const arrowGap = 8;

  branches.forEach((branch, i) => {
    const color = branchColors[i % branchColors.length];

    elements.push(...rectangle(`branch-${i}`, branchX, currentY, maxBranchWidth, maxBranchHeight, color, branch));

    // Direction from the ellipse center to the middle of the branch's left edge.
    const branchCenterY = currentY + maxBranchHeight / 2;
    const dx = branchX - ellipseCenterX;
    const dy = branchCenterY - ellipseCenterY;
    const dist = Math.sqrt(dx * dx + dy * dy);
    const ux = dx / dist;
    const uy = dy / dist;

    // Point where that direction leaves the ellipse.
    const t = 1 / Math.sqrt((ux * ux) / (ellipseRx * ellipseRx) + (uy * uy) / (ellipseRy * ellipseRy));
    const ellipseEdgeX = ellipseCenterX + ux * t;
    const ellipseEdgeY = ellipseCenterY + uy * t;

    elements.push(...arrow(
      `arrow-${i}`,
      ellipseEdgeX + ux * arrowGap,
      ellipseEdgeY + uy * arrowGap,
      branchX - arrowGap,
      branchCenterY
    ));

    currentY += maxBranchHeight + verticalGap;
  });

  return wrapScene(elements);
}

/**
 * Create a pyramid/stack diagram - uniform-width boxes stacked vertically.
 * A box gets extra height (and a smaller font) when its label is long.
 *
 * SIMPLE API: labels from top to bottom.
 *
 * @param {Array<string>} levels - Level names from top to bottom
 * @returns {Object} Excalidraw scene produced by wrapScene()
 * @example pyramid(['Peak', 'High', 'Medium', 'Foundation'])
 */
export function pyramid(levels) {
  resetSeeds();
  const elements = [];

  const levelCount = levels.length;
  if (levelCount === 0) return wrapScene([]);

  const boxWidth = 200;
  const baseHeight = 45;
  const gap = 8;
  const startX = 50;
  const startY = 50;
  const textInset = 10; // horizontal padding between box edge and text box
  const colors = ['blue', 'green', 'orange', 'teal', 'pink', 'yellow'];

  // Longest label that still fits comfortably on one line.
  const maxCharsPerLine = 15;

  let currentY = startY;

  levels.forEach((level, i) => {
    const color = colors[i % colors.length];
    const palette = COLORS[color] || COLORS.blue;

    // Long labels get a taller box and a smaller font (the text itself is not
    // re-wrapped here).
    const needsWrap = level.length > maxCharsPerLine;
    const height = needsWrap ? baseHeight + 25 : baseHeight;

    // Rectangle without an embedded label
    elements.push({
      ...baseElement(`level-${i}`, "rectangle", startX, currentY),
      width: boxWidth,
      height: height,
      strokeColor: palette.stroke,
      backgroundColor: palette.fill,
      fillStyle: "hachure",
      roundness: { type: 3 }
    });

    const fontSize = needsWrap ? 16 : DEFAULTS.fontSize;
    const textY = currentY + height / 2 - fontSize / 2;
    const textWidth = boxWidth - 2 * textInset;

    // x is the LEFT edge of the text box (as in the other templates), so the
    // centered text lines up with the middle of the rectangle. It used to be
    // the box's center, which pushed the label over the right edge.
    elements.push({
      ...baseElement(`text-${i}`, "text", startX + textInset, textY),
      text: level,
      fontSize: fontSize,
      fontFamily: 1,
      textAlign: "center",
      verticalAlign: "middle",
      width: textWidth,
      height: fontSize + 4
    });

    currentY += height + gap;
  });

  return wrapScene(elements);
}

// =============================================================================
// SCENE WRAPPER
// =============================================================================

/**
 * Shift elements so that none has an x or y smaller than `padding`.
 *
 * Only elements with numeric x/y take part. The input array is returned
 * untouched when no shift is needed; otherwise shallow copies are returned and
 * the input elements are not mutated.
 *
 * @param {Array} elements - Array of Excalidraw elements
 * @param {number} padding - Minimum distance from the top/left edge
 * @returns {Array} Elements with normalized coordinates
 */
function normalizeCoordinates(elements, padding = 20) {
  if (!elements || elements.length === 0) return elements;

  let minX = Infinity;
  let minY = Infinity;
  for (const el of elements) {
    if (typeof el.x === 'number') minX = Math.min(minX, el.x);
    if (typeof el.y === 'number') minY = Math.min(minY, el.y);
  }

  // Infinity (no numeric coordinate at all) is never < padding, so it yields 0.
  const offsetX = minX < padding ? padding - minX : 0;
  const offsetY = minY < padding ? padding - minY : 0;

  if (offsetX === 0 && offsetY === 0) return elements;

  return elements.map((el) => {
    const shifted = { ...el };
    if (typeof shifted.x === 'number') shifted.x += offsetX;
    if (typeof shifted.y === 'number') shifted.y += offsetY;
    return shifted;
  });
}

/**
 * Wrap elements in a complete Excalidraw scene document.
 *
 * @param {Array} elements - Excalidraw elements; coordinates are normalized first
 * @returns {Object} Scene object with type, version, source, elements, appState and files
 */
export function wrapScene(elements) {
  return {
    type: "excalidraw",
    version: 2,
    source: "https://excalidraw.com",
    elements: normalizeCoordinates(elements),
    appState: DEFAULTS.appState,
    files: {}
  };
}

// =============================================================================
// EXPORTS
// =============================================================================

export default {
  DEFAULTS,
  COLORS,
  resetSeeds,
  rectangle,
  ellipse,
  diamond,
  arrow,
  text,
  // Original templates
  flowchartLR,
  flowchartTB,
  architecture,
  process,
  comparison,
  custom,
  // New creative templates
  cycle,
  radial,
  hierarchy,
  timeline,
  matrix,
  layers,
  funnel,
  mindmap,
  pyramid, // was missing from the default export although it is a named export
  wrapScene
};
+ * + * CLI Usage: + * node excalidraw_to_png.mjs input.excalidraw [output.png] + * + * Module Usage: + * import { convertToPng, convertJsonToPng } from './excalidraw_to_png.mjs'; + * await convertToPng('input.excalidraw', 'output.png'); + * await convertJsonToPng(excalidrawJson, 'output.png'); + */ + +import fs from 'fs/promises'; +import path from 'path'; +import puppeteer from 'puppeteer'; + +// Shared browser instance for performance +let browserInstance = null; + +/** + * Get or create a shared browser instance + */ +async function getBrowser() { + if (!browserInstance) { + browserInstance = await puppeteer.launch({ + headless: true, + args: [ + '--no-sandbox', + '--disable-setuid-sandbox', + '--disable-crash-reporter', // Disables crash reporting popups + '--disable-breakpad', // Disables crash dump collection + '--disable-dev-shm-usage', // Uses /tmp instead of /dev/shm + '--disable-extensions', // No extensions needed + '--disable-background-networking', // Reduces unnecessary network calls + '--no-first-run', // Skips first-run dialogs + ] + }); + } + return browserInstance; +} + +/** + * Close the shared browser instance + */ +export async function closeBrowser() { + if (browserInstance) { + await browserInstance.close(); + browserInstance = null; + } +} + +/** + * Calculate bounding box from Excalidraw elements + * This is MORE RELIABLE than trusting Kroki's SVG viewBox + */ +function calculateBoundingBox(elements, padding = 40) { + if (!elements || elements.length === 0) { + return { width: 800, height: 600 }; + } + + let minX = Infinity, minY = Infinity; + let maxX = -Infinity, maxY = -Infinity; + + for (const el of elements) { + if (typeof el.x !== 'number' || typeof el.y !== 'number') continue; + + const elWidth = el.width || 0; + const elHeight = el.height || 0; + + minX = Math.min(minX, el.x); + minY = Math.min(minY, el.y); + maxX = Math.max(maxX, el.x + elWidth); + maxY = Math.max(maxY, el.y + elHeight); + + // For arrows, also consider the points 
+ if (el.points && Array.isArray(el.points)) { + for (const pt of el.points) { + if (Array.isArray(pt) && pt.length >= 2) { + minX = Math.min(minX, el.x + pt[0]); + minY = Math.min(minY, el.y + pt[1]); + maxX = Math.max(maxX, el.x + pt[0]); + maxY = Math.max(maxY, el.y + pt[1]); + } + } + } + } + + // Handle edge cases + if (!isFinite(minX)) minX = 0; + if (!isFinite(minY)) minY = 0; + if (!isFinite(maxX)) maxX = 800; + if (!isFinite(maxY)) maxY = 600; + + const width = Math.ceil(maxX - minX + padding * 2); + const height = Math.ceil(maxY - minY + padding * 2); + + return { + width: Math.max(width, 200), // Minimum 200px wide + height: Math.max(height, 150), // Minimum 150px tall + minX, + minY + }; +} + +/** + * Convert Excalidraw JSON content to PNG + * @param {string|object} content - Excalidraw JSON string or object + * @param {string} outputFile - Output PNG file path + * @param {object} options - Optional settings + * @param {boolean} options.silent - Suppress console output + * @returns {Promise} PNG buffer + */ +export async function convertJsonToPng(content, outputFile, options = {}) { + const jsonObj = typeof content === 'string' ? JSON.parse(content) : content; + const jsonContent = typeof content === 'string' ? 
content : JSON.stringify(content); + + // Calculate bounding box from elements BEFORE calling Kroki + // This is the key fix - we don't trust Kroki's viewBox + const bbox = calculateBoundingBox(jsonObj.elements); + + if (!options.silent) { + console.log(`Calculated bounds: ${bbox.width}x${bbox.height}`); + console.log('Requesting SVG from Kroki.io...'); + } + + // Kroki.io API - get SVG (only format supported for excalidraw) + const response = await fetch('https://kroki.io/excalidraw/svg', { + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + body: JSON.stringify({ + diagram_source: jsonContent, + }), + }); + + if (!response.ok) { + const errorText = await response.text(); + throw new Error(`Kroki API error: ${response.status} - ${errorText}`); + } + + let svgContent = await response.text(); + + if (!options.silent) { + console.log('Converting SVG to PNG with Puppeteer...'); + } + + // Override the SVG's viewBox and dimensions with our calculated values + // This fixes Kroki's broken viewBox calculations + const newViewBox = `0 0 ${bbox.width} ${bbox.height}`; + svgContent = svgContent.replace(/viewBox="[^"]*"/, `viewBox="${newViewBox}"`); + svgContent = svgContent.replace(/width="[^"]*"/, `width="${bbox.width}"`); + svgContent = svgContent.replace(/height="[^"]*"/, `height="${bbox.height}"`); + + // Scale up for better quality (2x) + const scale = 2; + const width = bbox.width; + const height = bbox.height; + + // Create HTML wrapper with the SVG + const html = ` + + + + + + + ${svgContent} + + + `; + + // Use Puppeteer to render + const browser = await getBrowser(); + const page = await browser.newPage(); + + await page.setViewport({ + width: width, + height: height, + deviceScaleFactor: scale + }); + + await page.setContent(html, { waitUntil: 'networkidle0' }); + + // Wait a bit for fonts to load + await new Promise(r => setTimeout(r, 100)); + + // Take screenshot of the full viewport + const pngBuffer = await page.screenshot({ + type: 
/**
 * Convert an Excalidraw file to PNG.
 *
 * When no usable output path is supplied, the PNG is written next to the
 * input: a trailing `.excalidraw` extension is swapped for `.png`, and any
 * other file name simply gets `.png` appended.
 *
 * @param {string} inputFile - Input .excalidraw file path
 * @param {string|null} [outputFile=null] - Output PNG file path (derived from inputFile when omitted)
 * @param {object} [options={}] - Optional settings, forwarded to convertJsonToPng
 * @returns {Promise} PNG buffer
 */
export async function convertToPng(inputFile, outputFile = null, options = {}) {
  const content = await fs.readFile(inputFile, 'utf-8');

  let output = outputFile;

  // Never let the PNG land on top of the source file. The previous
  // inputFile.replace('.excalidraw', '.png') returned the input path unchanged
  // when the extension was missing (and rewrote the first match even when it
  // sat in a directory name), so the source JSON could be overwritten.
  if (!output || path.resolve(output) === path.resolve(inputFile)) {
    const { root, dir, base, name, ext } = path.parse(inputFile);
    const stem = ext === '.excalidraw' ? name : base;
    output = path.format({ root, dir, name: stem, ext: '.png' });
  }

  return convertJsonToPng(content, output, options);
}
inputFile.replace('.excalidraw', '.png') : null); + + if (!inputFile) { + console.error('Usage: node excalidraw_to_png.mjs input.excalidraw [output.png]'); + console.error(''); + console.error('Converts Excalidraw JSON to PNG using Kroki.io and Puppeteer.'); + console.error('Properly renders embedded Virgil (hand-drawn) font.'); + process.exit(1); + } + + convertToPng(inputFile, outputFile) + .then(() => closeBrowser()) + .catch(err => { + console.error('Error:', err.message); + closeBrowser(); + process.exit(1); + }); +} + +export default { convertToPng, convertJsonToPng, closeBrowser }; diff --git a/.cursor/utilities/setup.sh b/.cursor/utilities/setup.sh new file mode 100755 index 0000000..be78c15 --- /dev/null +++ b/.cursor/utilities/setup.sh @@ -0,0 +1,203 @@ +#!/bin/bash +# Setup script for DataCamp Curriculum Assistant +# Run this once to install all dependencies +# +# Creates a project-local virtual environment (.venv/) to ensure +# consistent Python package management across all machines. + +set -e # Exit on error + +echo "==========================================" +echo "DataCamp Curriculum Assistant Setup" +echo "==========================================" +echo "" + +# Get the directory where this script is located +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PROJECT_DIR="$SCRIPT_DIR/../.." +VENV_DIR="$PROJECT_DIR/.venv" + +# ============================================================================= +# PYTHON VIRTUAL ENVIRONMENT SETUP +# ============================================================================= + +echo "πŸ“¦ Setting up Python environment..." 
+echo "" + +# Find best Python (prefer 3.10+ for full compatibility) +PYTHON_CMD="" +for cmd in python3.12 python3.11 python3.10 python3; do + if command -v $cmd &> /dev/null; then + VERSION=$($cmd -c 'import sys; print(sys.version_info.minor)') + if [ "$VERSION" -ge 10 ] 2>/dev/null; then + PYTHON_CMD=$cmd + break + elif [ -z "$PYTHON_CMD" ]; then + PYTHON_CMD=$cmd # Fallback to any python3 + fi + fi +done + +if [ -z "$PYTHON_CMD" ]; then + echo "❌ Error: Python 3 is required but not installed." + echo " Install from: https://www.python.org/downloads/" + exit 1 +fi + +PYTHON_VERSION=$($PYTHON_CMD --version) +echo " Using: $PYTHON_CMD ($PYTHON_VERSION)" + +# Check version and warn if too old +MINOR_VERSION=$($PYTHON_CMD -c 'import sys; print(sys.version_info.minor)') +if [ "$MINOR_VERSION" -lt 10 ]; then + echo " ⚠️ Warning: Python 3.10+ recommended for full functionality" + echo " Some packages (datalab-python-sdk) require Python 3.10+" + echo " PDF conversion may not be available." + echo "" +fi + +# Create virtual environment if it doesn't exist +if [ ! -d "$VENV_DIR" ]; then + echo " Creating virtual environment at .venv/..." + $PYTHON_CMD -m venv "$VENV_DIR" + echo " βœ… Virtual environment created" +else + echo " βœ… Virtual environment exists at .venv/" +fi + +# Activate virtual environment +source "$VENV_DIR/bin/activate" +echo " βœ… Virtual environment activated" + +# Upgrade pip +echo " Upgrading pip..." +pip install --upgrade pip -q + +# Install Python dependencies +if [ -f "$SCRIPT_DIR/../requirements.txt" ]; then + echo " Installing Python packages..." 
+ + # Install core packages (work on Python 3.9+) + pip install -q trafilatura youtube-transcript-api python-dotenv brotli 2>/dev/null && \ + echo " βœ… Core packages installed (trafilatura, youtube-transcript-api, etc.)" + + # Install packages requiring Python 3.10+ (may fail on older Python) + if [ "$MINOR_VERSION" -ge 10 ]; then + pip install -q datalab-python-sdk docling 2>/dev/null && \ + echo " βœ… Advanced packages installed (datalab-python-sdk, docling)" + else + echo " ⚠️ Skipped: datalab-python-sdk, docling (require Python 3.10+)" + fi +fi + +echo "" + +# ============================================================================= +# NODE.JS SETUP +# ============================================================================= + +echo "πŸ“¦ Setting up Node.js dependencies..." +echo "" + +# Check if Node.js is available +if ! command -v node &> /dev/null; then + echo "❌ Error: Node.js is required but not installed." + echo " Install from: https://nodejs.org/" + exit 1 +fi + +echo " Node version: $(node --version)" +echo " npm version: $(npm --version)" + +# Install Node.js dependencies +cd "$PROJECT_DIR" +if [ -f "package.json" ]; then + echo " Installing Node.js packages..." + npm install --silent + echo " βœ… Node.js dependencies installed (puppeteer, sharp)" +fi + +echo "" + +# ============================================================================= +# VERIFY INSTALLATION +# ============================================================================= + +echo "πŸ” Verifying installation..." 
+echo "" + +# Check Python converters +if [ -f "$SCRIPT_DIR/converters/convert_pdf.py" ]; then + echo " βœ… Python converters found" +else + echo " ⚠️ Python converters not found at $SCRIPT_DIR/converters/" +fi + +# Check Excalidraw tools +if [ -f "$SCRIPT_DIR/excalidraw/from_script.mjs" ]; then + echo " βœ… Excalidraw tools found" +else + echo " ⚠️ Excalidraw tools not found at $SCRIPT_DIR/excalidraw/" +fi + +# Test puppeteer installation +cd "$PROJECT_DIR" +if node -e "require('puppeteer')" 2>/dev/null; then + echo " βœ… Puppeteer installed correctly" +else + echo " ⚠️ Puppeteer not installed - run 'npm install' in project root" +fi + +echo "" + +# ============================================================================= +# SUMMARY +# ============================================================================= + +echo "==========================================" +echo "Setup Complete!" +echo "==========================================" +echo "" +echo "Folder Structure:" +echo "" +echo " .cursor/utilities/" +echo " β”œβ”€β”€ converters/ # Python content converters" +echo " β”‚ β”œβ”€β”€ convert_pdf.py" +echo " β”‚ β”œβ”€β”€ convert_html.py" +echo " β”‚ β”œβ”€β”€ convert_webpage.py" +echo " β”‚ └── convert_youtube.py" +echo " β”œβ”€β”€ excalidraw/ # Diagram generation (Node.js)" +echo " β”‚ β”œβ”€β”€ from_script.mjs # Main CLI tool" +echo " β”‚ β”œβ”€β”€ templates.mjs # Diagram templates" +echo " β”‚ └── to_png.mjs # PNG conversion" +echo " └── setup.sh # This script" +echo "" +echo "Available Commands:" +echo "" +echo " πŸ“ Content Converters:" +echo " python $SCRIPT_DIR/converters/convert_pdf.py " +echo " python $SCRIPT_DIR/converters/convert_html.py " +echo " python $SCRIPT_DIR/converters/convert_youtube.py " +echo " python $SCRIPT_DIR/converters/convert_webpage.py " +echo "" +echo " 🎨 Excalidraw Diagram Generator:" +echo " node $SCRIPT_DIR/excalidraw/from_script.mjs --chapter N --lesson M" +echo "" +echo " Or use npm script (from project root):" +echo " 
npm run excalidraw -- --chapter N --lesson M --update" +echo "" +echo "Available Templates:" +echo " flowchart, cycle, radial, hierarchy, layers," +echo " timeline (year|desc, // breaks), funnel, mindmap," +echo " matrix, comparison, architecture" +echo "" +echo "Notes:" +echo " - PDF conversion requires DATALAB_API_KEY in .cursor/.env" +echo " - Asset upload requires DATACAMP_DCT and DATACAMP_REPO in .cursor/.env" +echo " - Diagrams use Virgil (hand-drawn) font with transparent backgrounds" +echo " - Use --update flag to auto-replace placeholders in source file" +echo " - All templates support dynamic text wrapping for long labels" +echo "" +echo "Verify Setup:" +echo " Run .cursor/utilities/verify_setup.sh to test all services" +echo "" diff --git a/.cursor/utilities/upload_assets.py b/.cursor/utilities/upload_assets.py new file mode 100644 index 0000000..743c21a --- /dev/null +++ b/.cursor/utilities/upload_assets.py @@ -0,0 +1,405 @@ +#!/usr/bin/env python3 +""" +DataCamp Asset Upload Script + +Uploads local images from markdown files to DataCamp's asset system +and updates the markdown with public URLs. 
def load_env():
    """Load variables from ``.cursor/.env`` into the environment, if the file exists."""
    # This script lives in .cursor/utilities/, so two levels up is .cursor/.
    dotenv_file = Path(__file__).parents[1] / '.env'
    if not dotenv_file.exists():
        return
    load_dotenv(dotenv_file)


def load_dct_cookie():
    """
    Return the DataCamp ``_dct`` cookie configured in ``DATACAMP_DCT``.

    The ``.cursor/.env`` file is loaded first, then the variable is read from
    the process environment.

    Returns:
        str: The DCT cookie value

    Raises:
        ValueError: If DATACAMP_DCT is unset or empty
    """
    load_env()

    cookie = os.environ.get('DATACAMP_DCT')
    if cookie:
        return cookie

    raise ValueError(
        "DATACAMP_DCT environment variable not set.\n"
        "Add DATACAMP_DCT=your_cookie_value to .cursor/.env"
    )


def load_repo():
    """
    Return the repository configured in ``DATACAMP_REPO``.

    The ``.cursor/.env`` file is loaded first, then the variable is read from
    the process environment.

    Returns:
        str: The repository URL or ID, or None if not set
    """
    load_env()
    return os.environ.get('DATACAMP_REPO')
def is_github_url(url_or_id):
    """
    Check if the input is a GitHub URL or a repository ID.

    A value counts as a GitHub URL when it starts with ``http://`` or
    ``https://`` and contains ``github.com``. Both tests ignore letter case,
    because URL schemes and host names are case-insensitive; previously
    ``HTTPS://GitHub.com/org/repo`` was misclassified as a repository ID.

    Args:
        url_or_id: GitHub URL or DataCamp repository ID

    Returns:
        bool: True for a GitHub URL, False for anything else (including
        non-string values such as a numeric repository ID)
    """
    if not isinstance(url_or_id, str):
        return False

    candidate = url_or_id.lower()
    return candidate.startswith(('http://', 'https://')) and 'github.com' in candidate
def find_local_images(markdown_content):
    """
    Find all local image references in markdown content

    Matches patterns like:
    - ![alt](images/lesson_1_1/lesson_1_1_image_1_description.png)
    - ![flowchart: ...](images/lesson_1_1/lesson_1_1_image_1_flowchart_a_b_c.png)

    References that already point at a remote location are skipped: any
    ``http://`` / ``https://`` URL (compared case-insensitively, like the file
    extension) and protocol-relative ``//host/...`` URLs. An identical
    reference that appears several times is reported once, in order of first
    appearance, so a caller that uploads each entry does not upload the same
    file twice.

    Args:
        markdown_content (str): The markdown content to parse

    Returns:
        list: List of tuples (full_match, alt_text, local_path)
    """
    # Markdown image syntax whose target ends in a known image extension.
    pattern = r'!\[([^\]]*)\]\(([^)]+\.(?:png|jpg|jpeg|gif|svg))\)'
    remote_prefixes = ('http://', 'https://', '//')

    matches = []
    seen = set()

    for match in re.finditer(pattern, markdown_content, re.IGNORECASE):
        full_match, alt_text, path = match.group(0, 1, 2)

        # Skip if it's already a URL
        if path.lower().startswith(remote_prefixes):
            continue

        # Identical markdown snippets are replaced together by callers, so one
        # entry per distinct snippet is enough.
        if full_match in seen:
            continue
        seen.add(full_match)

        matches.append((full_match, alt_text, path))

    return matches
+ # Try from workspace root + image_path = Path(local_path) + + if not image_path.exists(): + print(f" ⚠️ Skipping (not found): {local_path}") + continue + + output_filename = image_path.name + print(f" πŸ“€ Uploading: {local_path}") + + try: + result = upload_asset( + str(image_path), + output_filename, + repository_id, + dct + ) + + public_url = result.get('public_url') + if public_url: + # Ensure URL has https:// prefix + if not public_url.startswith(('http://', 'https://')): + public_url = f"https://{public_url}" + + # Replace local path with public URL + new_image = f"![{alt_text}]({public_url})" + updated_content = updated_content.replace(full_match, new_image) + + results.append({ + 'local_path': local_path, + 'public_url': public_url, + 'asset_id': result.get('id') + }) + + print(f" βœ… Uploaded: {public_url}") + else: + print(f" ❌ No public URL returned") + + except Exception as e: + print(f" ❌ Failed: {str(e)}") + + if update and results: + markdown_path.write_text(updated_content, encoding='utf-8') + print(f"\nβœ… Updated {markdown_path} with {len(results)} new URL(s)") + elif results: + print(f"\nπŸ“‹ Dry run complete. 
Use --update to write changes.") + + return updated_content, results + + +def main(): + """Main function to run the script""" + parser = argparse.ArgumentParser( + description='Upload local images from markdown to DataCamp assets', + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Upload images using DATACAMP_REPO from .env + python upload_assets.py slides/chapter_1.md --update + + # Override repo with explicit URL + python upload_assets.py slides/chapter_1.md --repo https://github.com/datacamp/courses-example --update + + # Preview what would be uploaded (dry run) + python upload_assets.py slides/chapter_1.md + + # Upload a single image file + python upload_assets.py --file images/diagram.png --name my-diagram.png +""" + ) + + parser.add_argument( + 'markdown_file', + nargs='?', + help='Markdown file to process for local images' + ) + + parser.add_argument( + '--repo', '-r', + help='GitHub URL or DataCamp repository ID (default: DATACAMP_REPO from .env)' + ) + + parser.add_argument( + '--update', '-u', + action='store_true', + help='Update the markdown file with uploaded URLs' + ) + + parser.add_argument( + '--file', '-f', + help='Single file to upload (instead of processing markdown)' + ) + + parser.add_argument( + '--name', '-n', + help='Output filename for single file upload (default: original filename)' + ) + + args = parser.parse_args() + + if not args.markdown_file and not args.file: + parser.error("Either markdown_file or --file must be provided") + + try: + # Load DCT cookie + print("Loading credentials...") + dct = load_dct_cookie() + + # Resolve repository - from args or env + repo = args.repo or load_repo() + if not repo: + print("❌ Repository not specified.") + print(" Either provide --repo or set DATACAMP_REPO in .cursor/.env") + sys.exit(1) + + # Resolve repository ID + if is_github_url(repo): + print(f"Looking up repository: {repo}") + repository_id = find_repository_by_github_url(repo, dct) + print(f"Found 
repository ID: {repository_id}\n") + else: + repository_id = repo + print(f"Using repository ID: {repository_id}\n") + + if args.file: + # Single file upload mode + file_path = args.file + output_name = args.name or Path(file_path).name + + print(f"Uploading: {file_path}") + result = upload_asset(file_path, output_name, repository_id, dct) + + print("\nβœ… Upload successful!") + print(f"Asset ID: {result.get('id')}") + print(f"Public URL: {result.get('public_url')}") + + else: + # Markdown processing mode + print(f"Processing: {args.markdown_file}") + process_markdown_file( + args.markdown_file, + repository_id, + dct, + update=args.update + ) + + except ValueError as e: + print(f"❌ Configuration error: {e}") + sys.exit(1) + except FileNotFoundError as e: + print(f"❌ File error: {e}") + sys.exit(1) + except requests.exceptions.HTTPError as e: + print(f"❌ API error: {e}") + if hasattr(e, 'response') and e.response is not None: + print(f" Response: {e.response.text[:200]}") + sys.exit(1) + except Exception as e: + print(f"❌ Unexpected error: {e}") + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/.cursor/utilities/verify_setup.sh b/.cursor/utilities/verify_setup.sh new file mode 100755 index 0000000..bd735c3 --- /dev/null +++ b/.cursor/utilities/verify_setup.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# Verification script for DataCamp Curriculum Assistant +# Run this to verify all services are working correctly +# +# Uses the project's virtual environment (.venv/) for Python checks. + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' + +# Counters +PASSED=0 +FAILED=0 +SKIPPED=0 + +# Get directories +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +PROJECT_DIR="$SCRIPT_DIR/../.." 
+VENV_DIR="$PROJECT_DIR/.venv" + +# Use virtual environment Python if available +if [ -f "$VENV_DIR/bin/python" ]; then + PYTHON="$VENV_DIR/bin/python" + source "$VENV_DIR/bin/activate" 2>/dev/null +else + PYTHON="python3" +fi + +# Temp directory for test outputs (cleaned up on exit) +TEST_DIR="/tmp/datacamp_verify_$$" +mkdir -p "$TEST_DIR" +trap "rm -rf $TEST_DIR" EXIT + +echo "" +echo "==========================================" +echo " DataCamp Curriculum Assistant" +echo " Setup Verification" +echo "==========================================" +echo "" + +pass() { echo -e " ${GREEN}βœ“${NC} $1"; ((PASSED++)); } +fail() { echo -e " ${RED}βœ—${NC} $1"; ((FAILED++)); } +skip() { echo -e " ${YELLOW}β—‹${NC} $1 (skipped)"; ((SKIPPED++)); } + +# 1. DEPENDENCIES +echo -e "${BLUE}1. Checking Dependencies${NC}" +echo "" +if [ -f "$VENV_DIR/bin/python" ]; then + pass "Python venv ($($PYTHON --version 2>&1))" +elif command -v python3 &>/dev/null; then + pass "Python 3 ($(python3 --version 2>&1)) - Note: Run setup.sh to create .venv" +else + fail "Python 3 not found" +fi +if command -v node &>/dev/null; then + pass "Node.js ($(node --version))" +else + fail "Node.js not found" +fi +if command -v npm &>/dev/null; then + pass "npm (v$(npm --version))" +else + fail "npm not found" +fi +echo "" + +# 2. PYTHON PACKAGES +echo -e "${BLUE}2. Checking Python Packages${NC}" +echo "" +if [ ! 
-f "$VENV_DIR/bin/python" ]; then + fail "Virtual environment not found - run setup.sh first" +else + $PYTHON -c "import datalab_sdk" 2>/dev/null && pass "datalab-python-sdk" || fail "datalab-python-sdk" + $PYTHON -c "import docling" 2>/dev/null && pass "docling" || fail "docling" + $PYTHON -c "import youtube_transcript_api" 2>/dev/null && pass "youtube-transcript-api" || fail "youtube-transcript-api" + $PYTHON -c "import trafilatura" 2>/dev/null && pass "trafilatura" || fail "trafilatura" + $PYTHON -c "import dotenv" 2>/dev/null && pass "python-dotenv" || fail "python-dotenv" + $PYTHON -c "import brotli" 2>/dev/null && pass "brotli" || fail "brotli" +fi +echo "" + +# 3. NODE.JS PACKAGES +echo -e "${BLUE}3. Checking Node.js Packages${NC}" +echo "" +cd "$PROJECT_DIR" +node -e "require('puppeteer')" 2>/dev/null && pass "puppeteer" || fail "puppeteer (run: npm install)" +node -e "require('sharp')" 2>/dev/null && pass "sharp" || fail "sharp (run: npm install)" +echo "" + +# 4. API KEYS & CONFIGURATION +echo -e "${BLUE}4. Checking API Configuration${NC}" +echo "" +ENV_FILE="$SCRIPT_DIR/../.env" +HAS_API_KEY=false +HAS_DCT=false +HAS_REPO=false + +if [ -f "$ENV_FILE" ]; then + if grep -q "DATALAB_API_KEY=" "$ENV_FILE" 2>/dev/null; then + pass "DATALAB_API_KEY configured (PDF conversion)" + HAS_API_KEY=true + else + skip "DATALAB_API_KEY not set (PDF conversion disabled)" + fi + + if grep -q "DATACAMP_DCT=" "$ENV_FILE" 2>/dev/null; then + pass "DATACAMP_DCT configured (asset upload)" + HAS_DCT=true + else + fail "DATACAMP_DCT not set (required for asset upload)" + fi + + if grep -q "DATACAMP_REPO=" "$ENV_FILE" 2>/dev/null; then + pass "DATACAMP_REPO configured (asset upload)" + HAS_REPO=true + else + fail "DATACAMP_REPO not set (required for asset upload)" + fi +else + fail ".cursor/.env file not found" +fi +echo "" + +# 5. CONVERTER TESTS +echo -e "${BLUE}5. Testing Converters${NC}" +echo "" + +if [ ! 
-f "$VENV_DIR/bin/python" ]; then + skip "Converters (run setup.sh first)" +else + echo " Testing webpage converter..." + if $PYTHON "$SCRIPT_DIR/converters/convert_webpage.py" "https://example.com" -o "$TEST_DIR/web.md" 2>/dev/null && [ -s "$TEST_DIR/web.md" ]; then + pass "Webpage β†’ Markdown" + else + fail "Webpage β†’ Markdown" + fi + + echo " Testing YouTube converter..." + if $PYTHON "$SCRIPT_DIR/converters/convert_youtube.py" "https://www.youtube.com/watch?v=gXwewPgLmkE" -o "$TEST_DIR/yt.md" 2>/dev/null && [ -s "$TEST_DIR/yt.md" ]; then + pass "YouTube β†’ Transcript" + else + fail "YouTube β†’ Transcript" + fi + + echo " Testing PDF converter..." + if [ "$HAS_API_KEY" = true ]; then + $PYTHON -c "import sys; sys.path.insert(0,'$SCRIPT_DIR/converters'); from convert_pdf import convert_pdf_to_markdown" 2>/dev/null && pass "PDF converter ready" || fail "PDF converter" + else + skip "PDF β†’ Markdown (no API key)" + fi +fi +echo "" + +# 6. EXCALIDRAW +echo -e "${BLUE}6. Testing Excalidraw Diagram Generation${NC}" +echo "" +cat > "$TEST_DIR/test.md" << 'EOF' +## Test +`@part1` +![excalidraw: flowchart: A, B, C]() +EOF + +echo " Generating test diagram..." +cd "$PROJECT_DIR" +if node .cursor/utilities/excalidraw/from_script.mjs "$TEST_DIR/test.md" --chapter 999 --lesson 999 --output "$TEST_DIR" 2>/dev/null; then + # Find any generated PNG (new naming: lesson_999_999_image_1_*.png) + PNG_FILE=$(find "$TEST_DIR" -name "lesson_999_999_image_1_*.png" -type f 2>/dev/null | head -1) + if [ -n "$PNG_FILE" ] && [ -f "$PNG_FILE" ]; then + SIZE=$(wc -c < "$PNG_FILE") + [ "$SIZE" -gt 1000 ] && pass "Excalidraw PNG (${SIZE} bytes)" || fail "Excalidraw PNG (too small)" + else + fail "Excalidraw PNG (no output)" + fi +else + fail "Excalidraw PNG (command failed)" +fi +echo "" + +# 7. FILE STRUCTURE +echo -e "${BLUE}7. 
Checking File Structure${NC}" +echo "" +[ -d "$SCRIPT_DIR/converters" ] && pass "converters/" || fail "converters/" +[ -d "$SCRIPT_DIR/excalidraw" ] && pass "excalidraw/" || fail "excalidraw/" +[ -f "$SCRIPT_DIR/excalidraw/from_script.mjs" ] && pass "from_script.mjs" || fail "from_script.mjs" +[ -f "$SCRIPT_DIR/excalidraw/templates.mjs" ] && pass "templates.mjs" || fail "templates.mjs" +echo "" + +# SUMMARY +echo "==========================================" +echo " Summary" +echo "==========================================" +echo "" +echo -e " ${GREEN}Passed:${NC} $PASSED" +echo -e " ${RED}Failed:${NC} $FAILED" +echo -e " ${YELLOW}Skipped:${NC} $SKIPPED" +echo "" +if [ $FAILED -eq 0 ]; then + echo -e " ${GREEN}All checks passed! Ready for content creation.${NC}" +elif [ "$HAS_DCT" = false ] || [ "$HAS_REPO" = false ]; then + echo -e " ${RED}Missing required configuration.${NC}" + echo " Add to .cursor/.env:" + [ "$HAS_DCT" = false ] && echo " DATACAMP_DCT=your_cookie_value" + [ "$HAS_REPO" = false ] && echo " DATACAMP_REPO=https://github.com/datacamp-content/courses-..." +else + echo -e " ${RED}Some checks failed.${NC}" + echo " Fix: Run .cursor/utilities/setup.sh" +fi +echo "" +echo "==========================================" +exit $FAILED diff --git a/.cursor/validators/mc_validator.py b/.cursor/validators/mc_validator.py new file mode 100644 index 0000000..5b4914f --- /dev/null +++ b/.cursor/validators/mc_validator.py @@ -0,0 +1,518 @@ +#!/usr/bin/env python3 +""" +MultipleChoiceChallenge Validator + +Validates MultipleChoiceChallenge item markdown structure. 
@dataclass
class ItemValidationResult:
    """Result of validating a single item.

    The list fields use ``default_factory=list`` so every instance starts
    with its own empty list instead of sharing one mutable default.
    """
    valid: bool
    title: str
    item_number: int
    correct_position: int  # 1-4, or 0 if not found
    errors: List[str] = field(default_factory=list)
    warnings: List[str] = field(default_factory=list)
    # presumably the character count of each option, in option order - TODO confirm
    option_lengths: List[int] = field(default_factory=list)


@dataclass
class DocumentValidationResult:
    """Result of validating the entire document.

    Only ``valid`` and ``message`` are required; every other field has a
    default, and the list fields each get a fresh empty list per instance.
    """
    valid: bool
    message: str
    document_title: Optional[str] = None
    item_count: int = 0
    # One ItemValidationResult per item in the document.
    items: List[ItemValidationResult] = field(default_factory=list)
    document_errors: List[str] = field(default_factory=list)
    document_warnings: List[str] = field(default_factory=list)
def parse_single_item(content: str) -> Tuple[dict, List[str]]:
    """
    Parse a single MultipleChoiceChallenge item.

    Extracts the ``## Title`` heading, the ```yaml metadata block, the
    `@assignment1` stem and the `@options1` list. The correct option is the
    one wrapped in ``[...]``; its brackets are removed in the parsed output.

    Args:
        content: Markdown text of one item.

    Returns:
        Tuple of (parsed_data, errors). ``parsed_data`` has the keys
        ``title``, ``yaml_block``, ``assignment``, ``options``,
        ``correct_index`` (0-based, -1 when no option is marked) and
        ``correct_answer``. ``errors`` lists every missing section and, when
        more than one option is marked as correct, one error naming them.
    """
    errors = []

    parsed = {
        "title": None,
        "yaml_block": None,
        "assignment": None,
        "options": [],
        "correct_index": -1,  # 0-based index of correct answer
        "correct_answer": None,
    }

    # Extract title from heading (## Title format)
    title_match = re.search(r'^##\s+(.+?)(?:\n|$)', content, re.MULTILINE)
    if title_match:
        parsed["title"] = title_match.group(1).strip()
    else:
        errors.append("Missing item heading (must have '## Title')")

    # Extract YAML metadata block
    yaml_match = re.search(r'```yaml\s*\n(.*?)```', content, re.DOTALL)
    if yaml_match:
        parsed["yaml_block"] = yaml_match.group(1).strip()
    else:
        errors.append("Missing ```yaml metadata block")

    # Extract assignment (stem/question): everything up to the next `@ tag
    assignment_match = re.search(r'`@assignment1`\s*\n(.*?)(?=`@|\Z)', content, re.DOTALL)
    if assignment_match:
        parsed["assignment"] = assignment_match.group(1).strip()
    else:
        errors.append("Missing `@assignment1` section")

    # Extract options: everything up to the next `@ tag, a --- separator, or the end
    options_match = re.search(r'`@options1`\s*\n(.*?)(?=`@|\Z|---)', content, re.DOTALL)
    if not options_match:
        errors.append("Missing `@options1` section")
        return parsed, errors

    # Parse options - look for lines starting with -
    option_lines = re.findall(r'^-\s*(.+)$', options_match.group(1).strip(), re.MULTILINE)

    marked_positions = []  # 1-based positions of every option wrapped in [...]
    for index, option in enumerate(option_lines):
        option = option.strip()
        # The correct answer is wrapped in [...]
        if option.startswith('[') and option.endswith(']'):
            option = option[1:-1]  # Remove brackets
            marked_positions.append(index + 1)
            parsed["correct_index"] = index
            parsed["correct_answer"] = option
        parsed["options"].append(option)

    # Previously a second marked option silently replaced the first one.
    # correct_index still points at the last marked option, but the ambiguity
    # is now reported instead of being hidden.
    if len(marked_positions) > 1:
        positions = ", ".join(str(position) for position in marked_positions)
        errors.append(
            f"Multiple correct answers marked with [...] brackets (options {positions}); "
            "exactly one is allowed"
        )

    return parsed, errors
"""Validate the YAML metadata block for MultipleChoiceChallenge.""" + errors = [] + warnings = [] + + if not yaml_content: + errors.append("YAML block is empty") + return errors, warnings + + # Check for required fields + required_fields = [ + ("type:", "type"), + ("key:", "key"), + ("unit:", "unit"), + ("subskill:", "subskill"), + ("initial_difficulty:", "initial_difficulty"), + ("item_writer_id:", "item_writer_id"), + ] + + for field_pattern, field_name in required_fields: + if field_pattern not in yaml_content: + errors.append(f"Missing '{field_name}' in YAML block") + + # Validate type is MultipleChoiceChallenge + if "type:" in yaml_content and "MultipleChoiceChallenge" not in yaml_content: + errors.append("type must be 'MultipleChoiceChallenge'") + + # Validate item_writer_id is 999999999 + writer_match = re.search(r"item_writer_id:\s*['\"]?(\d+)['\"]?", yaml_content) + if writer_match: + if writer_match.group(1) != "999999999": + warnings.append(f"item_writer_id should be '999999999', found '{writer_match.group(1)}'") + + # Check unit format (should be kebab-case, 2-4 words) + unit_match = re.search(r'unit:\s*([^\n]+)', yaml_content) + if unit_match: + unit_value = unit_match.group(1).strip() + if not re.match(r'^[a-z0-9]+(-[a-z0-9]+){1,3}$', unit_value): + warnings.append(f"unit '{unit_value}' should be kebab-case (e.g., 'container-basics')") + + return errors, warnings + + +def validate_options(options: List[str], correct_index: int) -> Tuple[List[str], List[str]]: + """Validate option structure and length rules.""" + errors = [] + warnings = [] + + # Check option count + if len(options) != 4: + errors.append(f"Must have exactly 4 options, found {len(options)}") + return errors, warnings + + # Check for correct answer + if correct_index < 0: + errors.append("No correct answer marked with [...] 
brackets") + return errors, warnings + + # Calculate lengths + lengths = [len(opt) for opt in options] + correct_length = lengths[correct_index] + + # Check Β±8 character rule + min_len = min(lengths) + max_len = max(lengths) + + if max_len - min_len > 16: # Β±8 means max difference of 16 + errors.append(f"Option lengths vary too much: {min_len}-{max_len} chars (max allowed difference: 16)") + + # Check each option is within Β±8 of others + for i, length in enumerate(lengths): + for j, other_length in enumerate(lengths): + if i != j and abs(length - other_length) > 8: + warnings.append(f"Option {i+1} ({length} chars) and option {j+1} ({other_length} chars) differ by more than 8 characters") + break # Only warn once per pair + + # Check correct answer is not longest + if correct_length > max(lengths[i] for i in range(len(lengths)) if i != correct_index): + # Correct is longest + if correct_length > min(lengths[i] for i in range(len(lengths)) if i != correct_index): + warnings.append(f"Correct answer ({correct_length} chars) is longer than some distractors") + + # Check for "All of the above" / "None of the above" + for opt in options: + opt_lower = opt.lower() + if 'all of the above' in opt_lower or 'none of the above' in opt_lower: + errors.append("Options should not include 'All of the above' or 'None of the above'") + break + + return errors, warnings + + +def validate_stem(assignment: str) -> Tuple[List[str], List[str]]: + """Validate the stem/question for clarity.""" + errors = [] + warnings = [] + + if not assignment: + return errors, warnings + + assignment_lower = assignment.lower() + + # Check for vague stem patterns + for pattern in VAGUE_STEM_PATTERNS: + if re.search(pattern, assignment_lower): + warnings.append(f"Vague stem detected: '{pattern.replace(chr(92), '').replace('b', '')}' - stems should stand alone without options") + break + + # Check minimum length (should have context + question) + if len(assignment) < 50: + warnings.append("Stem seems too 
def validate_rotation(items: "List[ItemValidationResult]") -> List[str]:
    """Check the rotation pattern of correct-answer positions across items.

    Only items with a marked correct answer (``correct_position > 0``) take
    part in the checks.

    Args:
        items: Per-item results; ``item_number`` and ``correct_position``
            (1-based, 0 when nothing is marked) are read from each.

    Returns:
        Warnings for the same position used three times in a row and, once at
        least 8 items have a marked answer, for any of positions 1-4 that is
        never used. Empty when fewer than 3 items are given.
    """
    warnings = []

    if len(items) < 3:
        return warnings

    # Keep the real item number next to each position: after filtering out
    # unmarked items, list indices no longer match item numbers.
    marked = [
        (item.item_number, item.correct_position)
        for item in items
        if item.correct_position > 0
    ]

    # Check for same position three times in a row (sliding window of 3)
    for (first_no, first_pos), (_, mid_pos), (last_no, last_pos) in zip(marked, marked[1:], marked[2:]):
        if first_pos == mid_pos == last_pos:
            warnings.append(f"Correct answer in position {first_pos} three times in a row (items {first_no}-{last_no})")

    # Check distribution
    if len(marked) >= 8:
        from collections import Counter
        counts = Counter(pos for _, pos in marked)
        for pos in range(1, 5):
            if counts[pos] == 0:
                warnings.append(f"Position {pos} never used for correct answer across {len(marked)} items")

    return warnings
{len(item_results)} item(s)" + else: + failed_count = sum(1 for item in item_results if not item.valid) + message = f"❌ Validation failed: {failed_count}/{len(item_results)} item(s) have errors" + + return DocumentValidationResult( + valid=all_valid, + message=message, + document_title=document_title, + item_count=len(item_results), + items=item_results, + document_errors=document_errors, + document_warnings=document_warnings + ) + + +# ============================================================================ +# CLI +# ============================================================================ + +def main(): + """CLI entry point.""" + if len(sys.argv) < 2: + print("MultipleChoiceChallenge Validator") + print("") + print("Usage:") + print(" python mc_validator.py ") + print("") + print("Example:") + print(" python mc_validator.py /tmp/mc_items.md") + sys.exit(1) + + file_path = Path(sys.argv[1]) + + if not file_path.exists(): + print(f"Error: File not found: {file_path}") + sys.exit(1) + + content = file_path.read_text() + + # Validate + result = validate_document(content) + + # Output + print(result.message) + print(f"Document: {result.document_title}") + print("") + + # Document-level errors + if result.document_errors: + print("Document Errors:") + for error in result.document_errors: + print(f" ❌ {error}") + print("") + + # Document-level warnings + if result.document_warnings: + print("Document Warnings:") + for warning in result.document_warnings: + print(f" ⚠️ {warning}") + print("") + + # Per-item results + for item in result.items: + status = "βœ…" if item.valid else "❌" + pos_str = f"correct@{item.correct_position}" if item.correct_position > 0 else "no correct marked" + lengths_str = f"lengths: {item.option_lengths}" if item.option_lengths else "" + print(f"{status} Item {item.item_number}: \"{item.title}\" β€” {pos_str} {lengths_str}") + + if item.errors: + for error in item.errors: + print(f" ❌ {error}") + + if item.warnings: + for warning in 
def parse_document_header(content: str) -> Tuple[dict, str]:
    """
    Parse the document header (title, output, description).

    The header is a front-matter block delimited by ``---`` lines at the very
    start of the document. Each field is read only from a line that begins
    with its own key, and only from the value on that same line.

    Args:
        content: Full document text.

    Returns:
        Tuple of (header_data, remaining_content). ``header_data`` holds the
        "title", "output" and "description" entries that were found with a
        non-empty value; ``remaining_content`` is the text after the front
        matter, or the whole input when there is no front matter.
    """
    header = {}

    # Check for YAML front matter
    front_matter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if not front_matter_match:
        return header, content

    front_matter = front_matter_match.group(1)

    for key in ("title", "output", "description"):
        # Anchor the key to the start of a line so "subtitle:" is not read as
        # "title:", and keep the match on one line ([ \t]* instead of \s*) so
        # an empty "title:" does not capture the following line.
        field_match = re.search(rf'^[ \t]*{key}:[ \t]*(\S.*)$', front_matter, re.MULTILINE)
        if field_match:
            header[key] = field_match.group(1).strip()

    return header, content[front_matter_match.end():]
def validate_yaml_block(yaml_content: str) -> Tuple[List[str], List[str]]:
    """Validate the YAML metadata block for BlanksChallenge.

    The block is read as flat ``key: value`` lines. Checks are made against
    the actual keys and values rather than substrings of the whole block, so
    e.g. ``lookup_key:`` does not count as ``key:`` and the type is taken
    from the ``type:`` line only.

    Args:
        yaml_content: Text inside the yaml metadata fence.

    Returns:
        Tuple of (errors, warnings). Errors: empty block, missing required
        field, wrong type. Warnings: unexpected item_writer_id, unit that is
        not 2-4 kebab-case words.
    """
    errors = []
    warnings = []

    if not yaml_content:
        errors.append("YAML block is empty")
        return errors, warnings

    # Collect "key: value" pairs; the first occurrence of a key wins.
    pair_pattern = re.compile(r'^\s*([A-Za-z_][\w-]*)\s*:(.*)$')
    fields = {}
    for line in yaml_content.split('\n'):
        pair = pair_pattern.match(line)
        if pair:
            fields.setdefault(pair.group(1), pair.group(2).strip())

    def scalar(name):
        """Return the field value without surrounding quotes or a trailing # comment."""
        value = fields.get(name, "")
        quoted = re.match(r'''(['"])(.*?)\1\s*(?:#.*)?$''', value)
        if quoted:
            return quoted.group(2)
        return re.sub(r'\s+#.*$', '', value)

    # Check for required fields
    required_fields = (
        "type",
        "key",
        "unit",
        "subskill",
        "initial_difficulty",
        "item_writer_id",
    )
    for field_name in required_fields:
        if field_name not in fields:
            errors.append(f"Missing '{field_name}' in YAML block")

    # Validate type is BlanksChallenge
    if "type" in fields and scalar("type") != "BlanksChallenge":
        errors.append("type must be 'BlanksChallenge'")

    # Validate item_writer_id is 999999999 (only checked when it is numeric)
    writer_match = re.match(r'\d+', scalar("item_writer_id"))
    if writer_match and writer_match.group(0) != "999999999":
        warnings.append(f"item_writer_id should be '999999999', found '{writer_match.group(0)}'")

    # Check unit format (should be kebab-case, 2-4 words)
    if "unit" in fields:
        unit_value = scalar("unit")
        if not re.match(r'^[a-z0-9]+(-[a-z0-9]+){1,3}$', unit_value):
            warnings.append(f"unit '{unit_value}' should be kebab-case (e.g., 'llm-metrics', 'llm-tasks-hf-tools')")

    return errors, warnings
return errors, warnings + + # Check for consecutive numbering starting at 1 + expected = list(range(1, len(blank_numbers) + 1)) + if blank_numbers != expected: + errors.append(f"Blank numbers should be consecutive starting at 1. Found: {blank_numbers}") + + # Validate variables section has matching entries + if variables: + for num in blank_numbers: + expr_pattern = f"expr{num}:" + if expr_pattern not in variables: + errors.append(f"Missing 'expr{num}:' in @variables for {{{{_expr{num}}}}}") + + # Check for extra variables not in code + var_matches = re.findall(r'expr(\d+):', variables) + for var_num in var_matches: + if int(var_num) not in blank_numbers: + warnings.append(f"Variable 'expr{var_num}' defined but not used in @code1") + + return errors, warnings + + +def validate_code1_content(code1: str) -> Tuple[List[str], List[str]]: + """Validate code1 content (no comments, etc.).""" + errors = [] + warnings = [] + + if not code1: + return errors, warnings + + # Check for comments (should not be in @code1) + comments = re.findall(r'#.*$', code1, re.MULTILINE) + if comments: + errors.append(f"@code1 should not contain comments. 
def validate_single_item(content: str, item_number: int) -> ItemValidationResult:
    """
    Validate a single BlanksChallenge item.

    Runs the parser and then each validator in turn, collecting all errors
    and warnings: YAML metadata, required sections, blank placeholders,
    @code1 content and @variables format.

    Args:
        content: Item markdown content
        item_number: 1-based item number

    Returns:
        ItemValidationResult; ``valid`` is True only when no errors were
        collected. ``blank_count`` is the number of {{_exprN}} occurrences
        in @code1 (0 when @code1 is missing).
    """
    all_errors = []
    all_warnings = []

    # Parse the item
    parsed, parse_errors = parse_single_item(content)
    all_errors.extend(parse_errors)

    # The parser always stores a "title" key (None when the heading is
    # missing), so a .get() default would never apply; fall back explicitly.
    title = parsed.get("title") or f"Item {item_number}"

    # Validate YAML block (a missing block is already reported by the parser)
    if parsed.get("yaml_block"):
        yaml_errors, yaml_warnings = validate_yaml_block(parsed["yaml_block"])
        all_errors.extend(yaml_errors)
        all_warnings.extend(yaml_warnings)

    # Validate required sections
    all_errors.extend(validate_required_sections(parsed))

    # Validate blanks, then the code content itself
    blank_count = 0
    code1 = parsed.get("code1")
    if code1:
        blank_errors, blank_warnings = validate_blanks(code1, parsed.get("variables"))
        all_errors.extend(blank_errors)
        all_warnings.extend(blank_warnings)
        blank_count = len(re.findall(r'\{\{_expr\d+\}\}', code1))

        code_errors, code_warnings = validate_code1_content(code1)
        all_errors.extend(code_errors)
        all_warnings.extend(code_warnings)

    # Validate variables format
    if parsed.get("variables"):
        var_errors, var_warnings = validate_variables_format(parsed["variables"])
        all_errors.extend(var_errors)
        all_warnings.extend(var_warnings)

    return ItemValidationResult(
        valid=len(all_errors) == 0,
        title=title,
        item_number=item_number,
        errors=all_errors,
        warnings=all_warnings,
        blank_count=blank_count
    )
item_results if not item.valid) + message = f"❌ Validation failed: {failed_count}/{len(item_results)} item(s) have errors" + + return DocumentValidationResult( + valid=all_valid, + message=message, + document_title=document_title, + item_count=len(item_results), + items=item_results, + document_errors=document_errors + ) + + +# ============================================================================ +# CLI +# ============================================================================ + +def main(): + """CLI entry point.""" + if len(sys.argv) < 2: + print("Python BlanksChallenge Validator") + print("") + print("Usage:") + print(" python python_coding_validator.py ") + print("") + print("Example:") + print(" python python_coding_validator.py /tmp/exercise_to_validate.md") + sys.exit(1) + + file_path = Path(sys.argv[1]) + + if not file_path.exists(): + print(f"Error: File not found: {file_path}") + sys.exit(1) + + content = file_path.read_text() + + # Validate + result = validate_document(content) + + # Output + print(result.message) + print(f"Document: {result.document_title}") + print("") + + # Document-level errors + if result.document_errors: + print("Document Errors:") + for error in result.document_errors: + print(f" ❌ {error}") + print("") + + # Per-item results + for item in result.items: + status = "βœ…" if item.valid else "❌" + print(f"{status} Item {item.item_number}: \"{item.title}\" β€” {item.blank_count} blank(s)") + + if item.errors: + for error in item.errors: + print(f" ❌ {error}") + + if item.warnings: + for warning in item.warnings: + print(f" ⚠️ {warning}") + + sys.exit(0 if result.valid else 1) + + +if __name__ == "__main__": + main() diff --git a/.cursor/validators/r_coding_validator.py b/.cursor/validators/r_coding_validator.py new file mode 100644 index 0000000..f024007 --- /dev/null +++ b/.cursor/validators/r_coding_validator.py @@ -0,0 +1,478 @@ +#!/usr/bin/env python3 +""" +R BlanksChallenge Validator + +Validates BlanksChallenge 
exercise markdown structure for R coding items. + +Usage: + python r_coding_validator.py + python r_coding_validator.py /tmp/exercise_to_validate.md +""" + +import sys +import re +from pathlib import Path +from typing import List, Optional, Tuple +from dataclasses import dataclass, field + + +# ============================================================================ +# VALIDATION MODELS +# ============================================================================ + +@dataclass +class ItemValidationResult: + """Result of validating a single item.""" + valid: bool + title: str + item_number: int + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + blank_count: int = 0 + + +@dataclass +class DocumentValidationResult: + """Result of validating the entire document.""" + valid: bool + message: str + document_title: Optional[str] = None + item_count: int = 0 + items: List[ItemValidationResult] = field(default_factory=list) + document_errors: List[str] = field(default_factory=list) + + +# ============================================================================ +# PARSER +# ============================================================================ + +def parse_document_header(content: str) -> Tuple[dict, str]: + """ + Parse the document header (title, output, description). 
def parse_single_item(content: str) -> Tuple[dict, List[str]]:
    """
    Parse a single BlanksChallenge item (R flavour).

    Reads the item heading (``## [Title]`` preferred, plain ``## Title``
    accepted), the first fenced yaml metadata block, and the `@context`,
    `@code1`, `@pre_challenge_code`, `@variables` and `@distractors`
    sections. Code sections may be fenced as either r or {r}.

    Returns:
        Tuple of (parsed_data, errors). Sections that are absent stay None in
        parsed_data; only a missing heading or metadata block is an error here.
    """
    problems = []

    result = dict.fromkeys(
        (
            "title",
            "yaml_block",
            "context",
            "code1",
            "pre_challenge_code",
            "variables",
            "distractors",
        )
    )

    # Heading: bracketed form first, then the bare form as a fallback
    heading = (
        re.search(r'^##\s+\[([^\]]+)\]', content, re.MULTILINE)
        or re.search(r'^##\s+(.+?)(?:\n|$)', content, re.MULTILINE)
    )
    if heading:
        result["title"] = heading.group(1).strip()
    else:
        problems.append("Missing item heading (must have '## [Title]' or '## Title')")

    # Metadata: first yaml fence in the item
    metadata = re.search(r'```yaml\s*\n(.*?)```', content, re.DOTALL)
    if metadata is None:
        problems.append("Missing ```yaml metadata block")
    else:
        result["yaml_block"] = metadata.group(1).strip()

    # Named sections - support both ```r and ```{r}
    section_patterns = (
        ("context", r'`@context`\s*\n(.*?)(?=`@|\Z)'),
        ("code1", r'`@code1`\s*\n```(?:r|\{r\})\s*\n(.*?)```'),
        ("pre_challenge_code", r'`@pre_challenge_code`\s*\n```(?:r|\{r\})\s*\n(.*?)```'),
        ("variables", r'`@variables`\s*\n```yaml\s*\n(.*?)```'),
        ("distractors", r'`@distractors`\s*\n```yaml\s*\n(.*?)```'),
    )
    for key, regex in section_patterns:
        found = re.search(regex, content, re.DOTALL)
        if found:
            result[key] = found.group(1).strip()

    return result, problems
("subskill:", "subskill"), + ("initial_difficulty:", "initial_difficulty"), + ("item_writer_id:", "item_writer_id"), + ] + + for field_pattern, field_name in required_fields: + if field_pattern not in yaml_content: + errors.append(f"Missing '{field_name}' in YAML block") + + # Validate type is BlanksChallenge + if "type:" in yaml_content and "BlanksChallenge" not in yaml_content: + errors.append("type must be 'BlanksChallenge'") + + # Validate item_writer_id is 999999999 + writer_match = re.search(r"item_writer_id:\s*['\"]?(\d+)['\"]?", yaml_content) + if writer_match: + if writer_match.group(1) != "999999999": + warnings.append(f"item_writer_id should be '999999999', found '{writer_match.group(1)}'") + + # Check unit format (should be kebab-case, 2-4 words) + unit_match = re.search(r'unit:\s*([^\n]+)', yaml_content) + if unit_match: + unit_value = unit_match.group(1).strip() + if not re.match(r'^[a-z0-9]+(-[a-z0-9]+){1,3}$', unit_value): + warnings.append(f"unit '{unit_value}' should be kebab-case (e.g., 'dplyr-wrangling', 'tidyverse-pipes')") + + return errors, warnings + + +def validate_required_sections(parsed: dict) -> List[str]: + """Validate that all required sections are present.""" + errors = [] + + required_sections = [ + ("context", "`@context`"), + ("code1", "`@code1`"), + ("variables", "`@variables`"), + ] + + for section_key, section_name in required_sections: + if not parsed.get(section_key): + errors.append(f"Missing required section: {section_name}") + + return errors + + +def validate_blanks(code1: str, variables: str) -> Tuple[List[str], List[str]]: + """Validate {{_exprN}} placeholders in code1 match variables.""" + errors = [] + warnings = [] + + if not code1: + return errors, warnings + + # Find all {{_exprN}} placeholders in code1 + blanks = re.findall(r'\{\{_expr(\d+)\}\}', code1) + blank_numbers = sorted(set(int(b) for b in blanks)) + + if not blank_numbers: + errors.append("No {{_exprN}} blanks found in @code1") + return errors, warnings + 
def validate_code1_content(code1: str) -> Tuple[List[str], List[str]]:
    """Validate code1 content (no comments, R-specific checks).

    String literals are blanked out before any check, so a ``#`` or ``=``
    inside a string (e.g. the colour "#FF0000") is not mistaken for a comment
    or an assignment.

    Args:
        code1: Text of the @code1 section; may contain {{_exprN}} blanks.

    Returns:
        Tuple of (errors, warnings). An error when any line carries a ``#``
        comment; a warning when ``=`` is used for assignment at the start of
        a top-level statement.
    """
    errors = []
    warnings = []

    if not code1:
        return errors, warnings

    # Single- or double-quoted literal on one line, honouring backslash escapes
    string_literal = re.compile(r'"(?:\\.|[^"\\\n])*"|\'(?:\\.|[^\'\\\n])*\'')
    code_lines = [string_literal.sub('""', line) for line in code1.split('\n')]

    # Check for comments (should not be in @code1); counted per line
    comment_count = sum(1 for line in code_lines if '#' in line)
    if comment_count:
        errors.append(f"@code1 should not contain comments. Found {comment_count} comment(s)")

    # R-specific checks

    # Check for = assignment (should use <-).
    # "=(?!=)" skips the "==" comparison. Bracket depth is tracked so a named
    # argument on a continuation line of a multi-line call ("total = a + b"
    # inside mutate(...)) is not treated as an assignment.
    equals_assignment = re.compile(r'\s*[a-zA-Z_][a-zA-Z0-9_.]*\s*=(?!=)')
    depth = 0
    uses_equals_assignment = False
    for line in code_lines:
        code_part = line.split('#', 1)[0]
        # depth <= 0: a blank may hide a bracket, so never trust a negative depth
        if depth <= 0 and equals_assignment.match(code_part):
            uses_equals_assignment = True
        depth += code_part.count('(') + code_part.count('[')
        depth -= code_part.count(')') + code_part.count(']')

    if uses_equals_assignment:
        warnings.append("R convention: use '<-' for assignment instead of '='")

    return errors, warnings
+ + Args: + content: Item markdown content + item_number: 1-based item number + + Returns: + ItemValidationResult + """ + all_errors = [] + all_warnings = [] + + # Parse the item + parsed, parse_errors = parse_single_item(content) + all_errors.extend(parse_errors) + + title = parsed.get("title", f"Item {item_number}") + + # Validate YAML block + if parsed.get("yaml_block"): + yaml_errors, yaml_warnings = validate_yaml_block(parsed["yaml_block"]) + all_errors.extend(yaml_errors) + all_warnings.extend(yaml_warnings) + + # Validate required sections + section_errors = validate_required_sections(parsed) + all_errors.extend(section_errors) + + # Validate blanks + blank_count = 0 + if parsed.get("code1"): + blank_errors, blank_warnings = validate_blanks(parsed["code1"], parsed.get("variables")) + all_errors.extend(blank_errors) + all_warnings.extend(blank_warnings) + blank_count = len(re.findall(r'\{\{_expr\d+\}\}', parsed["code1"])) + + # Validate code1 content + if parsed.get("code1"): + code_errors, code_warnings = validate_code1_content(parsed["code1"]) + all_errors.extend(code_errors) + all_warnings.extend(code_warnings) + + # Validate variables format + if parsed.get("variables"): + var_errors, var_warnings = validate_variables_format(parsed["variables"]) + all_errors.extend(var_errors) + all_warnings.extend(var_warnings) + + return ItemValidationResult( + valid=len(all_errors) == 0, + title=title, + item_number=item_number, + errors=all_errors, + warnings=all_warnings, + blank_count=blank_count + ) + + +# ============================================================================ +# MAIN VALIDATOR +# ============================================================================ + +def validate_document(content: str) -> DocumentValidationResult: + """ + Validate a complete BlanksChallenge document with multiple items. 
+ + Args: + content: Full markdown content + + Returns: + DocumentValidationResult + """ + document_errors = [] + + # Parse document header + header, remaining_content = parse_document_header(content) + document_title = header.get("title", "Untitled") + + # Check for document header + if not header.get("title"): + document_errors.append("Missing document 'title:' in front matter") + + # Split into items + items_content = split_items(remaining_content) + + if not items_content: + document_errors.append("No items found in document") + return DocumentValidationResult( + valid=False, + message="❌ No items found in document", + document_title=document_title, + document_errors=document_errors + ) + + # Validate each item + item_results = [] + for i, item_content in enumerate(items_content, start=1): + result = validate_single_item(item_content, i) + item_results.append(result) + + # Aggregate results + all_valid = all(item.valid for item in item_results) and len(document_errors) == 0 + total_blanks = sum(item.blank_count for item in item_results) + + if all_valid: + message = f"βœ… Validation passed: {len(item_results)} item(s), {total_blanks} total blanks" + else: + failed_count = sum(1 for item in item_results if not item.valid) + message = f"❌ Validation failed: {failed_count}/{len(item_results)} item(s) have errors" + + return DocumentValidationResult( + valid=all_valid, + message=message, + document_title=document_title, + item_count=len(item_results), + items=item_results, + document_errors=document_errors + ) + + +# ============================================================================ +# CLI +# ============================================================================ + +def main(): + """CLI entry point.""" + if len(sys.argv) < 2: + print("R BlanksChallenge Validator") + print("") + print("Usage:") + print(" python r_coding_validator.py ") + print("") + print("Example:") + print(" python r_coding_validator.py /tmp/exercise_to_validate.md") + sys.exit(1) + 
+ file_path = Path(sys.argv[1]) + + if not file_path.exists(): + print(f"Error: File not found: {file_path}") + sys.exit(1) + + content = file_path.read_text() + + # Validate + result = validate_document(content) + + # Output + print(result.message) + print(f"Document: {result.document_title}") + print("") + + # Document-level errors + if result.document_errors: + print("Document Errors:") + for error in result.document_errors: + print(f" ❌ {error}") + print("") + + # Per-item results + for item in result.items: + status = "βœ…" if item.valid else "❌" + print(f"{status} Item {item.item_number}: \"{item.title}\" β€” {item.blank_count} blank(s)") + + if item.errors: + for error in item.errors: + print(f" ❌ {error}") + + if item.warnings: + for warning in item.warnings: + print(f" ⚠️ {warning}") + + sys.exit(0 if result.valid else 1) + + +if __name__ == "__main__": + main() diff --git a/.cursor/validators/r_iterative_validator.py b/.cursor/validators/r_iterative_validator.py new file mode 100644 index 0000000..f28b2c0 --- /dev/null +++ b/.cursor/validators/r_iterative_validator.py @@ -0,0 +1,679 @@ +#!/usr/bin/env python3 +""" +R Iterative (Bullet) Exercise Validator + +Validates the complete exercise markdown structure for R iterative exercises. +These exercises have a parent BulletExercise with multiple NormalExercise children. +Each step is INDEPENDENT - code does NOT accumulate across steps. 
+ +Usage: + python r_iterative_validator.py + python r_iterative_validator.py /tmp/exercise_to_validate.md + +R-specific rules: +- Uses 3 underscores (___) for scaffolding +- Uses ```{r} or ```r code blocks +- Follows tidyverse style guide (<- for assignment, %>% for pipes) +""" + +import sys +import re +from pathlib import Path +from typing import List, Optional, Tuple +from dataclasses import dataclass, field + + +# ============================================================================ +# CONSTANTS +# ============================================================================ + +# Structural constraints (will break production if violated) +MIN_STEPS = 2 +MAX_STEPS = 4 +EXPECTED_TOTAL_XP = 100 + +# Content guidelines (optional, for information only) +GUIDELINE_MAX_CONTEXT_LENGTH = 550 +GUIDELINE_RECOMMENDED_CONTEXT_LENGTH = 300 +GUIDELINE_MAX_TITLE_LENGTH = 25 +GUIDELINE_RECOMMENDED_INSTRUCTION_LENGTH = 60 + + +# ============================================================================ +# DATA CLASSES +# ============================================================================ + +@dataclass +class StepData: + """Data for a single exercise step.""" + index: int + yaml_block: Optional[str] = None + xp: Optional[int] = None + instructions: Optional[str] = None + hint: Optional[str] = None + sample_code: Optional[str] = None + solution: Optional[str] = None + sct: Optional[str] = None + + +@dataclass +class ExerciseData: + """Data for the complete iterative exercise.""" + title: Optional[str] = None + yaml_block: Optional[str] = None + context: Optional[str] = None + pre_exercise_code: Optional[str] = None + steps: List[StepData] = field(default_factory=list) + + +@dataclass +class ValidationResult: + """Result of validation.""" + valid: bool + message: str + title: Optional[str] = None + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + + +# 
============================================================================ +# PARSER +# ============================================================================ + +def parse_iterative_exercise(content: str) -> Tuple[ExerciseData, List[str]]: + """ + Parse the iterative exercise markdown to extract components. + + Returns: + Tuple of (ExerciseData, parse_errors) + """ + errors = [] + content = content.strip() + + # Remove leading --- separator if present + content = re.sub(r'^---\s*\n', '', content).strip() + + exercise = ExerciseData() + + # Extract title from heading + title_match = re.match(r'^##\s+(.+?)(?:\n|$)', content) + if not title_match: + errors.append("Missing markdown heading (must start with '## ')") + else: + exercise.title = title_match.group(1).strip() + + # Split into parent and steps using *** separator + parts = re.split(r'\n\*\*\*\s*\n', content) + + if len(parts) < 2: + errors.append("No step separators (***) found. Iterative exercises need at least 2 steps.") + return exercise, errors + + parent_section = parts[0] + step_sections = parts[1:] + + # Parse parent section + exercise = parse_parent_section(parent_section, exercise, errors) + + # Parse each step + for i, step_content in enumerate(step_sections): + step = parse_step_section(step_content, i + 1, errors) + exercise.steps.append(step) + + return exercise, errors + + +def parse_parent_section(content: str, exercise: ExerciseData, errors: List[str]) -> ExerciseData: + """Parse the parent BulletExercise section.""" + + # Extract YAML metadata block + yaml_match = re.search(r'```yaml\s*\n(.*?)```', content, re.DOTALL) + if yaml_match: + exercise.yaml_block = yaml_match.group(1).strip() + else: + errors.append("Missing ```yaml metadata block in parent section") + + # Extract pre_exercise_code (R uses ```{r} or ```r) + pre_code_match = re.search(r'`@pre_exercise_code`\s*\n```\{r\}\s*\n(.*?)```', content, re.DOTALL) + if pre_code_match: + exercise.pre_exercise_code = 
pre_code_match.group(1).strip() + else: + # Try alternative syntax without curly braces + pre_code_match = re.search(r'`@pre_exercise_code`\s*\n```r\s*\n(.*?)```', content, re.DOTALL) + if pre_code_match: + exercise.pre_exercise_code = pre_code_match.group(1).strip() + + # Extract context (text between yaml block and @pre_exercise_code or first ***) + yaml_end_match = re.search(r'```yaml.*?```\s*\n', content, re.DOTALL) + if yaml_end_match: + after_yaml = content[yaml_end_match.end():] + pre_code_start = after_yaml.find('`@pre_exercise_code`') + if pre_code_start > 0: + exercise.context = after_yaml[:pre_code_start].strip() + else: + exercise.context = after_yaml.strip() + + return exercise + + +def parse_step_section(content: str, step_index: int, errors: List[str]) -> StepData: + """Parse a single step (NormalExercise) section.""" + + step = StepData(index=step_index) + + # Extract YAML metadata block + yaml_match = re.search(r'```yaml\s*\n(.*?)```', content, re.DOTALL) + if yaml_match: + step.yaml_block = yaml_match.group(1).strip() + # Extract XP + xp_match = re.search(r'xp:\s*(\d+)', step.yaml_block) + if xp_match: + step.xp = int(xp_match.group(1)) + else: + errors.append(f"Step {step_index}: Missing ```yaml metadata block") + + # Extract text sections + sections = { + "instructions": r'`@instructions`\s*\n(.*?)(?=`@|\Z)', + "hint": r'`@hint`\s*\n(.*?)(?=`@|\Z)', + } + + # Extract code sections (R uses ```{r} or ```r) + code_sections = { + "sample_code": r'`@sample_code`\s*\n```\{r\}\s*\n(.*?)```', + "solution": r'`@solution`\s*\n```\{r\}\s*\n(.*?)```', + "sct": r'`@sct`\s*\n```\{r\}\s*\n(.*?)```', + } + + # Fallback patterns without curly braces + code_sections_fallback = { + "sample_code": r'`@sample_code`\s*\n```r\s*\n(.*?)```', + "solution": r'`@solution`\s*\n```r\s*\n(.*?)```', + "sct": r'`@sct`\s*\n```r\s*\n(.*?)```', + } + + for section_name, pattern in sections.items(): + match = re.search(pattern, content, re.DOTALL) + if match: + setattr(step, 
section_name, match.group(1).strip()) + + for section_name, pattern in code_sections.items(): + match = re.search(pattern, content, re.DOTALL) + if match: + setattr(step, section_name, match.group(1).strip()) + else: + # Try fallback + fallback_pattern = code_sections_fallback.get(section_name) + if fallback_pattern: + match = re.search(fallback_pattern, content, re.DOTALL) + if match: + setattr(step, section_name, match.group(1).strip()) + + return step + + +# ============================================================================ +# VALIDATORS +# ============================================================================ + +def validate_parent_yaml(yaml_content: str) -> Tuple[List[str], List[str]]: + """Validate the parent YAML metadata block.""" + errors = [] + warnings = [] + + if not yaml_content: + errors.append("Parent YAML block is empty") + return errors, warnings + + # Check for required type + if "type:" not in yaml_content: + errors.append("Missing 'type:' in parent YAML block") + elif "BulletExercise" not in yaml_content: + errors.append("Parent type must be 'BulletExercise' (not TabExercise)") + + # Check for XP + if "xp:" not in yaml_content: + errors.append("Missing 'xp:' in parent YAML block") + else: + xp_match = re.search(r'xp:\s*(\d+)', yaml_content) + if xp_match and int(xp_match.group(1)) != EXPECTED_TOTAL_XP: + errors.append(f"Parent xp should be {EXPECTED_TOTAL_XP}, found {xp_match.group(1)}") + + return errors, warnings + + +def validate_step_yaml(step: StepData) -> Tuple[List[str], List[str]]: + """Validate a step's YAML metadata block.""" + errors = [] + warnings = [] + + if not step.yaml_block: + errors.append(f"Step {step.index}: YAML block is empty") + return errors, warnings + + # Check for required type + if "type:" not in step.yaml_block: + errors.append(f"Step {step.index}: Missing 'type:' in YAML block") + elif "NormalExercise" not in step.yaml_block: + errors.append(f"Step {step.index}: Type must be 'NormalExercise'") + + # 
Check for XP + if "xp:" not in step.yaml_block: + errors.append(f"Step {step.index}: Missing 'xp:' in YAML block") + + return errors, warnings + + +def validate_title(title: str) -> Tuple[List[str], List[str]]: + """Validate exercise title (structural check only).""" + errors = [] + warnings = [] + + if not title: + errors.append("Missing exercise title") + + return errors, warnings + + +def validate_context(context: str) -> Tuple[List[str], List[str]]: + """Validate exercise context (structural check only).""" + errors = [] + warnings = [] + + if not context: + warnings.append("Missing context (narrative explaining the scenario)") + + return errors, warnings + + +def validate_step_count(steps: List[StepData]) -> Tuple[List[str], List[str]]: + """Validate the number of steps (structural check).""" + errors = [] + warnings = [] + + step_count = len(steps) + + if step_count < MIN_STEPS: + errors.append(f"Too few steps: {step_count}. Minimum is {MIN_STEPS}") + elif step_count > MAX_STEPS: + errors.append(f"Too many steps: {step_count}. 
Maximum is {MAX_STEPS}") + + return errors, warnings + + +def validate_xp_distribution(steps: List[StepData]) -> Tuple[List[str], List[str]]: + """Validate that XP values sum to expected total.""" + errors = [] + warnings = [] + + total_xp = 0 + missing_xp = [] + + for step in steps: + if step.xp is not None: + total_xp += step.xp + else: + missing_xp.append(step.index) + + if missing_xp: + errors.append(f"Steps missing XP values: {missing_xp}") + elif total_xp != EXPECTED_TOTAL_XP: + errors.append(f"XP values sum to {total_xp}, expected {EXPECTED_TOTAL_XP}") + + return errors, warnings + + +def validate_step_sections(step: StepData) -> Tuple[List[str], List[str]]: + """Validate that a step has all required sections.""" + errors = [] + warnings = [] + + required_sections = [ + ("instructions", "`@instructions`"), + ("hint", "`@hint`"), + ("sample_code", "`@sample_code`"), + ("solution", "`@solution`"), + ("sct", "`@sct`"), + ] + + for section_key, section_name in required_sections: + if not getattr(step, section_key, None): + errors.append(f"Step {step.index}: Missing required section {section_name}") + + return errors, warnings + + +def validate_instruction_presence(step: StepData) -> Tuple[List[str], List[str]]: + """Validate that step has instruction content (structural check only).""" + errors = [] + warnings = [] + + if not step.instructions: + return errors, warnings + + instruction_text = step.instructions.strip() + if not instruction_text: + warnings.append(f"Step {step.index}: Instructions section is empty") + + return errors, warnings + + +def validate_scaffolding(step: StepData) -> Tuple[List[str], List[str]]: + """Validate scaffolding in sample code (R uses 3 underscores).""" + errors = [] + warnings = [] + + if not step.sample_code: + return errors, warnings + + # Check for 3-underscore scaffolding (R uses 3, not 4) + three_underscores = re.findall(r'(? 
Tuple[List[str], List[str]]: + """ + Validate that sample and solution code are identical except for scaffolding. + + This is a critical structural check - sample code should be the solution + with ___ placeholders where learners fill in answers. + """ + errors = [] + warnings = [] + + if not step.sample_code or not step.solution: + return errors, warnings + + sample_normalized = step.sample_code.strip() + solution_normalized = step.solution.strip() + + # First check: line counts must match + sample_lines = sample_normalized.split('\n') + solution_lines = solution_normalized.split('\n') + + if len(sample_lines) != len(solution_lines): + errors.append( + f"Step {step.index}: Line count mismatch - sample has {len(sample_lines)} lines, " + f"solution has {len(solution_lines)} lines" + ) + return errors, warnings + + # Second check: each line must match when scaffolding is accounted for + for i, (sample_line, solution_line) in enumerate(zip(sample_lines, solution_lines), 1): + # Check if lines match (sample line with ___ replaced by regex pattern) + if '___' in sample_line: + # Build pattern: escape everything except ___ which becomes .+ + parts = sample_line.split('___') + escaped_parts = [re.escape(p) for p in parts] + line_pattern = '.+'.join(escaped_parts) + + if not re.fullmatch(line_pattern, solution_line): + errors.append( + f"Step {step.index}: Line {i} structure mismatch - " + f"sample and solution differ beyond scaffolding" + ) + else: + # No scaffolding on this line - must be identical + if sample_line != solution_line: + errors.append( + f"Step {step.index}: Line {i} mismatch - " + f"lines must be identical (no scaffolding on this line)" + ) + + return errors, warnings + + +def validate_r_style(step: StepData) -> Tuple[List[str], List[str]]: + """Validate R-specific style conventions (tidyverse guidelines).""" + warnings = [] + errors = [] + + if not step.solution: + return errors, warnings + + # Check for = assignment (should use <-) + lines = 
step.solution.split('\n') + for i, line in enumerate(lines, 1): + # Skip comments and lines inside function calls + if line.strip().startswith('#'): + continue + # Check for = assignment pattern at start of line (not inside function) + if re.search(r'^\s*\w+\s*=\s*[^=]', line) and '(' not in line.split('=')[0]: + warnings.append( + f"Step {step.index}, Line {i}: Consider using '<-' for assignment instead of '='" + ) + + return errors, warnings + + +def validate_success_message(steps: List[StepData]) -> Tuple[List[str], List[str]]: + """Validate that success_msg is only in the last step.""" + errors = [] + warnings = [] + + for i, step in enumerate(steps): + if not step.sct: + continue + + has_success_msg = 'success_msg(' in step.sct + is_last_step = (i == len(steps) - 1) + + if has_success_msg and not is_last_step: + warnings.append(f"Step {step.index}: success_msg() should only be in the last step") + + if is_last_step and not has_success_msg: + warnings.append(f"Step {step.index} (last step): Missing success_msg() in SCT") + + return errors, warnings + + +def validate_pre_exercise_code(pre_exercise_code: str) -> Tuple[List[str], List[str]]: + """Validate pre-exercise code exists.""" + errors = [] + warnings = [] + + if not pre_exercise_code: + warnings.append("Missing `@pre_exercise_code` section in parent (may be intentional)") + + return errors, warnings + + +# ============================================================================ +# MAIN VALIDATOR +# ============================================================================ + +def validate_exercise(content: str) -> ValidationResult: + """ + Validate a complete R iterative exercise. 
+ + Args: + content: Full markdown content + + Returns: + ValidationResult with validation status and details + """ + all_errors = [] + all_warnings = [] + + # Step 1: Parse the markdown + exercise, parse_errors = parse_iterative_exercise(content) + all_errors.extend(parse_errors) + + title = exercise.title or "Unknown" + + # If parsing failed badly, return early + if len(exercise.steps) == 0 and parse_errors: + return ValidationResult( + valid=False, + message=f"❌ Parsing failed: \"{title}\"", + title=title, + errors=all_errors, + warnings=all_warnings + ) + + # Step 2: Validate title + title_errors, title_warnings = validate_title(exercise.title) + all_errors.extend(title_errors) + all_warnings.extend(title_warnings) + + # Step 3: Validate parent YAML block + if exercise.yaml_block: + yaml_errors, yaml_warnings = validate_parent_yaml(exercise.yaml_block) + all_errors.extend(yaml_errors) + all_warnings.extend(yaml_warnings) + + # Step 4: Validate context + context_errors, context_warnings = validate_context(exercise.context) + all_errors.extend(context_errors) + all_warnings.extend(context_warnings) + + # Step 5: Validate pre-exercise code + pre_errors, pre_warnings = validate_pre_exercise_code(exercise.pre_exercise_code) + all_errors.extend(pre_errors) + all_warnings.extend(pre_warnings) + + # Step 6: Validate step count + step_count_errors, step_count_warnings = validate_step_count(exercise.steps) + all_errors.extend(step_count_errors) + all_warnings.extend(step_count_warnings) + + # Step 7: Validate XP distribution + xp_errors, xp_warnings = validate_xp_distribution(exercise.steps) + all_errors.extend(xp_errors) + all_warnings.extend(xp_warnings) + + # Step 8: Validate each step + for step in exercise.steps: + # Step YAML + step_yaml_errors, step_yaml_warnings = validate_step_yaml(step) + all_errors.extend(step_yaml_errors) + all_warnings.extend(step_yaml_warnings) + + # Required sections + section_errors, section_warnings = validate_step_sections(step) + 
all_errors.extend(section_errors) + all_warnings.extend(section_warnings) + + # Instruction presence + inst_errors, inst_warnings = validate_instruction_presence(step) + all_errors.extend(inst_errors) + all_warnings.extend(inst_warnings) + + # Scaffolding (R uses 3 underscores) + scaffold_errors, scaffold_warnings = validate_scaffolding(step) + all_errors.extend(scaffold_errors) + all_warnings.extend(scaffold_warnings) + + # Code structure + struct_errors, struct_warnings = validate_code_structure(step) + all_errors.extend(struct_errors) + all_warnings.extend(struct_warnings) + + # R style (tidyverse guidelines) + style_errors, style_warnings = validate_r_style(step) + all_errors.extend(style_errors) + all_warnings.extend(style_warnings) + + # Step 9: Validate success message placement + success_errors, success_warnings = validate_success_message(exercise.steps) + all_errors.extend(success_errors) + all_warnings.extend(success_warnings) + + # Build result + if all_errors: + return ValidationResult( + valid=False, + message=f"❌ Validation failed: \"{title}\"", + title=title, + errors=all_errors, + warnings=all_warnings + ) + + # Count total scaffolding + total_scaffolds = 0 + for step in exercise.steps: + if step.sample_code: + total_scaffolds += len(re.findall(r'(?") + print("") + print("Example:") + print(" python r_iterative_validator.py /tmp/exercise_to_validate.md") + print("") + print("Validates:") + print(" - Parent structure (BulletExercise)") + print(" - Step count (2-4 steps)") + print(" - XP distribution (sums to 100)") + print(" - Required sections per step") + print(" - Scaffolding (3 underscores for R)") + print(" - Code structure matching") + print(" - R style (tidyverse: <- assignment, %>% pipes)") + sys.exit(1) + + file_path = Path(sys.argv[1]) + + if not file_path.exists(): + print(f"Error: File not found: {file_path}") + sys.exit(1) + + content = file_path.read_text() + + # Validate + result = validate_exercise(content) + + # Output + 
print(result.message) + + if result.errors: + print("") + print("🚨 Errors (must fix):") + for error in result.errors: + print(f" ❌ {error}") + + if result.warnings: + print("") + print("πŸ’‘ Warnings (suggestions):") + for warning in result.warnings: + print(f" ⚠️ {warning}") + + # Summary + print("") + if result.valid: + print(f"Summary: Exercise is valid with {len(result.warnings)} warning(s)") + else: + print(f"Summary: {len(result.errors)} error(s), {len(result.warnings)} warning(s)") + + sys.exit(0 if result.valid else 1) + + +if __name__ == "__main__": + main() diff --git a/.cursor/validators/requirements.txt b/.cursor/validators/requirements.txt new file mode 100644 index 0000000..ad99e03 --- /dev/null +++ b/.cursor/validators/requirements.txt @@ -0,0 +1,3 @@ +pydantic>=2.0 +pyyaml>=6.0 + diff --git a/.cursor/validators/sql_coding_validator.py b/.cursor/validators/sql_coding_validator.py new file mode 100644 index 0000000..e93ab1b --- /dev/null +++ b/.cursor/validators/sql_coding_validator.py @@ -0,0 +1,500 @@ +#!/usr/bin/env python3 +""" +SQL BlanksChallenge Validator + +Validates BlanksChallenge exercise markdown structure for SQL coding items. 
+ +Usage: + python sql_coding_validator.py + python sql_coding_validator.py /tmp/exercise_to_validate.md +""" + +import sys +import re +from pathlib import Path +from typing import List, Optional, Tuple +from dataclasses import dataclass, field + + +# ============================================================================ +# VALIDATION MODELS +# ============================================================================ + +@dataclass +class ItemValidationResult: + """Result of validating a single item.""" + valid: bool + title: str + item_number: int + errors: List[str] = field(default_factory=list) + warnings: List[str] = field(default_factory=list) + blank_count: int = 0 + + +@dataclass +class DocumentValidationResult: + """Result of validating the entire document.""" + valid: bool + message: str + document_title: Optional[str] = None + item_count: int = 0 + items: List[ItemValidationResult] = field(default_factory=list) + document_errors: List[str] = field(default_factory=list) + + +# ============================================================================ +# PARSER +# ============================================================================ + +def parse_document_header(content: str) -> Tuple[dict, str]: + """ + Parse the document header (title, output, description). 
+ + Returns: + Tuple of (header_data, remaining_content) + """ + header = {} + + # Check for YAML front matter + front_matter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL) + if front_matter_match: + front_matter = front_matter_match.group(1) + + # Extract title + title_match = re.search(r'title:\s*(.+)', front_matter) + if title_match: + header["title"] = title_match.group(1).strip() + + # Extract output + output_match = re.search(r'output:\s*(.+)', front_matter) + if output_match: + header["output"] = output_match.group(1).strip() + + # Extract description + desc_match = re.search(r'description:\s*(.+)', front_matter) + if desc_match: + header["description"] = desc_match.group(1).strip() + + remaining = content[front_matter_match.end():] + else: + remaining = content + + return header, remaining + + +def split_items(content: str) -> List[str]: + """ + Split content into individual items by --- separator. + + Returns: + List of item content strings + """ + # Split by --- on its own line (item separator) + items = re.split(r'\n---\s*\n', content) + + # Filter out empty items + items = [item.strip() for item in items if item.strip()] + + return items + + +def parse_single_item(content: str) -> Tuple[dict, List[str]]: + """ + Parse a single BlanksChallenge item. 
+ + Returns: + Tuple of (parsed_data, errors) + """ + errors = [] + + parsed = { + "title": None, + "yaml_block": None, + "context": None, + "code1": None, + "pre_challenge_code": None, + "variables": None, + "distractors": None, + } + + # Extract title from heading (## [Title] format) + title_match = re.search(r'^##\s+\[([^\]]+)\]', content, re.MULTILINE) + if not title_match: + # Try without brackets + title_match = re.search(r'^##\s+(.+?)(?:\n|$)', content, re.MULTILINE) + + if not title_match: + errors.append("Missing item heading (must have '## [Title]' or '## Title')") + else: + parsed["title"] = title_match.group(1).strip() + + # Extract YAML metadata block + yaml_match = re.search(r'```yaml\s*\n(.*?)```', content, re.DOTALL) + if yaml_match: + parsed["yaml_block"] = yaml_match.group(1).strip() + else: + errors.append("Missing ```yaml metadata block") + + # Extract sections - SQL uses {sql} for code1 and {python} for pre_challenge_code + sections = { + "context": r'`@context`\s*\n(.*?)(?=`@|\Z)', + "code1": r'`@code1`\s*\n```(?:sql|\{sql\})\s*\n(.*?)```', + "pre_challenge_code": r'`@pre_challenge_code`\s*\n```(?:python|\{python\})\s*\n(.*?)```', + "variables": r'`@variables`\s*\n```yaml\s*\n(.*?)```', + "distractors": r'`@distractors`\s*\n```yaml\s*\n(.*?)```', + } + + for section_name, pattern in sections.items(): + match = re.search(pattern, content, re.DOTALL) + if match: + parsed[section_name] = match.group(1).strip() + + return parsed, errors + + +# ============================================================================ +# VALIDATORS +# ============================================================================ + +def validate_yaml_block(yaml_content: str) -> Tuple[List[str], List[str]]: + """Validate the YAML metadata block for BlanksChallenge.""" + errors = [] + warnings = [] + + if not yaml_content: + errors.append("YAML block is empty") + return errors, warnings + + # Check for required fields + required_fields = [ + ("type:", "type"), + 
def validate_required_sections(parsed: dict) -> List[str]:
    """Report every mandatory section that is absent or empty in a parsed item.

    Args:
        parsed: Mapping describing one item; the keys ``context``, ``code1``
            and ``variables`` are inspected.

    Returns:
        One error message per section whose value is missing or falsy, in the
        order context, code1, variables.
    """
    required_sections = (
        ("context", "`@context`"),
        ("code1", "`@code1`"),
        ("variables", "`@variables`"),
    )
    return [
        f"Missing required section: {section_name}"
        for section_key, section_name in required_sections
        if not parsed.get(section_key)
    ]


def validate_blanks(code1: str, variables: str) -> Tuple[List[str], List[str]]:
    """Check that the ``{{_exprN}}`` blanks in @code1 line up with @variables.

    Args:
        code1: Text of the @code1 section. Empty/None yields no findings.
        variables: Text of the @variables section. When empty/None only the
            blank numbering is checked.

    Returns:
        ``(errors, warnings)``. Errors cover: no blanks at all, blank numbers
        that are not 1..N, and blanks without an ``exprN:`` entry. Warnings
        cover ``exprN:`` entries that no blank refers to.
    """
    errors: List[str] = []
    warnings: List[str] = []

    if not code1:
        return errors, warnings

    # Distinct blank numbers; the same blank may legitimately appear twice.
    blank_numbers = sorted({int(num) for num in re.findall(r'\{\{_expr(\d+)\}\}', code1)})

    if not blank_numbers:
        errors.append("No {{_exprN}} blanks found in @code1")
        return errors, warnings

    if blank_numbers != list(range(1, len(blank_numbers) + 1)):
        errors.append(f"Blank numbers should be consecutive starting at 1. Found: {blank_numbers}")

    if variables:
        # The lookbehind stops a longer identifier such as 'subexpr1:' from
        # counting as a definition of expr1. dict.fromkeys de-duplicates while
        # keeping first-seen order, so each extra variable is reported once.
        defined = list(dict.fromkeys(
            re.findall(r'(?<![A-Za-z0-9_])expr(\d+):', variables)
        ))

        for num in blank_numbers:
            if str(num) not in defined:
                errors.append(f"Missing 'expr{num}:' in @variables for {{{{_expr{num}}}}}")

        for var_num in defined:
            if int(var_num) not in blank_numbers:
                warnings.append(f"Variable 'expr{var_num}' defined but not used in @code1")

    return errors, warnings


def validate_code1_content(code1: str) -> Tuple[List[str], List[str]]:
    """Lint the SQL in @code1: no comments, upper-case keywords, final semicolon.

    Quoted text (single-quoted literals and double-quoted identifiers) is
    ignored when looking for comments and keywords, so a value such as
    ``'made in china--x'`` is not reported.

    Args:
        code1: Text of the @code1 section. Empty/None yields no findings.

    Returns:
        ``(errors, warnings)``. A ``--`` comment is an error; lowercase
        keywords and a missing trailing semicolon are warnings.
    """
    errors: List[str] = []
    warnings: List[str] = []

    if not code1:
        return errors, warnings

    # One left-to-right pass over comments and quoted text: a quote inside a
    # comment belongs to the comment, and '--' inside a string belongs to the
    # string.
    noise = re.compile(r"--[^\n]*|'(?:[^']|'')*'|\"(?:[^\"]|\"\")*\"")
    comment_count = sum(
        1 for match in noise.finditer(code1) if match.group(0).startswith("--")
    )
    if comment_count:
        errors.append(f"@code1 should not contain comments. Found {comment_count} comment(s)")

    # Text that is actually SQL syntax (comments and quoted text blanked out).
    scannable = noise.sub(" ", code1)

    sql_keywords = (
        'select', 'from', 'where', 'group by', 'order by', 'having',
        'join', 'inner join', 'left join', 'right join', 'full join',
        'on', 'as', 'and', 'or', 'not', 'in', 'between', 'like',
        'is null', 'is not null', 'distinct', 'union', 'limit', 'offset',
        'case', 'when', 'then', 'else', 'end', 'insert', 'update', 'delete',
        'create', 'alter', 'drop', 'with',
    )

    # Whole-word match; multi-word keywords may be separated by any run of
    # whitespace (e.g. a line break between GROUP and BY).
    lowercase_found = [
        keyword.upper()
        for keyword in sql_keywords
        if re.search(r'\b' + keyword.replace(' ', r'\s+') + r'\b', scannable)
    ]

    if lowercase_found:
        warnings.append(
            f"SQL keywords should be UPPERCASE: {', '.join(lowercase_found[:5])}"
            + ("..." if len(lowercase_found) > 5 else "")
        )

    # NOTE: no check for a missing AS in aliases is implemented here.

    if not code1.rstrip().endswith(';'):
        warnings.append("SQL query should end with semicolon (;)")

    return errors, warnings


def validate_variables_format(variables: str) -> Tuple[List[str], List[str]]:
    """Warn about unquoted answers in the @variables section.

    Only the first ``- answer`` line directly under each ``exprN:`` key is
    inspected; the number of answers per variable is not checked.

    Args:
        variables: Text of the @variables section. Empty/None yields no findings.

    Returns:
        ``(errors, warnings)``; ``errors`` is always empty.
    """
    errors: List[str] = []
    warnings: List[str] = []

    if not variables:
        return errors, warnings

    for var_name, answer in re.findall(r'(expr\d+):\s*\n\s*-\s*(.+)', variables):
        if not answer.startswith(("'", '"')):
            warnings.append(f"{var_name} answer should be quoted: '{answer}'")

    return errors, warnings


def validate_single_item(content: str, item_number: int) -> "ItemValidationResult":
    """
    Validate a single BlanksChallenge item.

    Parses the item with ``parse_single_item`` and runs the YAML, section,
    blank, @code1 and @variables checks on whatever parts are present.

    Args:
        content: Item markdown content
        item_number: 1-based item number

    Returns:
        ItemValidationResult; ``valid`` is True only when no errors were found
    """
    all_errors = []
    all_warnings = []

    parsed, parse_errors = parse_single_item(content)
    all_errors.extend(parse_errors)

    # `or` also covers a title key that is present but empty/None.
    title = parsed.get("title") or f"Item {item_number}"

    yaml_block = parsed.get("yaml_block")
    if yaml_block:
        yaml_errors, yaml_warnings = validate_yaml_block(yaml_block)
        all_errors.extend(yaml_errors)
        all_warnings.extend(yaml_warnings)

    all_errors.extend(validate_required_sections(parsed))

    blank_count = 0
    code1 = parsed.get("code1")
    if code1:
        blank_errors, blank_warnings = validate_blanks(code1, parsed.get("variables"))
        all_errors.extend(blank_errors)
        all_warnings.extend(blank_warnings)
        # Counts every placeholder occurrence, including repeats.
        blank_count = len(re.findall(r'\{\{_expr\d+\}\}', code1))

        code_errors, code_warnings = validate_code1_content(code1)
        all_errors.extend(code_errors)
        all_warnings.extend(code_warnings)

    variables = parsed.get("variables")
    if variables:
        var_errors, var_warnings = validate_variables_format(variables)
        all_errors.extend(var_errors)
        all_warnings.extend(var_warnings)

    return ItemValidationResult(
        valid=not all_errors,
        title=title,
        item_number=item_number,
        errors=all_errors,
        warnings=all_warnings,
        blank_count=blank_count
    )


# ============================================================================
# MAIN VALIDATOR
# ============================================================================

def validate_document(content: str) -> "DocumentValidationResult":
    """
    Validate a complete BlanksChallenge document with multiple items.

    Args:
        content: Full markdown content

    Returns:
        DocumentValidationResult; ``valid`` requires every item to be valid
        and no document-level errors
    """
    document_errors = []

    header, remaining_content = parse_document_header(content)
    # `or` also covers a title key that is present but empty/None.
    document_title = header.get("title") or "Untitled"

    if not header.get("title"):
        document_errors.append("Missing document 'title:' in front matter")

    items_content = split_items(remaining_content)

    if not items_content:
        document_errors.append("No items found in document")
        return DocumentValidationResult(
            valid=False,
            message="❌ No items found in document",
            document_title=document_title,
            document_errors=document_errors
        )

    item_results = [
        validate_single_item(item_content, i)
        for i, item_content in enumerate(items_content, start=1)
    ]

    failed_count = sum(1 for item in item_results if not item.valid)
    all_valid = failed_count == 0 and not document_errors
    total_blanks = sum(item.blank_count for item in item_results)

    if all_valid:
        message = f"✅ Validation passed: {len(item_results)} item(s), {total_blanks} total blanks"
    else:
        # Name every source of failure so that a document-level problem is not
        # reported as "0/N item(s) have errors".
        reasons = []
        if failed_count:
            reasons.append(f"{failed_count}/{len(item_results)} item(s) have errors")
        if document_errors:
            reasons.append(f"{len(document_errors)} document-level error(s)")
        message = "❌ Validation failed: " + "; ".join(reasons)

    return DocumentValidationResult(
        valid=all_valid,
        message=message,
        document_title=document_title,
        item_count=len(item_results),
        items=item_results,
        document_errors=document_errors
    )


# ============================================================================
# CLI
# ============================================================================

def main():
    """CLI entry point.

    Reads the markdown file named by the first command-line argument, prints
    the validation report and exits with status 0 when the document is valid,
    1 otherwise (including a missing argument or an unreadable path).
    """
    if len(sys.argv) < 2:
        print("SQL BlanksChallenge Validator")
        print("")
        print("Usage:")
        print(" python sql_coding_validator.py <file_path>")
        print("")
        print("Example:")
        print(" python sql_coding_validator.py /tmp/exercise_to_validate.md")
        sys.exit(1)

    file_path = Path(sys.argv[1])

    # is_file() also rejects directories, which exists() would let through.
    if not file_path.is_file():
        print(f"Error: File not found: {file_path}")
        sys.exit(1)

    # Explicit encoding: do not depend on the platform default.
    content = file_path.read_text(encoding="utf-8")

    result = validate_document(content)

    print(result.message)
    print(f"Document: {result.document_title}")
    print("")

    if result.document_errors:
        print("Document Errors:")
        for error in result.document_errors:
            print(f" ❌ {error}")
        print("")

    for item in result.items:
        status = "✅" if item.valid else "❌"
        print(f"{status} Item {item.item_number}: \"{item.title}\" — {item.blank_count} blank(s)")

        for error in item.errors:
            print(f" ❌ {error}")

        for warning in item.warnings:
            print(f" ⚠️ {warning}")

    sys.exit(0 if result.valid else 1)


if __name__ == "__main__":
    main()