From 0673fd81ff38da18ba2f54d93b32d21368f5803c Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 10:48:27 +0200 Subject: [PATCH 01/66] Add experimental branch installation option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a release channel selection during installation allowing users to choose between stable (default) and experimental branches. When experimental is selected: - Displays feedback request with links to issues/discussions - Re-downloads install.sh from the experimental branch - Re-executes with --experimental flag (preserving other args) Features: - New --experimental flag and DEVKIT_CHANNEL env var - Interactive radio selector for channel choice - Channel shown in summary and completion messages - Feedback reminder at end of experimental installs Closes #468 πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 76 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/install.sh b/install.sh index 114cb2c4..64c322ba 100644 --- a/install.sh +++ b/install.sh @@ -53,6 +53,7 @@ USER_TOOLS="" USER_MCP_PATH="${DEVKIT_MCP_PATH:-}" SKILLS_PROFILE="${DEVKIT_SKILLS_PROFILE:-}" USER_SKILLS="${DEVKIT_SKILLS:-}" +CHANNEL="${DEVKIT_CHANNEL:-stable}" # stable or experimental # Convert string booleans from env vars to actual booleans [ "$FORCE" = "true" ] || [ "$FORCE" = "1" ] && FORCE=true || FORCE=false @@ -135,6 +136,7 @@ while [ $# -gt 0 ]; do --list-skills) LIST_SKILLS=true; shift ;; --silent) SILENT=true; shift ;; --tools) USER_TOOLS="$2"; shift 2 ;; + --experimental) CHANNEL="experimental"; shift ;; -f|--force) FORCE=true; shift ;; -h|--help) echo "Databricks AI Dev Kit Installer" @@ -153,6 +155,7 @@ while [ $# -gt 0 ]; do echo " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" echo " --skills LIST Comma-separated skill names to install (overrides profile)" echo " --list-skills List available skills and profiles, then exit" + echo " --experimental Install from experimental branch (early access features)" echo " -f, --force Force reinstall" echo " -h, --help Show this help" echo "" @@ -166,6 +169,7 @@ while [ $# -gt 0 ]; do echo " DEVKIT_SKILLS_PROFILE Comma-separated skill profiles" echo " DEVKIT_SKILLS Comma-separated skill names" echo " DEVKIT_SILENT Set to 'true' for silent mode" + echo " DEVKIT_CHANNEL 'stable' (default) or 'experimental'" echo " AIDEVKIT_HOME Installation directory (default: ~/.ai-dev-kit)" echo "" echo "Examples:" @@ -1509,6 +1513,7 @@ summary() { echo "" echo -e "${G}${B}Installation complete!${N}" echo "────────────────────────────────" + [ "$CHANNEL" = "experimental" ] && msg "Channel: ${Y}experimental πŸ§ͺ${N}" msg "Location: $INSTALL_DIR" msg "Scope: $SCOPE" msg "Tools: $(echo "$TOOLS" | tr ' ' ', ')" @@ -1537,6 +1542,15 @@ summary() { step=$((step + 1)) msg "${step}. Try: \"List my SQL warehouses\"" echo "" + if [ "$CHANNEL" = "experimental" ]; then + echo -e " ${Y}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${N}" + echo -e " ${B}πŸ§ͺ You're using the experimental channel${N}" + echo -e " ${Y}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${N}" + echo "" + msg "Thank you for testing early features! Your feedback helps us improve." 
+ msg "Report issues: ${BL}https://github.com/databricks-solutions/ai-dev-kit/issues${N}" + echo "" + fi fi } @@ -1609,6 +1623,64 @@ prompt_scope() { SCOPE="${values[$selected]}" } +# Prompt for release channel (stable vs experimental) +prompt_channel() { + # Skip if already set via --experimental flag or env var + if [ "$CHANNEL" = "experimental" ]; then + return + fi + + # Skip in silent mode or non-interactive + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + return + fi + + echo "" + echo -e " ${B}Select release channel${N}" + + local selected + selected=$(radio_select \ + "Stable|stable|on|Latest stable release (recommended)" \ + "Experimental|experimental|off|Early access to new features β€” help us test!" \ + ) + + CHANNEL="$selected" + + # If experimental was selected, re-download and re-exec from experimental branch + if [ "$CHANNEL" = "experimental" ]; then + echo "" + echo -e " ${Y}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${N}" + echo -e " ${B}πŸ§ͺ Experimental Channel${N}" + echo -e " ${Y}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${N}" + echo "" + echo -e " You're about to install the ${B}experimental${N} version of AI Dev Kit." + echo -e " This includes early access features that may change or break." + echo "" + echo -e " ${B}We'd love your feedback!${N}" + echo -e " Report issues: ${BL}https://github.com/databricks-solutions/ai-dev-kit/issues${N}" + echo -e " Discussions: ${BL}https://github.com/databricks-solutions/ai-dev-kit/discussions${N}" + echo "" + echo -e " ${D}Downloading installer from experimental branch...${N}" + echo "" + + # Build the command with all current flags preserved + local args="--experimental" + [ "$FORCE" = true ] && args="$args --force" + [ "$SILENT" = true ] && args="$args --silent" + [ -n "$USER_TOOLS" ] && args="$args --tools $USER_TOOLS" + [ -n "$USER_MCP_PATH" ] && args="$args --mcp-path $USER_MCP_PATH" + [ -n "$SKILLS_PROFILE" ] && args="$args --skills-profile $SKILLS_PROFILE" + [ -n "$USER_SKILLS" ] && args="$args --skills $USER_SKILLS" + [ "$SCOPE_EXPLICIT" = true ] && [ "$SCOPE" = "global" ] && args="$args --global" + [ "$PROFILE" != "DEFAULT" ] && args="$args --profile $PROFILE" + [ "$INSTALL_MCP" = false ] && args="$args --skills-only" + [ "$INSTALL_SKILLS" = false ] && args="$args --mcp-only" + + # Download and execute the experimental installer + exec bash <(curl -fsSL "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.sh") $args + fi +} + # Prompt to run auth prompt_auth() { if [ "$SILENT" = true ] || [ ! 
-e /dev/tty ]; then @@ -1664,6 +1736,9 @@ main() { echo "────────────────────────────────" fi + # ── Step 1: Release channel selection (may re-exec from experimental branch) ── + prompt_channel + # Check dependencies step "Checking prerequisites" check_deps @@ -1717,6 +1792,7 @@ main() { echo "" echo -e " ${B}Summary${N}" echo -e " ────────────────────────────────────" + [ "$CHANNEL" = "experimental" ] && echo -e " Channel: ${Y}experimental πŸ§ͺ${N}" echo -e " Tools: ${G}$(echo "$TOOLS" | tr ' ' ', ')${N}" echo -e " Profile: ${G}${PROFILE}${N}" echo -e " Scope: ${G}${SCOPE}${N}" From 82bb94415e4989cee7b8cc15fd3dd6e8774f778b Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 11:03:00 +0200 Subject: [PATCH 02/66] Add release workflow with experimental branch sync MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Automates releases while ensuring the experimental branch stays in sync: - Triggers on VERSION file changes on main - Checks if experimental is behind main - Creates sync PR (main β†’ experimental) if needed - Auto-merges if no conflicts, blocks release if conflicts exist - Clear error messages with PR links when blocked - Creates git tag and GitHub Release when sync is complete Part of #468 πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/workflows/release.yml | 237 ++++++++++++++++++++++++++++++++++ 1 file changed, 237 insertions(+) create mode 100644 .github/workflows/release.yml diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..0d96ed39 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,237 @@ +name: Release + +# Triggers when VERSION file is updated on main (typically via a version bump PR) +on: + push: + branches: [main] + paths: + - 'VERSION' + +permissions: + contents: write + pull-requests: write + +env: + SYNC_PR_BRANCH: sync-main-to-experimental + EXPERIMENTAL_BRANCH: experimental + +jobs: + sync-experimental: + name: Sync Experimental Branch + runs-on: ubuntu-latest + outputs: + synced: ${{ steps.check-sync.outputs.synced }} + pr_number: ${{ steps.find-or-create-pr.outputs.pr_number }} + pr_url: ${{ steps.find-or-create-pr.outputs.pr_url }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Check if experimental branch exists + id: check-experimental + run: | + if git ls-remote --heads origin ${{ env.EXPERIMENTAL_BRANCH }} | grep -q ${{ env.EXPERIMENTAL_BRANCH }}; then + echo "exists=true" >> $GITHUB_OUTPUT + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "::notice::Experimental branch does not exist yet. It will be created from main." 
+ fi + + - name: Create experimental branch if missing + if: steps.check-experimental.outputs.exists == 'false' + run: | + git checkout -b ${{ env.EXPERIMENTAL_BRANCH }} + git push origin ${{ env.EXPERIMENTAL_BRANCH }} + echo "::notice::Created '${{ env.EXPERIMENTAL_BRANCH }}' branch from main" + + - name: Check if experimental is in sync with main + id: check-sync + if: steps.check-experimental.outputs.exists == 'true' + run: | + git fetch origin ${{ env.EXPERIMENTAL_BRANCH }} + + # Check if main is ahead of experimental + BEHIND_COUNT=$(git rev-list --count origin/${{ env.EXPERIMENTAL_BRANCH }}..origin/main) + + if [ "$BEHIND_COUNT" -eq 0 ]; then + echo "synced=true" >> $GITHUB_OUTPUT + echo "::notice::βœ… Experimental branch is in sync with main" + else + echo "synced=false" >> $GITHUB_OUTPUT + echo "behind_count=$BEHIND_COUNT" >> $GITHUB_OUTPUT + echo "::warning::Experimental branch is $BEHIND_COUNT commit(s) behind main" + fi + + - name: Find or create sync PR + id: find-or-create-pr + if: steps.check-sync.outputs.synced == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Check for existing open PR + EXISTING_PR=$(gh pr list --head main --base ${{ env.EXPERIMENTAL_BRANCH }} --state open --json number,url --jq '.[0]') + + if [ -n "$EXISTING_PR" ]; then + PR_NUMBER=$(echo "$EXISTING_PR" | jq -r '.number') + PR_URL=$(echo "$EXISTING_PR" | jq -r '.url') + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "pr_url=$PR_URL" >> $GITHUB_OUTPUT + echo "pr_existed=true" >> $GITHUB_OUTPUT + echo "::notice::Found existing sync PR #$PR_NUMBER" + else + # Create new PR + PR_URL=$(gh pr create \ + --title "πŸ”„ Sync: merge main into experimental" \ + --body "## Auto-generated sync PR + +This PR keeps the \`experimental\` branch up to date with \`main\`. + +### Why is this needed? +The experimental branch allows users to opt-in to early access features. It must stay in sync with main to ensure experimental users get all stable fixes and features. + +### What to do? 
+- **If this PR has no conflicts**: It will be auto-merged by the release workflow +- **If this PR has conflicts**: Please resolve them manually, then the next release attempt will succeed + +--- +*This PR was automatically created by the release workflow.*" \ + --head main \ + --base ${{ env.EXPERIMENTAL_BRANCH }}) + + PR_NUMBER=$(echo "$PR_URL" | grep -oE '[0-9]+$') + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "pr_url=$PR_URL" >> $GITHUB_OUTPUT + echo "pr_existed=false" >> $GITHUB_OUTPUT + echo "::notice::Created sync PR #$PR_NUMBER: $PR_URL" + fi + + - name: Check PR mergeability + id: check-mergeable + if: steps.check-sync.outputs.synced == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + # Wait a moment for GitHub to compute mergeability + sleep 5 + + # Get PR mergeable state + MERGEABLE=$(gh pr view "$PR_NUMBER" --json mergeable --jq '.mergeable') + + echo "mergeable=$MERGEABLE" >> $GITHUB_OUTPUT + + if [ "$MERGEABLE" = "MERGEABLE" ]; then + echo "::notice::βœ… Sync PR #$PR_NUMBER is mergeable (no conflicts)" + elif [ "$MERGEABLE" = "CONFLICTING" ]; then + echo "::error::❌ Sync PR #$PR_NUMBER has merge conflicts that must be resolved manually" + else + echo "::warning::⏳ Sync PR #$PR_NUMBER mergeability is unknown (state: $MERGEABLE)" + fi + + - name: Auto-merge sync PR + id: auto-merge + if: steps.check-sync.outputs.synced == 'false' && steps.check-mergeable.outputs.mergeable == 'MERGEABLE' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + echo "Auto-merging sync PR #$PR_NUMBER..." + gh pr merge "$PR_NUMBER" --merge --admin -t "Sync main into experimental (auto-merge)" + + echo "merged=true" >> $GITHUB_OUTPUT + echo "::notice::βœ… Successfully merged sync PR #$PR_NUMBER" + + - name: Fail if conflicts exist + if: steps.check-sync.outputs.synced == 'false' && steps.check-mergeable.outputs.mergeable == 'CONFLICTING' + run: | + PR_URL="${{ steps.find-or-create-pr.outputs.pr_url }}" + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + echo "" + echo "╔══════════════════════════════════════════════════════════════════╗" + echo "β•‘ ❌ RELEASE BLOCKED β•‘" + echo "╠══════════════════════════════════════════════════════════════════╣" + echo "β•‘ β•‘" + echo "β•‘ The experimental branch has merge conflicts with main. β•‘" + echo "β•‘ β•‘" + echo "β•‘ Before releasing, you must: β•‘" + echo "β•‘ β•‘" + echo "β•‘ 1. Resolve conflicts in PR #$PR_NUMBER" + echo "β•‘ $PR_URL" + echo "β•‘ β•‘" + echo "β•‘ 2. Merge the PR β•‘" + echo "β•‘ β•‘" + echo "β•‘ 3. 
Re-trigger this workflow (push to VERSION or re-run) β•‘" + echo "β•‘ β•‘" + echo "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•" + echo "" + + exit 1 + + create-release: + name: Create Release + runs-on: ubuntu-latest + needs: sync-experimental + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Read version + id: version + run: | + VERSION=$(cat VERSION | tr -d '[:space:]') + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "::notice::Releasing version $VERSION" + + - name: Check if tag already exists + id: check-tag + run: | + VERSION="${{ steps.version.outputs.version }}" + if git rev-parse "v$VERSION" >/dev/null 2>&1; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "::warning::Tag v$VERSION already exists, skipping release creation" + else + echo "exists=false" >> $GITHUB_OUTPUT + fi + + - name: Create and push tag + if: steps.check-tag.outputs.exists == 'false' + run: | + VERSION="${{ steps.version.outputs.version }}" + git tag "v$VERSION" + git push origin "v$VERSION" + echo "::notice::Created tag v$VERSION" + + - name: Create GitHub Release + if: steps.check-tag.outputs.exists == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + VERSION="${{ steps.version.outputs.version }}" + + gh release create "v$VERSION" \ + --title "v$VERSION" \ + --generate-notes \ + --latest + + echo "" + echo "╔══════════════════════════════════════════════════════════════════╗" + echo "β•‘ βœ… RELEASE SUCCESSFUL β•‘" + echo "╠══════════════════════════════════════════════════════════════════╣" + echo "β•‘ β•‘" + echo "β•‘ Version: v$VERSION" + echo "β•‘ β•‘" + echo "β•‘ β€’ GitHub Release created β•‘" + echo "β•‘ β€’ Experimental branch is in sync β•‘" + echo "β•‘ β•‘" + echo "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•" From b2b81c7a4947cc8a13c888013ed6d5640f0f3397 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 11:47:45 +0200 Subject: [PATCH 03/66] Add Claude Code prompt for resolving merge conflicts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When release is blocked due to conflicts between main and experimental, the error message now includes: - Step-by-step instructions for resolution - A ready-to-use Claude Code prompt that: - First analyzes commits in experimental to understand intent - Reviews conflicted files from both sides - Resolves by keeping both changes when possible - Asks for human confirmation when resolution isn't obvious πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .github/workflows/release.yml | 100 ++++++++++++++++++++++++++++------ 1 file changed, 84 insertions(+), 16 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0d96ed39..a47797ae 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -157,23 +157,91 @@ The experimental branch allows users to opt-in to early access features. 
It must PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" echo "" - echo "╔══════════════════════════════════════════════════════════════════╗" - echo "β•‘ ❌ RELEASE BLOCKED β•‘" - echo "╠══════════════════════════════════════════════════════════════════╣" - echo "β•‘ β•‘" - echo "β•‘ The experimental branch has merge conflicts with main. β•‘" - echo "β•‘ β•‘" - echo "β•‘ Before releasing, you must: β•‘" - echo "β•‘ β•‘" - echo "β•‘ 1. Resolve conflicts in PR #$PR_NUMBER" - echo "β•‘ $PR_URL" - echo "β•‘ β•‘" - echo "β•‘ 2. Merge the PR β•‘" - echo "β•‘ β•‘" - echo "β•‘ 3. Re-trigger this workflow (push to VERSION or re-run) β•‘" - echo "β•‘ β•‘" - echo "β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•" + echo "❌ RELEASE BLOCKED - Merge conflicts detected" + echo "" + echo "πŸ“‹ Sync PR: $PR_URL" + echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo " HOW TO FIX" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" + echo "" + echo "Open Claude Code in the ai-dev-kit repo and paste this prompt:" + echo "" + cat << 'PROMPT' +━━━━━━━━━━━━━━━━━━━━━ COPY BELOW THIS LINE ━━━━━━━━━━━━━━━━━━━━━ + +Merge main into experimental and resolve any conflicts. + +## Step 1: Start the merge + +Run: + git checkout experimental + git pull + git merge origin/main + +## Step 2: Understand what's in experimental (IMPORTANT - do this BEFORE resolving) + +I need you to fully understand the experimental branch before touching any conflicts. + +1. List commits only in experimental: + git log main..experimental --oneline + +2. For each commit, read the actual changes (not just the message): + git show --stat + Then read the key files if needed. + +3. Give me a detailed summary: + - What experimental features exist (describe each one) + - What files are experimental-only vs modified from main + - The intent/purpose of these experimental changes + +Do NOT proceed to conflict resolution until you've given me this summary. + +## Step 3: Analyze each conflict + +Run: git diff --name-only --diff-filter=U + +For each conflicted file: +1. Show me the conflict markers (the <<<<<<< ======= >>>>>>> sections) +2. Explain what MAIN is changing (likely: bugfix, stable feature, refactor) +3. Explain what EXPERIMENTAL is changing (likely: early-access feature) +4. Explain if these changes are independent or overlapping + +## Step 4: Resolve conflicts + +Apply these rules: +- **Independent changes** (different parts of file): Keep BOTH - main's updates AND experimental's features +- **Compatible changes** (e.g., main fixed a bug in code experimental also modified): Apply main's fix within experimental's version +- **Conflicting intent**: STOP and ask me. Based on your analysis from Step 2, explain the tradeoff and give me clear options to choose from. + +## Step 5: Complete the merge + +After ALL conflicts are resolved, commit with a detailed message: + + git add . + git commit -m "Merge main into experimental + + Kept from main: + - [list bugfixes and features from main] + + Preserved from experimental: + - [list experimental features preserved] + + Resolutions: + - [list any non-trivial merge decisions] + " + git push origin experimental + +## Step 6: Confirm + +Tell me when the merge is pushed so I can re-run the release workflow. 
+ +━━━━━━━━━━━━━━━━━━━━━ COPY ABOVE THIS LINE ━━━━━━━━━━━━━━━━━━━━━ +PROMPT + echo "" + echo "After the merge is pushed, re-run this workflow." echo "" + echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" exit 1 From 72f57aa6f4c7e5e5079c65ff8ecece1c4b6fbd39 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 09:35:27 +0200 Subject: [PATCH 04/66] Replace MCP tools with CLI commands in agent-bricks, dashboards, and apps skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - databricks-agent-bricks: Use CLI for KA/Genie, add manager.py for MAS operations - databricks-aibi-dashboards: Use databricks lakeview CLI commands - databricks-app-python: Update to use CLI-based deployment This is part of the effort to simplify skills by removing MCP tool dependencies and using Databricks CLI directly where possible. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../1-knowledge-assistants.md | 87 +++-- .../2-supervisor-agents.md | 100 +++-- .../databricks-agent-bricks/SKILL.md | 165 ++++---- .../databricks-agent-bricks/manager.py | 351 ++++++++++++++++++ .../databricks-aibi-dashboards/3-examples.md | 39 +- .../databricks-aibi-dashboards/SKILL.md | 86 +++-- .../databricks-app-python/4-deployment.md | 6 +- .../databricks-app-python/6-mcp-approach.md | 76 ++-- .../databricks-app-python/SKILL.md | 4 +- 9 files changed, 644 insertions(+), 270 deletions(-) create mode 100644 databricks-skills/databricks-agent-bricks/manager.py diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index 3adff469..f4872332 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -30,17 +30,30 @@ Before creating a KA, you need documents in a Unity Catalog Volume: ## Creating a Knowledge Assistant -Use the `manage_ka` tool with `action="create_or_update"`: - -- `name`: "HR Policy Assistant" -- `volume_path`: "/Volumes/my_catalog/my_schema/raw_data/hr_docs" -- `description`: "Answers questions about HR policies and procedures" -- `instructions`: "Be helpful and always cite the specific policy document when answering. If you're unsure, say so." +Use the CLI to create a Knowledge Assistant: + +```bash +# Step 1: Create the Knowledge Assistant +databricks knowledge-assistants create-knowledge-assistant \ + "HR Policy Assistant" \ + "Answers questions about HR policies and procedures" + +# Step 2: Add a knowledge source (volume with documents) +databricks knowledge-assistants create-knowledge-source \ + "knowledge-assistants/{ka_id}" \ + "HR Documents" \ + "HR policy documents" \ + "VOLUME" \ + --volume-config '{"volume_id": "/Volumes/my_catalog/my_schema/raw_data/hr_docs"}' + +# Step 3: Sync (index) the knowledge sources +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" +``` -The tool will: -1. Create the KA with the specified volume as a knowledge source -2. Scan the volume for JSON files with example questions (from PDF generation) -3. Queue examples to be added once the endpoint is ready +The CLI will: +1. Create the KA with the specified configuration +2. Create a knowledge source pointing to your volume +3. 
Trigger indexing of the documents ## Provisioning Timeline @@ -52,9 +65,11 @@ After creation, the KA endpoint needs to provision: | `ONLINE` | Ready to use | - | | `OFFLINE` | Not currently running | - | -Use `manage_ka` with `action="get"` to check the status: +Use the CLI to check the status: -- `tile_id`: "" +```bash +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +``` ## Adding Example Questions @@ -76,7 +91,7 @@ These are automatically added when `add_examples_from_volume=true` (default). ### Manual -Examples can also be specified in the `manage_ka` create_or_update call if needed. +Examples can also be added manually via the Databricks UI or SDK. ## Best Practices @@ -101,7 +116,10 @@ Be helpful and professional. When answering: To update the indexed documents: 1. Add/remove/modify files in the volume -2. Call `manage_ka` with `action="create_or_update"`, the same name and `tile_id` +2. Trigger a sync to re-index: + ```bash + databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" + ``` 3. The KA will re-index the updated content ## Example Workflow @@ -126,40 +144,39 @@ Knowledge Assistants can be used as agents in a Supervisor Agent (formerly Multi ### Finding the Endpoint Name -Use `manage_ka` with `action="get"` to retrieve the KA details. The response includes: -- `tile_id`: The unique identifier for the KA -- `name`: The KA name (sanitized) -- `endpoint_status`: Current status (ONLINE, PROVISIONING, etc.) +Use the CLI to retrieve the KA details: -The endpoint name follows this pattern: `ka-{tile_id}-endpoint` +```bash +# List all KAs to find the one you want +databricks knowledge-assistants list-knowledge-assistants -### Finding a KA by Name +# Get details for a specific KA +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +``` -If you know the KA name but not the tile_id, use `manage_ka` with `action="find_by_name"`: +The response includes: +- `name`: The resource name (knowledge-assistants/{ka_id}) +- `display_name`: The KA display name +- Status information -```python -manage_ka(action="find_by_name", name="HR_Policy_Assistant") -# Returns: {"found": True, "tile_id": "01abc...", "name": "HR_Policy_Assistant", "endpoint_name": "ka-01abc...-endpoint"} -``` +The endpoint name follows this pattern: `ka-{tile_id}-endpoint` ### Example: Adding KA to Supervisor Agent -```python -# First, find the KA -manage_ka(action="find_by_name", name="HR_Policy_Assistant") +```bash +# First, list KAs to find the tile_id +databricks knowledge-assistants list-knowledge-assistants -# Then use the tile_id in a Supervisor Agent -manage_mas( - action="create_or_update", - name="Support_MAS", - agents=[ +# Then use the tile_id to create a Supervisor Agent with manager.py +python manager.py create_mas "Support_MAS" '{ + "agents": [ { "name": "hr_agent", - "ka_tile_id": "", + "ka_tile_id": "", "description": "Answers HR policy questions from the employee handbook" } ] -) +}' ``` ## Troubleshooting diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index 7121bfcf..eb1f4ddc 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -40,7 +40,7 @@ Before creating a Supervisor Agent, you need agents of one or both types: - Existing Genie spaces for SQL-based data exploration - Great for analytics, metrics, and 
data-driven questions - No separate endpoint deployment required - reference the space directly -- To find a Genie space by name, use `find_genie_by_name(display_name="My Genie")` +- To find a Genie space, use `databricks genie list-spaces` - **Note**: There is NO system table for Genie spaces - do not try to query `system.ai.genie_spaces` ## Unity Catalog Functions @@ -135,13 +135,13 @@ Reference the UC Connection using the `connection_name` field: ### Complete Example: Multi-System Supervisor -Example showing integration of Genie, KA, and external MCP: +Example showing integration of Genie, KA, and external MCP using `manager.py`: -```python -manage_mas( - action="create_or_update", - name="AP_Invoice_Supervisor", - agents=[ +```bash +python manager.py create_mas "AP_Invoice_Supervisor" '{ + "description": "AP automation assistant with analytics, policy guidance, and operational actions", + "instructions": "Route queries as follows:\n- Data questions (invoice counts, spend analysis, vendor metrics) β†’ billing_analyst\n- Policy questions (thresholds, SLAs, compliance rules) β†’ policy_expert\n- Actions (approve, reject, flag, search, workflows) β†’ ap_operations\n\nWhen a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations.", + "agents": [ { "name": "billing_analyst", "genie_space_id": "01abc123...", @@ -155,22 +155,10 @@ manage_mas( { "name": "ap_operations", "connection_name": "ap_invoice_mcp", - "description": ( - "Execute AP operations: approve/reject/flag invoices, search invoice details, " - "get vendor summaries, trigger batch workflows. Use for ANY action or write operation." - ) + "description": "Execute AP operations: approve/reject/flag invoices, search invoice details, get vendor summaries, trigger batch workflows. Use for ANY action or write operation." } - ], - description="AP automation assistant with analytics, policy guidance, and operational actions", - instructions=""" - Route queries as follows: - - Data questions (invoice counts, spend analysis, vendor metrics) β†’ billing_analyst - - Policy questions (thresholds, SLAs, compliance rules) β†’ policy_expert - - Actions (approve, reject, flag, search, workflows) β†’ ap_operations - - When a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations. - """ -) + ] +}' ``` ### MCP Connection Testing @@ -193,31 +181,31 @@ SELECT http_request( ## Creating a Supervisor Agent -Use the `manage_mas` tool with `action="create_or_update"`: +**NO CLI AVAILABLE** - Use the `manager.py` script in this skill folder: -- `name`: "Customer Support MAS" -- `agents`: - ```json - [ - { - "name": "policy_agent", - "ka_tile_id": "f32c5f73-466b-4798-b3a0-5396b5ece2a5", - "description": "Answers questions about company policies and procedures from indexed documents" - }, - { - "name": "usage_analytics", - "genie_space_id": "01abc123-def4-5678-90ab-cdef12345678", - "description": "Answers data questions about usage metrics, trends, and statistics" - }, - { - "name": "custom_agent", - "endpoint_name": "my-custom-endpoint", - "description": "Handles specialized queries via custom model endpoint" - } - ] - ``` -- `description`: "Routes customer queries to specialized support agents" -- `instructions`: "Analyze the user's question and route to the most appropriate agent. If unclear, ask for clarification." 
+```bash +python manager.py create_mas "Customer Support MAS" '{ + "description": "Routes customer queries to specialized support agents", + "instructions": "Analyze the user'\''s question and route to the most appropriate agent. If unclear, ask for clarification.", + "agents": [ + { + "name": "policy_agent", + "ka_tile_id": "f32c5f73-466b-4798-b3a0-5396b5ece2a5", + "description": "Answers questions about company policies and procedures from indexed documents" + }, + { + "name": "usage_analytics", + "genie_space_id": "01abc123-def4-5678-90ab-cdef12345678", + "description": "Answers data questions about usage metrics, trends, and statistics" + }, + { + "name": "custom_agent", + "endpoint_name": "my-custom-endpoint", + "description": "Handles specialized queries via custom model endpoint" + } + ] +}' +``` This example shows mixing Knowledge Assistants (policy_agent), Genie spaces (usage_analytics), and custom endpoints (custom_agent). @@ -237,8 +225,8 @@ Each agent in the `agents` list needs: **Note**: Provide exactly one of: `ka_tile_id`, `genie_space_id`, `endpoint_name`, `uc_function_name`, or `connection_name`. -To find a KA tile_id, use `manage_ka(action="find_by_name", name="Your KA Name")`. -To find a Genie space_id, use `find_genie_by_name(display_name="Your Genie Name")`. +To find a KA tile_id, use `databricks knowledge-assistants list-knowledge-assistants`. +To find a Genie space_id, use `databricks genie list-spaces`. ### Writing Good Descriptions @@ -264,7 +252,7 @@ After creation, the Supervisor Agent endpoint needs to provision: | `ONLINE` | Ready to route queries | - | | `OFFLINE` | Not currently running | - | -Use `manage_mas` with `action="get"` to check the status. +Use `python manager.py get_mas TILE_ID` to check the status. ## Adding Example Questions @@ -344,14 +332,20 @@ Consider adding a general-purpose agent for queries that don't fit elsewhere: ## Updating a Supervisor Agent -To update an existing Supervisor Agent: +To update an existing Supervisor Agent, use `manager.py`: + +```bash +# Get current state +python manager.py get_mas TILE_ID -1. **Add/remove agents**: Call `manage_mas` with `action="create_or_update"` and updated `agents` list +# Update with new configuration +python manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' +``` + +1. **Add/remove agents**: Include updated `agents` list 2. **Update descriptions**: Change agent descriptions to improve routing 3. **Modify instructions**: Update routing rules -The tool finds the existing Supervisor Agent by name and updates it. 
- ## Troubleshooting ### Queries routed to wrong agent diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 026f204a..dc4a6e87 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -44,67 +44,95 @@ Before creating Agent Bricks, ensure you have the required data: - **Existing UC HTTP Connection**: Connection configured with `is_mcp_connection: 'true'` - Agent service principal has `USE CONNECTION` privilege on the connection -## MCP Tools +## CLI Tools -### Knowledge Assistant Tool +### Knowledge Assistant CLI -**manage_ka** - Manage Knowledge Assistants (KA) -- `action`: "create_or_update", "get", "find_by_name", or "delete" -- `name`: Name for the KA (for create_or_update, find_by_name) -- `volume_path`: Path to documents (e.g., `/Volumes/catalog/schema/volume/folder`) (for create_or_update) -- `description`: (optional) What the KA does (for create_or_update) -- `instructions`: (optional) How the KA should answer (for create_or_update) -- `tile_id`: The KA tile ID (for get, delete, or update via create_or_update) -- `add_examples_from_volume`: (optional, default: true) Auto-add examples from JSON files (for create_or_update) +```bash +# List all Knowledge Assistants +databricks knowledge-assistants list-knowledge-assistants -Actions: -- **create_or_update**: Requires `name`, `volume_path`. Optionally pass `tile_id` to update. -- **get**: Requires `tile_id`. Returns tile_id, name, description, endpoint_status, knowledge_sources, examples_count. -- **find_by_name**: Requires `name` (exact match). Returns found, tile_id, name, endpoint_name, endpoint_status. Use this to look up an existing KA when you know the name but not the tile_id. -- **delete**: Requires `tile_id`. 
+# Create a Knowledge Assistant +databricks knowledge-assistants create-knowledge-assistant "My KA Name" "Description of what the KA does" -### Genie Space Tools +# Get a Knowledge Assistant by name (resource name format: knowledge-assistants/{id}) +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" + +# Update a Knowledge Assistant +databricks knowledge-assistants update-knowledge-assistant "knowledge-assistants/{ka_id}" "*" "New Name" "New Description" + +# Delete a Knowledge Assistant +databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" + +# Create a Knowledge Source (volume-based documents) +databricks knowledge-assistants create-knowledge-source "knowledge-assistants/{ka_id}" "Source Name" "Description" "VOLUME" \ + --volume-config '{"volume_id": "/Volumes/catalog/schema/volume"}' + +# List Knowledge Sources for a KA +databricks knowledge-assistants list-knowledge-sources "knowledge-assistants/{ka_id}" + +# Sync (re-index) Knowledge Sources +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" +``` + +### Genie Space CLI **For comprehensive Genie guidance, use the `databricks-genie` skill.** -Use `manage_genie` with actions: -- `create_or_update` - Create or update a Genie Space -- `get` - Get Genie Space details -- `list` - List all Genie Spaces -- `delete` - Delete a Genie Space -- `export` / `import` - For migration +```bash +# List all Genie Spaces +databricks genie list-spaces + +# Create a Genie Space +databricks genie create-space --json '{"display_name": "My Genie", "description": "...", "table_identifiers": ["catalog.schema.table"]}' + +# Get a Genie Space +databricks genie get-space SPACE_ID + +# Update a Genie Space +databricks genie update-space SPACE_ID --json '{"display_name": "New Name"}' + +# Delete (trash) a Genie Space +databricks genie trash-space SPACE_ID +``` See `databricks-genie` skill for: - Table inspection workflow - Sample question best practices - Curation (instructions, certified queries) -**IMPORTANT**: There is NO system table for Genie spaces (e.g., `system.ai.genie_spaces` does not exist). Use `manage_genie(action="list")` to find spaces. 
- -### Supervisor Agent Tool - -**manage_mas** - Manage Supervisor Agents (MAS) -- `action`: "create_or_update", "get", "find_by_name", or "delete" -- `name`: Name for the Supervisor Agent (for create_or_update, find_by_name) -- `agents`: List of agent configurations (for create_or_update), each with: - - `name`: Agent identifier (required) - - `description`: What this agent handles - critical for routing (required) - - `ka_tile_id`: Knowledge Assistant tile ID (use for document Q&A agents - recommended for KAs) - - `genie_space_id`: Genie space ID (use for SQL-based data agents) - - `endpoint_name`: Model serving endpoint name (for custom agents) - - `uc_function_name`: Unity Catalog function name in format `catalog.schema.function_name` - - `connection_name`: Unity Catalog connection name (for external MCP servers) - - Note: Provide exactly one of: `ka_tile_id`, `genie_space_id`, `endpoint_name`, `uc_function_name`, or `connection_name` -- `description`: (optional) What the Supervisor Agent does (for create_or_update) -- `instructions`: (optional) Routing instructions for the supervisor (for create_or_update) -- `tile_id`: The Supervisor Agent tile ID (for get, delete, or update via create_or_update) -- `examples`: (optional) List of example questions with `question` and `guideline` fields (for create_or_update) - -Actions: -- **create_or_update**: Requires `name`, `agents`. Optionally pass `tile_id` to update. -- **get**: Requires `tile_id`. Returns tile_id, name, description, endpoint_status, agents, examples_count. -- **find_by_name**: Requires `name` (exact match). Returns found, tile_id, name, endpoint_status, agents_count. Use this to look up an existing Supervisor Agent when you know the name but not the tile_id. -- **delete**: Requires `tile_id`. +**IMPORTANT**: There is NO system table for Genie spaces (e.g., `system.ai.genie_spaces` does not exist). Use `databricks genie list-spaces` to find spaces. + +### Supervisor Agent (MAS) + +**NO CLI AVAILABLE** - Supervisor Agents are managed via the `manager.py` script in this skill folder: + +```bash +# List all Supervisor Agents +python manager.py list_mas + +# Create a Supervisor Agent +python manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' + +# Get a Supervisor Agent by tile ID +python manager.py get_mas TILE_ID + +# Find a Supervisor Agent by name +python manager.py find_mas "My Supervisor" + +# Update a Supervisor Agent +python manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' + +# Delete a Supervisor Agent +python manager.py delete_mas TILE_ID +``` + +Agent configuration options (provide exactly one per agent): +- `ka_tile_id`: Knowledge Assistant tile ID (for document Q&A agents) +- `genie_space_id`: Genie space ID (for SQL-based data agents) +- `endpoint_name`: Model serving endpoint name (for custom agents) +- `uc_function_name`: Unity Catalog function name in format `catalog.schema.function_name` +- `connection_name`: Unity Catalog connection name (for external MCP servers) ## Typical Workflow @@ -126,7 +154,7 @@ Before creating Agent Bricks, generate the required source data: ### 2. Create the Agent Brick -Use `manage_ka(action="create_or_update", ...)` or `manage_mas(action="create_or_update", ...)` with your data sources. +Use the CLI commands above or SDK to create your Agent Bricks with data sources. ### 3. 
Wait for Provisioning @@ -149,51 +177,40 @@ For KA, if `add_examples_from_volume=true`, examples are automatically extracted ## Example: Multi-Modal Supervisor Agent -```python -manage_mas( - action="create_or_update", - name="Enterprise Support Supervisor", - agents=[ +Use `manager.py` to create a Supervisor Agent: + +```bash +python manager.py create_mas "Enterprise Support Supervisor" '{ + "description": "Comprehensive enterprise support agent", + "instructions": "Route queries as follows:\n1. Policy/procedure questions β†’ knowledge_base\n2. Data analysis requests β†’ analytics_engine\n3. Ticket classification β†’ ml_classifier", + "agents": [ { "name": "knowledge_base", "ka_tile_id": "f32c5f73-466b-...", - "description": "Answers questions about company policies, procedures, and documentation from indexed files" + "description": "Answers questions about company policies from indexed files" }, { "name": "analytics_engine", "genie_space_id": "01abc123...", - "description": "Runs SQL analytics on usage metrics, performance stats, and operational data" + "description": "Runs SQL analytics on usage metrics" }, { "name": "ml_classifier", "endpoint_name": "custom-classification-endpoint", - "description": "Classifies support tickets and predicts resolution time using custom ML model" + "description": "Classifies support tickets using custom ML model" }, { "name": "data_enrichment", "uc_function_name": "support.utils.enrich_ticket_data", - "description": "Enriches support ticket data with customer history and context" + "description": "Enriches support ticket data with customer history" }, { "name": "ticket_operations", "connection_name": "ticket_system_mcp", - "description": "Creates, updates, assigns, and closes support tickets in external ticketing system" + "description": "Creates and updates support tickets in external system" } - ], - description="Comprehensive enterprise support agent with knowledge retrieval, analytics, ML, data enrichment, and ticketing operations", - instructions=""" - Route queries as follows: - 1. Policy/procedure questions β†’ knowledge_base - 2. Data analysis requests β†’ analytics_engine - 3. Ticket classification β†’ ml_classifier - 4. Customer context lookups β†’ data_enrichment - 5. Ticket creation/updates β†’ ticket_operations - - If a query spans multiple domains, chain agents: - - First gather information (analytics_engine or knowledge_base) - - Then take action (ticket_operations) - """ -) + ] +}' ``` ## Related Skills diff --git a/databricks-skills/databricks-agent-bricks/manager.py b/databricks-skills/databricks-agent-bricks/manager.py new file mode 100644 index 00000000..719e57bc --- /dev/null +++ b/databricks-skills/databricks-agent-bricks/manager.py @@ -0,0 +1,351 @@ +#!/usr/bin/env python3 +""" +Supervisor Agent (MAS) Manager - CLI interface for MAS operations. 
+ +Usage: + python manager.py create_mas "Name" '{"agents": [...], "description": "...", "instructions": "..."}' + python manager.py get_mas TILE_ID + python manager.py find_mas "Name" + python manager.py delete_mas TILE_ID + python manager.py list_mas + +Requires: databricks-tools-core package +""" + +import json +import sys +from typing import Any, Dict, List, Optional + +from databricks_tools_core.agent_bricks import AgentBricksManager, EndpointStatus + + +def _get_manager() -> AgentBricksManager: + """Get AgentBricksManager instance.""" + return AgentBricksManager() + + +def _build_agent_list(agents: List[Dict[str, str]]) -> List[Dict[str, Any]]: + """Build agent list for API from simplified config.""" + agent_list = [] + for agent in agents: + agent_name = agent.get("name", "") + agent_description = agent.get("description", "") + + agent_config = { + "name": agent_name, + "description": agent_description, + } + + if agent.get("genie_space_id"): + agent_config["agent_type"] = "genie" + agent_config["genie_space"] = {"id": agent.get("genie_space_id")} + elif agent.get("ka_tile_id"): + ka_tile_id = agent.get("ka_tile_id") + tile_id_prefix = ka_tile_id.split("-")[0] + agent_config["agent_type"] = "serving_endpoint" + agent_config["serving_endpoint"] = {"name": f"ka-{tile_id_prefix}-endpoint"} + elif agent.get("uc_function_name"): + uc_function_name = agent.get("uc_function_name") + uc_parts = uc_function_name.split(".") + agent_config["agent_type"] = "unity_catalog_function" + agent_config["unity_catalog_function"] = { + "uc_path": { + "catalog": uc_parts[0], + "schema": uc_parts[1], + "name": uc_parts[2], + } + } + elif agent.get("connection_name"): + agent_config["agent_type"] = "external_mcp_server" + agent_config["external_mcp_server"] = {"connection_name": agent.get("connection_name")} + else: + agent_config["agent_type"] = "serving_endpoint" + agent_config["serving_endpoint"] = {"name": agent.get("endpoint_name")} + + agent_list.append(agent_config) + return agent_list + + +def create_mas( + name: str, + agents: List[Dict[str, str]], + description: str = None, + instructions: str = None, +) -> Dict[str, Any]: + """Create a new Supervisor Agent. + + Args: + name: Display name for the MAS + agents: List of agent configs, each with: + - name: Agent identifier + - description: What this agent handles (critical for routing) + - ONE OF: endpoint_name, genie_space_id, ka_tile_id, uc_function_name, connection_name + description: What this MAS does + instructions: Routing rules for the supervisor + + Returns: + Dict with tile_id, name, endpoint_status + """ + manager = _get_manager() + agent_list = _build_agent_list(agents) + + result = manager.mas_create( + name=name, + agents=agent_list, + description=description, + instructions=instructions, + ) + + mas_data = result.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + + return { + "tile_id": tile_data.get("tile_id", ""), + "name": tile_data.get("name", name), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(agents), + } + + +def get_mas(tile_id: str) -> Dict[str, Any]: + """Get a Supervisor Agent by tile ID. 
+ + Args: + tile_id: The MAS tile ID + + Returns: + Dict with tile_id, name, description, endpoint_status, agents, instructions + """ + manager = _get_manager() + result = manager.mas_get(tile_id) + + if not result: + return {"error": f"Supervisor Agent {tile_id} not found"} + + mas_data = result.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + + return { + "tile_id": tile_data.get("tile_id", tile_id), + "name": tile_data.get("name", ""), + "description": tile_data.get("description", ""), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents": mas_data.get("agents", []), + "instructions": mas_data.get("instructions", ""), + } + + +def find_mas(name: str) -> Dict[str, Any]: + """Find a Supervisor Agent by name. + + Args: + name: The display name to search for + + Returns: + Dict with found, tile_id, name, endpoint_status if found + """ + manager = _get_manager() + result = manager.mas_find_by_name(name) + + if result is None: + return {"found": False, "name": name} + + full_details = manager.mas_get(result.tile_id) + if full_details: + mas_data = full_details.get("multi_agent_supervisor", {}) + status_data = mas_data.get("status", {}) + return { + "found": True, + "tile_id": result.tile_id, + "name": result.name, + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(mas_data.get("agents", [])), + } + + return { + "found": True, + "tile_id": result.tile_id, + "name": result.name, + } + + +def update_mas( + tile_id: str, + name: str = None, + agents: List[Dict[str, str]] = None, + description: str = None, + instructions: str = None, +) -> Dict[str, Any]: + """Update an existing Supervisor Agent. + + Args: + tile_id: The MAS tile ID to update + name: New display name (optional) + agents: New agent list (optional) + description: New description (optional) + instructions: New routing instructions (optional) + + Returns: + Dict with tile_id, name, endpoint_status + """ + manager = _get_manager() + + # Get existing to merge + existing = manager.mas_get(tile_id) + if not existing: + return {"error": f"Supervisor Agent {tile_id} not found"} + + mas_data = existing.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + + # Use existing values if not provided + final_name = name or tile_data.get("name", "") + final_description = description or tile_data.get("description", "") + final_instructions = instructions or mas_data.get("instructions", "") + + if agents: + agent_list = _build_agent_list(agents) + else: + agent_list = mas_data.get("agents", []) + + result = manager.mas_update( + tile_id=tile_id, + name=final_name, + description=final_description, + instructions=final_instructions, + agents=agent_list, + ) + + updated_data = result.get("multi_agent_supervisor", {}) + updated_tile = updated_data.get("tile", {}) + updated_status = updated_data.get("status", {}) + + return { + "tile_id": updated_tile.get("tile_id", tile_id), + "name": updated_tile.get("name", final_name), + "endpoint_status": updated_status.get("endpoint_status", "UNKNOWN"), + } + + +def delete_mas(tile_id: str) -> Dict[str, Any]: + """Delete a Supervisor Agent. 
+ + Args: + tile_id: The MAS tile ID to delete + + Returns: + Dict with success status + """ + manager = _get_manager() + try: + manager.delete(tile_id) + return {"success": True, "tile_id": tile_id} + except Exception as e: + return {"success": False, "tile_id": tile_id, "error": str(e)} + + +def list_mas() -> List[Dict[str, Any]]: + """List all Supervisor Agents. + + Returns: + List of MAS summaries with tile_id, name, endpoint_status + """ + manager = _get_manager() + results = [] + + # List all tiles and filter to MAS type + tiles = manager.list_tiles() + for tile in tiles: + if tile.tile_type == "MULTI_AGENT_SUPERVISOR": + details = manager.mas_get(tile.tile_id) + if details: + mas_data = details.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + results.append({ + "tile_id": tile.tile_id, + "name": tile_data.get("name", ""), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(mas_data.get("agents", [])), + }) + + return results + + +def _print_json(data: Any) -> None: + """Print data as formatted JSON.""" + print(json.dumps(data, indent=2)) + + +def main(): + """CLI entry point.""" + if len(sys.argv) < 2: + print(__doc__) + sys.exit(1) + + command = sys.argv[1] + + if command == "create_mas": + if len(sys.argv) < 4: + print("Usage: python manager.py create_mas NAME '{\"agents\": [...], ...}'") + sys.exit(1) + name = sys.argv[2] + config = json.loads(sys.argv[3]) + result = create_mas( + name=name, + agents=config.get("agents", []), + description=config.get("description"), + instructions=config.get("instructions"), + ) + _print_json(result) + + elif command == "get_mas": + if len(sys.argv) < 3: + print("Usage: python manager.py get_mas TILE_ID") + sys.exit(1) + result = get_mas(sys.argv[2]) + _print_json(result) + + elif command == "find_mas": + if len(sys.argv) < 3: + print("Usage: python manager.py find_mas NAME") + sys.exit(1) + result = find_mas(sys.argv[2]) + _print_json(result) + + elif command == "update_mas": + if len(sys.argv) < 4: + print("Usage: python manager.py update_mas TILE_ID '{\"name\": ..., \"agents\": [...], ...}'") + sys.exit(1) + tile_id = sys.argv[2] + config = json.loads(sys.argv[3]) + result = update_mas( + tile_id=tile_id, + name=config.get("name"), + agents=config.get("agents"), + description=config.get("description"), + instructions=config.get("instructions"), + ) + _print_json(result) + + elif command == "delete_mas": + if len(sys.argv) < 3: + print("Usage: python manager.py delete_mas TILE_ID") + sys.exit(1) + result = delete_mas(sys.argv[2]) + _print_json(result) + + elif command == "list_mas": + result = list_mas() + _print_json(result) + + else: + print(f"Unknown command: {command}") + print(__doc__) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-aibi-dashboards/3-examples.md b/databricks-skills/databricks-aibi-dashboards/3-examples.md index fe128d6b..078d3792 100644 --- a/databricks-skills/databricks-aibi-dashboards/3-examples.md +++ b/databricks-skills/databricks-aibi-dashboards/3-examples.md @@ -4,21 +4,14 @@ Production-ready templates you can adapt for your use case. 
## Basic Dashboard (NYC Taxi) -```python -import json - +```bash # Step 1: Check table schema -table_info = get_table_stats_and_schema(catalog="samples", schema="nyctaxi") +databricks experimental aitools tools discover-schema samples.nyctaxi.trips # Step 2: Test queries -execute_sql("SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, AVG(trip_distance) as avg_distance FROM samples.nyctaxi.trips") -execute_sql(""" - SELECT pickup_zip, COUNT(*) as trip_count - FROM samples.nyctaxi.trips - GROUP BY pickup_zip - ORDER BY trip_count DESC - LIMIT 10 -""") +databricks experimental aitools tools query --warehouse YOUR_WAREHOUSE_ID "SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, AVG(trip_distance) as avg_distance FROM samples.nyctaxi.trips" + +databricks experimental aitools tools query --warehouse YOUR_WAREHOUSE_ID "SELECT pickup_zip, COUNT(*) as trip_count FROM samples.nyctaxi.trips GROUP BY pickup_zip ORDER BY trip_count DESC LIMIT 10" # Step 3: Build dashboard JSON dashboard = { @@ -195,15 +188,9 @@ dashboard = { }] } -# Step 4: Deploy -result = manage_dashboard( - action="create_or_update", - display_name="NYC Taxi Dashboard", - parent_path="/Workspace/Users/me/dashboards", - serialized_dashboard=json.dumps(dashboard), - warehouse_id=manage_warehouse(action="get_best"), -) -print(result["url"]) +# Step 4: Save dashboard JSON to file, then deploy via CLI +# Save the above dashboard dict to dashboard.json, then: +# databricks lakeview create --json @dashboard.json ``` ## Dashboard with Global Filters @@ -294,12 +281,6 @@ dashboard_with_filters = { } # Deploy with filters -result = manage_dashboard( - action="create_or_update", - display_name="Sales Dashboard with Filters", - parent_path="/Workspace/Users/me/dashboards", - serialized_dashboard=json.dumps(dashboard_with_filters), - warehouse_id=manage_warehouse(action="get_best"), -) -print(result["url"]) +# Save dashboard_with_filters to dashboard_filters.json, then: +# databricks lakeview create --json @dashboard_filters.json ``` diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 99cff124..ba325d2c 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-aibi-dashboards -description: "Create Databricks AI/BI dashboards. Use when creating, updating, or deploying Lakeview dashboards. CRITICAL: You MUST test ALL SQL queries via execute_sql BEFORE deploying. Follow guidelines strictly." +description: "Create Databricks AI/BI dashboards. Use when creating, updating, or deploying Lakeview dashboards. CRITICAL: You MUST test ALL SQL queries via CLI BEFORE deploying. Follow guidelines strictly." --- # AI/BI Dashboard Skill @@ -13,60 +13,66 @@ Create Databricks AI/BI dashboards (formerly Lakeview dashboards). 
**Follow thes ``` β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ STEP 1: Get table schemas via get_table_stats_and_schema(catalog, schema) β”‚ +β”‚ STEP 1: Get table schemas via discover-schema β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ STEP 2: Write SQL queries for each dataset β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 3: TEST EVERY QUERY via execute_sql() ← DO NOT SKIP! β”‚ +β”‚ STEP 3: TEST EVERY QUERY via CLI ← DO NOT SKIP! β”‚ β”‚ - If query fails, FIX IT before proceeding β”‚ β”‚ - Verify column names match what widgets will reference β”‚ β”‚ - Verify data types are correct (dates, numbers, strings) β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ STEP 4: Build dashboard JSON using ONLY verified queries β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 5: Deploy via manage_dashboard(action="create_or_update") β”‚ +β”‚ STEP 5: Deploy via databricks lakeview create β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` **WARNING: If you deploy without testing queries, widgets WILL show "Invalid widget definition" errors!** -## Available MCP Tools - -| Tool | Description | -|------|-------------| -| `get_table_stats_and_schema` | **STEP 1**: Get table schemas for designing queries | -| `execute_sql` | **STEP 3**: Test SQL queries - MANDATORY before deployment! | -| `manage_warehouse` (action="get_best") | Get available warehouse ID | -| `manage_dashboard` | **STEP 5**: Dashboard lifecycle management (see actions below) | - -### manage_dashboard Actions - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Deploy dashboard JSON (only after validation!) 
| display_name, parent_path, serialized_dashboard, warehouse_id | -| `get` | Get dashboard details by ID | dashboard_id | -| `list` | List all dashboards | (none) | -| `delete` | Move dashboard to trash | dashboard_id | -| `publish` | Publish a dashboard | dashboard_id, warehouse_id | -| `unpublish` | Unpublish a dashboard | dashboard_id | - -**Example usage:** -```python -# Create/update dashboard -manage_dashboard( - action="create_or_update", - display_name="Sales Dashboard", - parent_path="/Workspace/Users/me/dashboards", - serialized_dashboard=dashboard_json, - warehouse_id="abc123", - publish=True # auto-publish after create -) +## CLI Commands -# Get dashboard details -manage_dashboard(action="get", dashboard_id="dashboard_123") +### Step 1: Discover Table Schemas + +```bash +# Get table schemas for designing queries +databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 +``` + +### Step 3: Test SQL Queries +```bash +# Test SQL queries - MANDATORY before deployment! +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT COUNT(*) FROM catalog.schema.table" +``` + +### Step 5: Dashboard Lifecycle + +```bash # List all dashboards -manage_dashboard(action="list") +databricks lakeview list + +# Create a dashboard from JSON file +databricks lakeview create --json @dashboard.json + +# Get dashboard details +databricks lakeview get DASHBOARD_ID + +# Publish a dashboard +databricks lakeview publish DASHBOARD_ID --warehouse-id WAREHOUSE_ID + +# Unpublish a dashboard +databricks lakeview unpublish DASHBOARD_ID + +# Delete (trash) a dashboard +databricks lakeview trash DASHBOARD_ID +``` + +### Get Available Warehouse + +```bash +# List warehouses to find one for SQL execution +databricks warehouses list ``` ## Reference Files @@ -186,7 +192,7 @@ y=12: Table (w=6, h=6) - Detailed data | High cardinality | **Table only** | customer_id, order_id, SKU | **Before creating any chart with color/grouping:** -1. Check column cardinality (use `get_table_stats_and_schema` to see distinct values) +1. Check column cardinality (use `databricks experimental aitools tools discover-schema` to see distinct values) 2. If >10 distinct values, aggregate to higher level OR use TOP-N + "Other" bucket 3. For high-cardinality dimensions, use a table widget instead of a chart @@ -202,7 +208,7 @@ Before deploying, verify: 7. Counter datasets: use `disaggregated: true` for 1-row datasets, `disaggregated: false` with aggregation for multi-row 8. Percent values are 0-1 (not 0-100) 9. SQL uses Spark syntax (date_sub, not INTERVAL) -10. **All SQL queries tested via `execute_sql` and return expected data** +10. **All SQL queries tested via CLI and return expected data** --- diff --git a/databricks-skills/databricks-app-python/4-deployment.md b/databricks-skills/databricks-app-python/4-deployment.md index b318bbdf..d7b138e1 100644 --- a/databricks-skills/databricks-app-python/4-deployment.md +++ b/databricks-skills/databricks-app-python/4-deployment.md @@ -1,6 +1,6 @@ # Deploying Databricks Apps -Three deployment options: Databricks CLI (simplest), Asset Bundles (multi-environment), or MCP tools (programmatic). +Three deployment options: Databricks CLI (simplest), Asset Bundles (multi-environment), or CLI commands (programmatic). **Cookbook deployment guide**: https://apps-cookbook.dev/docs/deploy @@ -107,9 +107,9 @@ For complete DABs guidance, use the **databricks-bundles** skill. 
--- -## Option 3: MCP Tools +## Option 3: CLI Commands -For programmatic app lifecycle management, see [6-mcp-approach.md](6-mcp-approach.md). +For CLI-based app lifecycle management, see [6-mcp-approach.md](6-mcp-approach.md). --- diff --git a/databricks-skills/databricks-app-python/6-mcp-approach.md b/databricks-skills/databricks-app-python/6-mcp-approach.md index 943c49ba..01543509 100644 --- a/databricks-skills/databricks-app-python/6-mcp-approach.md +++ b/databricks-skills/databricks-app-python/6-mcp-approach.md @@ -1,17 +1,30 @@ -# MCP Tools for App Lifecycle +# CLI Commands for App Lifecycle -Use MCP tools to create, deploy, and manage Databricks Apps programmatically. This mirrors the CLI workflow but can be invoked by AI agents. +Use the Databricks CLI to create, deploy, and manage Databricks Apps. --- -## manage_app - App Lifecycle Management +## databricks apps - App Lifecycle Management -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Idempotent create, deploys if source_code_path provided | name | -| `get` | Get app details (with optional logs) | name | -| `list` | List all apps | (none, optional name_contains filter) | -| `delete` | Delete an app | name | +```bash +# List all apps +databricks apps list + +# Create an app +databricks apps create --name my-dashboard --json '{"description": "Customer analytics dashboard"}' + +# Get app details +databricks apps get my-dashboard + +# Deploy an app (from workspace source code) +databricks apps deploy my-dashboard --source-code-path /Workspace/Users/user@example.com/my_app + +# Get app logs +databricks apps logs my-dashboard + +# Delete an app +databricks apps delete my-dashboard +``` --- @@ -32,42 +45,37 @@ my_app/ ### Step 2: Upload to Workspace -```python -# MCP Tool: manage_workspace_files -manage_workspace_files( - action="upload", - local_path="/path/to/my_app", - workspace_path="/Workspace/Users/user@example.com/my_app" -) +```bash +# Upload local folder to workspace +databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app ``` ### Step 3: Create and Deploy App -```python -# MCP Tool: manage_app (creates if needed + deploys) -result = manage_app( - action="create_or_update", - name="my-dashboard", - description="Customer analytics dashboard", - source_code_path="/Workspace/Users/user@example.com/my_app" -) -# Returns: {"name": "my-dashboard", "url": "...", "created": True, "deployment": {...}} +```bash +# Create the app +databricks apps create --name my-dashboard --json '{"description": "Customer analytics dashboard"}' + +# Deploy from workspace source +databricks apps deploy my-dashboard --source-code-path /Workspace/Users/user@example.com/my_app ``` ### Step 4: Verify -```python -# MCP Tool: manage_app (get with logs) -app = manage_app(action="get", name="my-dashboard", include_logs=True) -# Returns: {"name": "...", "url": "...", "status": "RUNNING", "logs": "...", ...} +```bash +# Check app status +databricks apps get my-dashboard + +# Check logs for errors +databricks apps logs my-dashboard ``` ### Step 5: Iterate 1. Fix issues in local files -2. Re-upload with `manage_workspace_files(action="upload", ...)` -3. Re-deploy with `manage_app(action="create_or_update", ...)` (will update existing + deploy) -4. Check `manage_app(action="get", name=..., include_logs=True)` for errors +2. Re-upload with `databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app` +3. 
Re-deploy with `databricks apps deploy my-dashboard --source-code-path ...` +4. Check `databricks apps logs my-dashboard` for errors 5. Repeat until app is healthy --- @@ -75,5 +83,5 @@ app = manage_app(action="get", name="my-dashboard", include_logs=True) ## Notes - Add resources (SQL warehouse, Lakebase, etc.) via the Databricks Apps UI after creating the app -- MCP tools use the service principal's permissions β€” ensure it has access to required resources -- For manual deployment, see [4-deployment.md](4-deployment.md) +- CLI uses your configured profile's credentials β€” ensure you have access to required resources +- For DABs deployment, see [4-deployment.md](4-deployment.md) diff --git a/databricks-skills/databricks-app-python/SKILL.md b/databricks-skills/databricks-app-python/SKILL.md index 777d3377..2604e8a6 100644 --- a/databricks-skills/databricks-app-python/SKILL.md +++ b/databricks-skills/databricks-app-python/SKILL.md @@ -72,7 +72,7 @@ Copy this checklist and verify each item: **Lakebase**: Use [5-lakebase.md](5-lakebase.md) when using Lakebase (PostgreSQL) as your app's data layer β€” covers auto-injected env vars, psycopg2/asyncpg patterns, and when to choose Lakebase vs SQL warehouse. (Keywords: Lakebase, PostgreSQL, psycopg2, asyncpg, transactional, PGHOST) -**MCP tools**: Use [6-mcp-approach.md](6-mcp-approach.md) for managing app lifecycle via MCP tools β€” covers creating, deploying, monitoring, and deleting apps programmatically. (Keywords: MCP, create app, deploy app, app logs) +**CLI commands**: Use [6-mcp-approach.md](6-mcp-approach.md) for managing app lifecycle via CLI β€” covers creating, deploying, monitoring, and deleting apps. (Keywords: CLI, create app, deploy app, app logs) **Foundation Models**: See [examples/llm_config.py](examples/llm_config.py) for calling Databricks foundation model APIs β€” covers OAuth M2M auth, OpenAI-compatible client wiring, and token caching. (Keywords: foundation model, LLM, OpenAI client, chat completions) @@ -87,7 +87,7 @@ Copy this checklist and verify each item: **Connecting to data/resources?** β†’ Read [2-app-resources.md](2-app-resources.md) **Using Lakebase (PostgreSQL)?** β†’ Read [5-lakebase.md](5-lakebase.md) **Deploying to Databricks?** β†’ Read [4-deployment.md](4-deployment.md) - **Using MCP tools?** β†’ Read [6-mcp-approach.md](6-mcp-approach.md) + **Using CLI for app lifecycle?** β†’ Read [6-mcp-approach.md](6-mcp-approach.md) **Calling foundation model/LLM APIs?** β†’ See [examples/llm_config.py](examples/llm_config.py) 2. 
Follow the instructions in the relevant guide From b2aaf60cfb77385aff0ae6a4a1587cb00a11bcd5 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 09:51:49 +0200 Subject: [PATCH 05/66] Replace MCP tools with CLI commands in databricks-genie skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add conversation.py script for Genie Conversation API (ask_genie) - Update SKILL.md to use databricks genie CLI commands - Update spaces.md with CLI-based export/import/migration workflows - Update conversation.md to use conversation.py script πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-genie/SKILL.md | 195 +++++++------- .../databricks-genie/conversation.md | 143 +++++----- .../databricks-genie/conversation.py | 171 ++++++++++++ databricks-skills/databricks-genie/spaces.md | 245 ++++++++---------- 4 files changed, 431 insertions(+), 323 deletions(-) create mode 100644 databricks-skills/databricks-genie/conversation.py diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index 82332476..39c0ab00 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -25,80 +25,72 @@ Use this skill when: - Not supported for schema and/or table names that differ across environments - Not including migration of tables between environments (only migration of Genie Spaces) -## MCP Tools - -| Tool | Purpose | -|------|---------| -| `manage_genie` | Create, get, list, delete, export, and import Genie Spaces | -| `ask_genie` | Ask natural language questions to a Genie Space | -| `get_table_stats_and_schema` | Inspect table schemas before creating a space | -| `execute_sql` | Test SQL queries directly | - -### manage_genie - Space Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Idempotent create/update a space | display_name, table_identifiers (or serialized_space) | -| `get` | Get space details | space_id | -| `list` | List all spaces | (none) | -| `delete` | Delete a space | space_id | -| `export` | Export space config for migration/backup | space_id | -| `import` | Import space from serialized config | warehouse_id, serialized_space | - -**Example tool calls:** -``` -# MCP Tool: manage_genie -# Create a new space -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=["catalog.schema.customers", "catalog.schema.orders"], - description="Explore sales data with natural language", - sample_questions=["What were total sales last month?"] -) - -# MCP Tool: manage_genie -# Get space details with full config -manage_genie(action="get", space_id="space_123", include_serialized_space=True) - -# MCP Tool: manage_genie -# List all spaces -manage_genie(action="list") - -# MCP Tool: manage_genie -# Export for migration -exported = manage_genie(action="export", space_id="space_123") - -# MCP Tool: manage_genie -# Import to new workspace -manage_genie( - action="import", - warehouse_id="warehouse_456", - serialized_space=exported["serialized_space"], - title="Sales Analytics (Prod)" -) +## CLI Commands + +### Space Management + +```bash +# List all Genie Spaces +databricks genie list-spaces + +# Create a Genie Space +databricks genie create-space --json '{ + "display_name": "Sales Analytics", + "description": "Explore sales data with natural language", + "table_identifiers": 
["catalog.schema.customers", "catalog.schema.orders"] +}' + +# Get space details +databricks genie get-space SPACE_ID + +# Update a Genie Space +databricks genie update-space SPACE_ID --json '{ + "display_name": "Updated Name", + "description": "Updated description" +}' + +# Delete (trash) a Genie Space +databricks genie trash-space SPACE_ID ``` -### ask_genie - Conversation API (Query) +### Export & Import (Migration) -Ask natural language questions to a Genie Space. Pass `conversation_id` for follow-up questions. +```bash +# Export space configuration (returns JSON with serialized_space) +databricks genie export-space SPACE_ID +# Import space from exported config +databricks genie import-space --json '{ + "warehouse_id": "WAREHOUSE_ID", + "serialized_space": "...", + "title": "Sales Analytics (Prod)" +}' ``` -# MCP Tool: ask_genie -# Start a new conversation -result = ask_genie( - space_id="space_123", - question="What were total sales last month?" -) + +### Conversation API (Query) + +Use the `conversation.py` script in this skill folder to ask questions: + +```bash +# Ask a question to a Genie Space +python conversation.py ask SPACE_ID "What were total sales last month?" # Returns: {question, conversation_id, message_id, status, sql, columns, data, row_count} -# MCP Tool: ask_genie # Follow-up question in same conversation -result = ask_genie( - space_id="space_123", - question="Break that down by region", - conversation_id=result["conversation_id"] -) +python conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID + +# With custom timeout (default: 60 seconds) +python conversation.py ask SPACE_ID "Complex analysis query" --timeout 120 +``` + +### Table Inspection + +```bash +# Inspect table schemas before creating a space +databricks unity-catalog tables get CATALOG.SCHEMA.TABLE + +# Or use the discover-schema tool for multiple tables +databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 ``` ## Quick Start @@ -107,42 +99,32 @@ result = ask_genie( Before creating a Genie Space, understand your data: -``` -# MCP Tool: get_table_stats_and_schema -get_table_stats_and_schema( - catalog="my_catalog", - schema="sales", - table_stat_level="SIMPLE" -) +```bash +# Get table details +databricks unity-catalog tables get my_catalog.sales.customers +databricks unity-catalog tables get my_catalog.sales.orders + +# Or use discover-schema for multiple tables +databricks experimental aitools tools discover-schema my_catalog.sales.customers my_catalog.sales.orders ``` ### 2. Create the Genie Space -``` -# MCP Tool: manage_genie -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=[ - "my_catalog.sales.customers", - "my_catalog.sales.orders" - ], - description="Explore sales data with natural language", - sample_questions=[ - "What were total sales last month?", - "Who are our top 10 customers?" - ] -) +```bash +databricks genie create-space --json '{ + "display_name": "Sales Analytics", + "description": "Explore sales data with natural language", + "table_identifiers": [ + "my_catalog.sales.customers", + "my_catalog.sales.orders" + ] +}' ``` ### 3. Ask Questions (Conversation API) -``` -# MCP Tool: ask_genie -ask_genie( - space_id="your_space_id", - question="What were total sales last month?" -) +```bash +python conversation.py ask YOUR_SPACE_ID "What were total sales last month?" 
# Returns: SQL, columns, data, row_count ``` @@ -150,26 +132,23 @@ ask_genie( Export a space (preserves all tables, instructions, SQL examples, and layout): -``` -# MCP Tool: manage_genie -exported = manage_genie(action="export", space_id="your_space_id") -# exported["serialized_space"] contains the full config +```bash +databricks genie export-space YOUR_SPACE_ID > exported_space.json +# exported_space.json contains serialized_space with full config ``` Clone to a new space (same catalog): -``` -# MCP Tool: manage_genie -manage_genie( - action="import", - warehouse_id=exported["warehouse_id"], - serialized_space=exported["serialized_space"], - title=exported["title"], # override title; omit to keep original - description=exported["description"], -) +```bash +# Extract and import +databricks genie import-space --json '{ + "warehouse_id": "WAREHOUSE_ID", + "serialized_space": "...", + "title": "Sales Analytics (Clone)" +}' ``` -> **Cross-workspace migration:** Each MCP server is workspace-scoped. Configure one server entry per workspace profile in your IDE's MCP config, then `manage_genie(action="export")` from the source server and `manage_genie(action="import")` via the target server. See [spaces.md Β§Migration](spaces.md#migrating-across-workspaces-with-catalog-remapping) for the full workflow. +> **Cross-workspace migration:** Use different Databricks CLI profiles for source and target workspaces. Export from source profile, remap catalog names in `serialized_space`, then import via target profile. See [spaces.md Β§Migration](spaces.md#migrating-across-workspaces-with-catalog-remapping) for the full workflow. ## Reference Files diff --git a/databricks-skills/databricks-genie/conversation.md b/databricks-skills/databricks-genie/conversation.md index e4320e8b..4be5aa64 100644 --- a/databricks-skills/databricks-genie/conversation.md +++ b/databricks-skills/databricks-genie/conversation.md @@ -4,11 +4,11 @@ Use the Genie Conversation API to ask natural language questions to a curated Ge ## Overview -The `ask_genie` tool allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. +The `conversation.py` script in this skill folder allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. 
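+
+For a first run, a minimal sketch looks like this (the space ID is a placeholder; the script depends on the `databricks-sdk` package and picks up credentials from your configured Databricks CLI profile):
+
+```bash
+# One-time setup: the script needs the databricks-sdk package
+pip install databricks-sdk
+
+# Ask a first question (space ID is a placeholder)
+python conversation.py ask 01abc123... "How many orders were placed this week?"
+```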
-## When to Use `ask_genie` +## When to Use the Conversation API -### Use `ask_genie` When: +### Use Conversation API When: | Scenario | Why | |----------|-----| @@ -18,7 +18,7 @@ The `ask_genie` tool allows you to programmatically send questions to a Genie Sp | Testing a Genie Space after creating it | Validate the space works correctly | | User wants conversational data exploration | Genie handles context for follow-up questions | -### Use Direct SQL (`execute_sql`) Instead When: +### Use Direct SQL Instead When: | Scenario | Why | |----------|-----| @@ -27,25 +27,24 @@ The `ask_genie` tool allows you to programmatically send questions to a Genie Sp | Genie Space doesn't exist for this data | Can't use Genie without a space | | Need precise control over the query | Direct SQL gives exact control | -## MCP Tools +## CLI Usage -| Tool | Purpose | -|------|---------| -| `ask_genie` | Ask a question or follow-up (`conversation_id` optional) | +Use the `conversation.py` script to ask questions: + +```bash +python conversation.py ask SPACE_ID "Your question here" +``` ## Basic Usage ### Ask a Question -```python -ask_genie( - space_id="01abc123...", - question="What were total sales last month?" -) +```bash +python conversation.py ask 01abc123... "What were total sales last month?" ``` **Response:** -```python +```json { "question": "What were total sales last month?", "conversation_id": "conv_xyz789", @@ -62,19 +61,13 @@ ask_genie( Use the `conversation_id` from the first response to ask follow-up questions with context: -```python -# First question -result = ask_genie( - space_id="01abc123...", - question="What were total sales last month?" -) +```bash +# First question - capture the conversation_id from output +python conversation.py ask 01abc123... "What were total sales last month?" +# Output includes: "conversation_id": "conv_xyz789" # Follow-up (uses context from first question) -ask_genie( - space_id="01abc123...", - question="Break that down by region", - conversation_id=result["conversation_id"] -) +python conversation.py ask 01abc123... "Break that down by region" --conversation-id conv_xyz789 ``` Genie remembers the context, so "that" refers to "total sales last month". @@ -98,35 +91,38 @@ Genie remembers the context, so "that" refers to "total sales last month". ### Successful Response -```python -result = ask_genie(space_id, "Who are our top 10 customers?") +The script returns JSON that can be parsed: -if result["status"] == "COMPLETED": - print(f"SQL: {result['sql']}") - print(f"Rows: {result['row_count']}") - for row in result["data"]: - print(row) +```bash +python conversation.py ask SPACE_ID "Who are our top 10 customers?" | jq '.status' +# Output: "COMPLETED" ``` -### Failed Response +Response fields when status is `COMPLETED`: +- `sql`: The SQL query Genie generated +- `columns`: List of column names +- `data`: Query results as list of rows +- `row_count`: Number of rows returned -```python -result = ask_genie(space_id, "What is the meaning of life?") +### Failed Response -if result["status"] == "FAILED": - print(f"Error: {result['error']}") - # Genie couldn't answer - may need to rephrase or use direct SQL +```bash +python conversation.py ask SPACE_ID "What is the meaning of life?" | jq '.status, .error' +# Output: "FAILED" +# Output: "Could not generate SQL for this question" ``` -### Timeout +Genie couldn't answer - may need to rephrase or use direct SQL. 
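+
+If the status comes back `FAILED`, one possible fallback sketch (assuming `jq` is installed and you know a warehouse ID) is to rephrase the question or run a hand-written query through the CLI instead:
+
+```bash
+# Check whether Genie answered; fall back to direct SQL if it did not
+status=$(python conversation.py ask SPACE_ID "Show revenue by segment" | jq -r '.status')
+if [ "$status" = "FAILED" ]; then
+  databricks experimental aitools tools query --warehouse WAREHOUSE_ID \
+    "SELECT COUNT(*) FROM catalog.schema.orders"
+fi
+```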
-```python -result = ask_genie(space_id, question, timeout_seconds=60) +### Timeout -if result["status"] == "TIMEOUT": - print("Query took too long - try a simpler question or increase timeout") +```bash +python conversation.py ask SPACE_ID "Complex query" --timeout 120 | jq '.status' +# If timeout occurs: "TIMEOUT" ``` +Query took too long - try a simpler question or increase timeout. + ## Example Workflows ### Workflow 1: User Asks to Use Genie @@ -136,7 +132,7 @@ User: "Ask my Sales Genie what the churn rate is" Claude: 1. Identifies user wants to use Genie (explicit request) -2. Calls ask_genie(space_id="sales_genie_id", question="What is the churn rate?") +2. Runs: python conversation.py ask sales_genie_id "What is the churn rate?" 3. Returns: "Based on your Sales Genie, the churn rate is 4.2%. Genie used this SQL: SELECT ..." ``` @@ -147,10 +143,10 @@ Claude: User: "I just created a Genie Space for HR data. Can you test it?" Claude: -1. Gets the space_id from the user or recent manage_genie(action="create_or_update") result -2. Calls ask_genie with test questions: - - "How many employees do we have?" - - "What is the average salary by department?" +1. Gets the space_id from the user or recent databricks genie create-space result +2. Runs conversation.py with test questions: + - python conversation.py ask SPACE_ID "How many employees do we have?" + - python conversation.py ask SPACE_ID "What is the average salary by department?" 3. Reports results: "Your HR Genie is working. It correctly answered..." ``` @@ -160,11 +156,12 @@ Claude: User: "Use my analytics Genie to explore sales trends" Claude: -1. ask_genie(space_id, "What were total sales by month this year?") +1. python conversation.py ask SPACE_ID "What were total sales by month this year?" + # Returns conversation_id: conv_xyz 2. User: "Which month had the highest growth?" -3. ask_genie(space_id, "Which month had the highest growth?", conversation_id=conv_id) +3. python conversation.py ask SPACE_ID "Which month had the highest growth?" -c conv_xyz 4. User: "What products drove that growth?" -5. ask_genie(space_id, "What products drove that growth?", conversation_id=conv_id) +5. python conversation.py ask SPACE_ID "What products drove that growth?" -c conv_xyz ``` ## Best Practices @@ -173,28 +170,25 @@ Claude: Don't reuse conversations across unrelated questions: -```python +```bash # Good: New conversation for new topic -result1 = ask_genie(space_id, "What were sales last month?") # New conversation -result2 = ask_genie(space_id, "How many employees do we have?") # New conversation +python conversation.py ask SPACE_ID "What were sales last month?" # New conversation +python conversation.py ask SPACE_ID "How many employees do we have?" # New conversation # Good: Follow-up for related question -result1 = ask_genie(space_id, "What were sales last month?") -result2 = ask_genie(space_id, "Break that down by product", - conversation_id=result1["conversation_id"]) # Related follow-up +python conversation.py ask SPACE_ID "What were sales last month?" 
+# Get conversation_id from output, then: +python conversation.py ask SPACE_ID "Break that down by product" -c CONV_ID # Related follow-up ``` ### Handle Clarification Requests Genie may ask for clarification instead of returning results: -```python -result = ask_genie(space_id, "Show me the data") - -if result.get("text_response"): - # Genie is asking for clarification - print(f"Genie asks: {result['text_response']}") - # Rephrase with more specifics +```bash +python conversation.py ask SPACE_ID "Show me the data" | jq '.text_response' +# If Genie needs clarification, text_response will contain the question +# Rephrase with more specifics ``` ### Set Appropriate Timeouts @@ -203,13 +197,12 @@ if result.get("text_response"): - Complex joins: 60-120 seconds - Large data scans: 120+ seconds -```python -# Quick question -ask_genie(space_id, "How many orders today?", timeout_seconds=30) +```bash +# Quick question (default 60s) +python conversation.py ask SPACE_ID "How many orders today?" -# Complex analysis -ask_genie(space_id, "Calculate customer lifetime value for all customers", - timeout_seconds=180) +# Complex analysis with longer timeout +python conversation.py ask SPACE_ID "Calculate customer lifetime value for all customers" --timeout 180 ``` ## Troubleshooting @@ -218,22 +211,22 @@ ask_genie(space_id, "Calculate customer lifetime value for all customers", - Verify the `space_id` is correct - Check you have access to the space -- Use `manage_genie(action="get", space_id=...)` to verify it exists +- Use `databricks genie get-space SPACE_ID` to verify it exists ### "Query timed out" -- Increase `timeout_seconds` +- Increase timeout: `--timeout 120` - Simplify the question -- Check if the SQL warehouse is running +- Check if the SQL warehouse is running: `databricks warehouses list` ### "Failed to generate SQL" - Rephrase the question more clearly - Check if the question is answerable with the available tables -- Add more instructions/curation to the Genie Space +- Add more instructions/curation to the Genie Space via the Databricks UI ### Unexpected Results -- Review the generated SQL in the response +- Review the generated SQL in the response (`jq '.sql'`) - Add SQL instructions to the Genie Space via the Databricks UI - Add sample questions that demonstrate correct patterns diff --git a/databricks-skills/databricks-genie/conversation.py b/databricks-skills/databricks-genie/conversation.py new file mode 100644 index 00000000..e1a670ff --- /dev/null +++ b/databricks-skills/databricks-genie/conversation.py @@ -0,0 +1,171 @@ +#!/usr/bin/env python3 +""" +Genie Conversation API - CLI interface for asking questions to Genie Spaces. + +Usage: + python conversation.py ask SPACE_ID "What were total sales last month?" + python conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID + python conversation.py ask SPACE_ID "Complex query" --timeout 120 + +Requires: databricks-sdk package +""" + +import argparse +import json +import sys +import time +from typing import Any, Dict, Optional + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.dashboards import GenieMessage + + +def ask_genie( + space_id: str, + question: str, + conversation_id: Optional[str] = None, + timeout_seconds: int = 60, +) -> Dict[str, Any]: + """Ask a question to a Genie Space. 
+ + Args: + space_id: The Genie Space ID + question: Natural language question to ask + conversation_id: Optional conversation ID for follow-up questions + timeout_seconds: Maximum time to wait for response (default: 60) + + Returns: + Dict with question, conversation_id, message_id, status, sql, columns, data, row_count + """ + client = WorkspaceClient() + + # Start or continue conversation + if conversation_id: + response = client.genie.start_conversation_and_wait( + space_id=space_id, + content=question, + conversation_id=conversation_id, + ) + else: + response = client.genie.start_conversation_and_wait( + space_id=space_id, + content=question, + ) + + # Extract conversation and message IDs + conv_id = response.conversation_id if hasattr(response, 'conversation_id') else None + msg_id = response.message_id if hasattr(response, 'message_id') else None + + # Poll for completion + start_time = time.time() + while True: + if time.time() - start_time > timeout_seconds: + return { + "question": question, + "conversation_id": conv_id, + "message_id": msg_id, + "status": "TIMEOUT", + "error": f"Query timed out after {timeout_seconds} seconds", + } + + # Get message details + message = client.genie.get_message( + space_id=space_id, + conversation_id=conv_id, + message_id=msg_id, + ) + + status = message.status.value if hasattr(message.status, 'value') else str(message.status) + + if status == "COMPLETED": + # Extract results + result = { + "question": question, + "conversation_id": conv_id, + "message_id": msg_id, + "status": "COMPLETED", + } + + # Get SQL and data from attachments + if message.attachments: + for attachment in message.attachments: + if hasattr(attachment, 'query') and attachment.query: + result["sql"] = attachment.query.query + if hasattr(attachment, 'text') and attachment.text: + result["text_response"] = attachment.text.content + + # Get query result if available + if hasattr(message, 'query_result') and message.query_result: + qr = message.query_result + if hasattr(qr, 'columns'): + result["columns"] = [c.name for c in qr.columns] + if hasattr(qr, 'data_array'): + result["data"] = qr.data_array + result["row_count"] = len(qr.data_array) + + return result + + elif status in ["FAILED", "CANCELLED"]: + error_msg = "" + if message.attachments: + for attachment in message.attachments: + if hasattr(attachment, 'text') and attachment.text: + error_msg = attachment.text.content + return { + "question": question, + "conversation_id": conv_id, + "message_id": msg_id, + "status": status, + "error": error_msg or f"Query {status.lower()}", + } + + # Still processing, wait and retry + time.sleep(2) + + +def _print_json(data: Any) -> None: + """Print data as formatted JSON.""" + print(json.dumps(data, indent=2, default=str)) + + +def main(): + """CLI entry point.""" + parser = argparse.ArgumentParser( + description="Ask questions to a Genie Space", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + # ask command + ask_parser = subparsers.add_parser("ask", help="Ask a question to a Genie Space") + ask_parser.add_argument("space_id", help="The Genie Space ID") + ask_parser.add_argument("question", help="Natural language question to ask") + ask_parser.add_argument( + "--conversation-id", "-c", + help="Conversation ID for follow-up questions", + ) + ask_parser.add_argument( + "--timeout", "-t", + type=int, + default=60, + help="Timeout in seconds (default: 60)", + ) + + args = parser.parse_args() 
+ + if args.command == "ask": + result = ask_genie( + space_id=args.space_id, + question=args.question, + conversation_id=args.conversation_id, + timeout_seconds=args.timeout, + ) + _print_json(result) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md index ff8acb60..af6ea640 100644 --- a/databricks-skills/databricks-genie/spaces.md +++ b/databricks-skills/databricks-genie/spaces.md @@ -12,12 +12,13 @@ A Genie Space connects to Unity Catalog tables and translates natural language q **Before creating a Genie Space, you MUST inspect the table schemas** to understand what data is available: -```python -get_table_stats_and_schema( - catalog="my_catalog", - schema="sales", - table_stat_level="SIMPLE" -) +```bash +# Get table details +databricks unity-catalog tables get my_catalog.sales.customers +databricks unity-catalog tables get my_catalog.sales.orders + +# Or use discover-schema for multiple tables with statistics +databricks experimental aitools tools discover-schema my_catalog.sales.customers my_catalog.sales.orders ``` This returns: @@ -39,32 +40,26 @@ Based on the schema information: Create the space with content tailored to the actual data: -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=[ - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products" - ], - description="""Explore retail sales data with three related tables: -- customers: Customer demographics including region, segment, and signup date -- orders: Transaction history with order_date, total_amount, and status -- products: Product catalog with category, price, and inventory - -Tables join on customer_id and product_id.""", - sample_questions=[ - "What were total sales last month?", - "Who are our top 10 customers by total_amount?", - "How many orders were placed in Q4 by region?", - "What's the average order value by customer segment?", - "Which product categories have the highest revenue?", - "Show me customers who haven't ordered in 90 days" - ] -) +```bash +databricks genie create-space --json '{ + "display_name": "Sales Analytics", + "description": "Explore retail sales data with three related tables:\n- customers: Customer demographics including region, segment, and signup date\n- orders: Transaction history with order_date, total_amount, and status\n- products: Product catalog with category, price, and inventory\n\nTables join on customer_id and product_id.", + "table_identifiers": [ + "my_catalog.sales.customers", + "my_catalog.sales.orders", + "my_catalog.sales.products" + ] +}' ``` +Sample questions can be added via the Databricks UI after creation: +- "What were total sales last month?" +- "Who are our top 10 customers by total_amount?" +- "How many orders were placed in Q4 by region?" +- "What's the average order value by customer segment?" +- "Which product categories have the highest revenue?" +- "Show me customers who haven't ordered in 90 days" + ## Why This Workflow Matters **Sample questions that reference actual column names** help Genie: @@ -149,55 +144,40 @@ Write sample questions that: ## Updating a Genie Space -`manage_genie(action="create_or_update")` handles both create and update automatically. There are two ways it locates an existing space to update: - -- **By `space_id`** (explicit, preferred): pass `space_id=` to target a specific space. 
-- **By `display_name`** (implicit fallback): if `space_id` is omitted, the tool searches for a space with a matching name and updates it if found; otherwise it creates a new one. - -### Simple field updates (tables, questions, warehouse) - -To update metadata without a serialized config: - -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - space_id="01abc123...", # omit to match by name instead - table_identifiers=[ # updated table list - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products", - ], - sample_questions=[ # updated sample questions - "What were total sales last month?", - "Who are our top 10 customers by revenue?", - ], - warehouse_id="abc123def456", # omit to keep current / auto-detect - description="Updated description.", -) +Use `databricks genie update-space` to update an existing space by ID. + +### Simple field updates + +```bash +# Update display name and description +databricks genie update-space SPACE_ID --json '{ + "display_name": "Sales Analytics", + "description": "Updated description.", + "table_identifiers": [ + "my_catalog.sales.customers", + "my_catalog.sales.orders", + "my_catalog.sales.products" + ] +}' ``` -### Full config update via `serialized_space` +### Full config update via serialized_space + +To push a complete serialized configuration to an existing space (preserves all instructions, SQL examples, join specs, etc.): -To push a complete serialized configuration to an existing space (the dict contains all regular table metadata, plus it preserves all instructions, SQL examples, join specs, etc.): +```bash +# First export the current config +databricks genie export-space SOURCE_SPACE_ID > config.json -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", # overrides title embedded in serialized_space - table_identifiers=[], # ignored when serialized_space is provided - space_id="01abc123...", # target space to overwrite - warehouse_id="abc123def456", # overrides warehouse embedded in serialized_space - description="Updated description.", # overrides description embedded in serialized_space; omit to keep the one in the payload - serialized_space=remapped_config, # JSON string from manage_genie(action="export") (after catalog remap if needed) -) +# Modify the serialized_space as needed, then update +databricks genie update-space TARGET_SPACE_ID --json @updated_config.json ``` -> **Note:** When `serialized_space` is provided, `table_identifiers` and `sample_questions` are ignored β€” the full config comes from the serialized payload. However, `display_name`, `warehouse_id`, and `description` are still applied as top-level overrides on top of the serialized payload. Omit any of them to keep the values embedded in `serialized_space`. +> **Note:** When using serialized_space, the full config comes from the serialized payload. Top-level overrides (display_name, warehouse_id, description) can still be applied. 
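+
+One way the export-then-update flow above might look in practice, assuming `jq` is available (the payload fields follow the override behaviour described in the note):
+
+```bash
+# Keep the exported serialized_space, override only the display name at the top level
+jq '{serialized_space: .serialized_space, display_name: "Sales Analytics (v2)"}' \
+  config.json > updated_config.json
+
+databricks genie update-space TARGET_SPACE_ID --json @updated_config.json
+```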
## Export, Import & Migration -`manage_genie(action="export")` returns a dictionary with four top-level keys: +`databricks genie export-space SPACE_ID` returns a JSON object with these top-level keys: | Key | Description | |-----|-------------| @@ -207,7 +187,7 @@ manage_genie( | `warehouse_id` | SQL warehouse associated with the space (workspace-specific β€” do **not** reuse across workspaces) | | `serialized_space` | JSON-encoded string with the full space configuration (see below) | -This envelope enables cloning, backup, and cross-workspace migration. Use `manage_genie(action="export")` and `manage_genie(action="import")` for all export/import operations β€” no direct REST calls needed. +This envelope enables cloning, backup, and cross-workspace migration. ### What is `serialized_space`? @@ -230,10 +210,10 @@ Minimum structure: ### Exporting a Space -Use `manage_genie(action="export")` to export the full configuration (requires CAN EDIT permission): +Use `databricks genie export-space` to export the full configuration (requires CAN EDIT permission): -```python -exported = manage_genie(action="export", space_id="01abc123...") +```bash +databricks genie export-space 01abc123... > exported_space.json # Returns: # { # "space_id": "01abc123...", @@ -244,90 +224,72 @@ exported = manage_genie(action="export", space_id="01abc123...") # } ``` -You can also get `serialized_space` inline via `manage_genie(action="get")`: - -```python -details = manage_genie(action="get", space_id="01abc123...", include_serialized_space=True) -serialized = details["serialized_space"] -``` - ### Cloning a Space (Same Workspace) -```python +```bash # Step 1: Export the source space -source = manage_genie(action="export", space_id="01abc123...") - -# Step 2: Import as a new space -manage_genie( - action="import", - warehouse_id=source["warehouse_id"], - serialized_space=source["serialized_space"], - title=source["title"], # override title; omit to keep original - description=source["description"], -) -# Returns: {"space_id": "01def456...", "title": "Sales Analytics (Dev Copy)", "operation": "imported"} +databricks genie export-space 01abc123... > source.json + +# Step 2: Import as a new space (modify title in JSON if needed) +databricks genie import-space --json @source.json +# Returns: {"space_id": "01def456...", "title": "Sales Analytics", "operation": "imported"} ``` ### Migrating Across Workspaces with Catalog Remapping When migrating between environments (e.g. prod β†’ dev), Unity Catalog names are often different. The `serialized_space` string contains the source catalog name **everywhere** β€” in table identifiers, SQL queries, join specs, and filter snippets. You must remap it before importing. -**Agent workflow (3 steps):** +**Workflow (3 steps):** **Step 1 β€” Export from source workspace:** -```python -exported = manage_genie(action="export", space_id="01f106e1239d14b28d6ab46f9c15e540") -# exported keys: warehouse_id, title, description, serialized_space -# exported["serialized_space"] contains all references to source catalog +```bash +# Use source workspace profile +DATABRICKS_CONFIG_PROFILE=source databricks genie export-space 01f106e1239d14b28d6ab46f9c15e540 > exported.json ``` **Step 2 β€” Remap catalog name in `serialized_space`:** -The agent does this as an inline string substitution between the two MCP calls: -```python -modified_serialized = exported["serialized_space"].replace( - "source_catalog_name", # e.g. "healthverity_claims_sample_patient_dataset" - "target_catalog_name" # e.g. 
"healthverity_claims_sample_patient_dataset_dev" -) +Use sed or a script to replace catalog names: +```bash +# Replace source catalog with target catalog in the serialized_space +sed -i '' 's/source_catalog_name/target_catalog_name/g' exported.json ``` This replaces all occurrences β€” table identifiers, SQL FROM clauses, join specs, and filter snippets. **Step 3 β€” Import to target workspace:** -```python -manage_genie( - action="import", - warehouse_id="", # from manage_warehouse(action="list") on target - serialized_space=modified_serialized, - title=exported["title"], - description=exported["description"] -) +```bash +# Use target workspace profile +DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @exported.json ``` ### Batch Migration of Multiple Spaces -To migrate several spaces at once, loop through space IDs. The agent exports, remaps the catalog, then imports each: - -``` -For each space_id in [id1, id2, id3]: - 1. exported = manage_genie(action="export", space_id=space_id) - 2. modified = exported["serialized_space"].replace(src_catalog, tgt_catalog) - 3. result = manage_genie(action="import", warehouse_id=wh_id, serialized_space=modified, title=exported["title"], description=exported["description"]) - 4. record result["space_id"] for updating databricks.yml +To migrate several spaces at once, use a shell loop: + +```bash +for space_id in id1 id2 id3; do + # Export + DATABRICKS_CONFIG_PROFILE=source databricks genie export-space $space_id > ${space_id}.json + # Remap catalog + sed -i '' 's/src_catalog/tgt_catalog/g' ${space_id}.json + # Import + DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @${space_id}.json +done ``` After migration, update `databricks.yml` with the new dev `space_id` values under the `dev` target's `genie_space_ids` variable. ### Updating an Existing Space with New Config -To push a serialized config to an already-existing space (rather than creating a new one), use `manage_genie(action="create_or_update")` with `space_id=` and `serialized_space=`. The export β†’ remap β†’ push pattern is identical to the migration steps above; just replace `manage_genie(action="import")` with `manage_genie(action="create_or_update", space_id=TARGET_SPACE_ID, ...)` as the final call. +To push a serialized config to an already-existing space (rather than creating a new one), use `databricks genie update-space` with the serialized config. The export β†’ remap β†’ push pattern is identical to the migration steps above; just replace `import-space` with `update-space TARGET_SPACE_ID` as the final call. ### Permissions Required | Operation | Required Permission | |-----------|-------------------| -| `manage_genie(action="export")` / `manage_genie(action="get", include_serialized_space=True)` | CAN EDIT on source space | -| `manage_genie(action="import")` | Can create items in target workspace folder | -| `manage_genie(action="create_or_update")` with `serialized_space` (update) | CAN EDIT on target space | +| `databricks genie export-space` | CAN EDIT on source space | +| `databricks genie import-space` | Can create items in target workspace folder | +| `databricks genie update-space` with serialized_space | CAN EDIT on target space | ## Example End-to-End Workflow @@ -338,17 +300,21 @@ To push a serialized config to an already-existing space (rather than creating a - Creates `catalog.schema.bronze_*` β†’ `catalog.schema.silver_*` β†’ `catalog.schema.gold_*` 3. 
**Inspect the tables**: - ```python - get_table_stats_and_schema(catalog="catalog", schema="schema") + ```bash + databricks experimental aitools tools discover-schema catalog.schema.silver_customers catalog.schema.silver_orders ``` 4. **Create the Genie Space**: - - `display_name`: "My Data Explorer" - - `table_identifiers`: `["catalog.schema.silver_customers", "catalog.schema.silver_orders"]` + ```bash + databricks genie create-space --json '{ + "display_name": "My Data Explorer", + "table_identifiers": ["catalog.schema.silver_customers", "catalog.schema.silver_orders"] + }' + ``` -5. **Add sample questions** based on actual column names +5. **Add sample questions** via the Databricks UI based on actual column names -6. **Test** in the Databricks UI +6. **Test** using conversation.py or the Databricks UI ## Troubleshooting @@ -370,26 +336,25 @@ To push a serialized config to an already-existing space (rather than creating a - Include sample questions that demonstrate the vocabulary - Add instructions via the Databricks Genie UI -### `manage_genie(action="export")` returns empty `serialized_space` +### `databricks genie export-space` returns empty `serialized_space` Requires at least **CAN EDIT** permission on the space. -### `manage_genie(action="import")` fails with permission error +### `databricks genie import-space` fails with permission error Ensure you have CREATE privileges in the target workspace folder. ### Tables not found after migration -Catalog name was not remapped β€” replace the source catalog name in `serialized_space` before calling `manage_genie(action="import")`. The catalog appears in table identifiers, SQL FROM clauses, join specs, and filter snippets; a single `.replace(src_catalog, tgt_catalog)` on the whole string covers all occurrences. +Catalog name was not remapped β€” replace the source catalog name in `serialized_space` before calling `databricks genie import-space`. The catalog appears in table identifiers, SQL FROM clauses, join specs, and filter snippets; a single `sed 's/src_catalog/tgt_catalog/g'` on the whole JSON covers all occurrences. -### `manage_genie` lands in the wrong workspace +### CLI targets the wrong workspace -Each MCP server is workspace-scoped. Set up two named MCP server entries (one per profile) in your IDE's MCP config instead of switching a single server's profile mid-session. - -### MCP server doesn't pick up profile change - -The MCP process reads `DATABRICKS_CONFIG_PROFILE` once at startup β€” editing the config file requires an IDE reload to take effect. +Use `DATABRICKS_CONFIG_PROFILE=profile_name` to specify which workspace profile to use: +```bash +DATABRICKS_CONFIG_PROFILE=dev databricks genie list-spaces +``` -### `manage_genie(action="import")` fails with JSON parse error +### `databricks genie import-space` fails with JSON parse error The `serialized_space` string may contain multi-line SQL arrays with `\n` escape sequences. Flatten SQL arrays to single-line strings before passing to avoid double-escaping issues. 
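+
+Both the migration workflow and the wrong-workspace fix above assume one configured CLI profile per workspace; a minimal setup sketch (host URLs and profile names are placeholders):
+
+```bash
+# Configure one CLI profile per workspace
+databricks auth login --host https://source-workspace.cloud.databricks.com --profile source
+databricks auth login --host https://target-workspace.cloud.databricks.com --profile target
+
+# Confirm which profiles are available
+databricks auth profiles
+
+# Pin each command to a profile
+DATABRICKS_CONFIG_PROFILE=target databricks genie list-spaces
+```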
From c0e89dee217f387de1769062cb41c553f6747c15 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:13:39 +0200 Subject: [PATCH 06/66] Replace MCP tools with CLI commands in Lakebase skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - databricks-lakebase-autoscale: Remove MCP section, expand CLI commands - databricks-lakebase-provisioned: Remove MCP section, expand CLI commands πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-lakebase-autoscale/SKILL.md | 102 ++++++------------ .../databricks-lakebase-provisioned/SKILL.md | 102 ++++++------------ 2 files changed, 65 insertions(+), 139 deletions(-) diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index f471765c..848e6e67 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -169,71 +169,6 @@ w.postgres.update_endpoint( ).wait() ``` -## MCP Tools - -The following MCP tools are available for managing Lakebase infrastructure. Use `type="autoscale"` for Lakebase Autoscaling. - -### manage_lakebase_database - Project Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create or update a project | name | -| `get` | Get project details (includes branches/endpoints) | name | -| `list` | List all projects | (none, optional type filter) | -| `delete` | Delete project and all branches/computes/data | name | - -**Example usage:** -```python -# Create an autoscale project -manage_lakebase_database( - action="create_or_update", - name="my-app", - type="autoscale", - display_name="My Application", - pg_version="17" -) - -# Get project with branches -manage_lakebase_database(action="get", name="my-app", type="autoscale") - -# Delete project -manage_lakebase_database(action="delete", name="my-app", type="autoscale") -``` - -### manage_lakebase_branch - Branch Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create/update branch with compute endpoint | project_name, branch_id | -| `delete` | Delete branch and endpoints | name (full branch name) | - -**Example usage:** -```python -# Create a dev branch with 7-day TTL -manage_lakebase_branch( - action="create_or_update", - project_name="my-app", - branch_id="development", - source_branch="production", - ttl_seconds=604800, # 7 days - autoscaling_limit_min_cu=0.5, - autoscaling_limit_max_cu=4.0, - scale_to_zero_seconds=300 -) - -# Delete branch -manage_lakebase_branch(action="delete", name="projects/my-app/branches/development") -``` - -### generate_lakebase_credential - OAuth Tokens - -Generate OAuth token (~1hr) for PostgreSQL connections. Use as password with `sslmode=require`. 
- -```python -# For autoscale endpoints -generate_lakebase_credential(endpoint="projects/my-app/branches/production/endpoints/ep-primary") -``` - ## Reference Files - [projects.md](projects.md) - Project management patterns and settings @@ -242,7 +177,9 @@ generate_lakebase_credential(endpoint="projects/my-app/branches/production/endpo - [connection-patterns.md](connection-patterns.md) - Connection patterns for different use cases - [reverse-etl.md](reverse-etl.md) - Synced tables from Delta Lake to Lakebase -## CLI Quick Reference +## CLI Commands + +### Project Management ```bash # Create a project @@ -256,18 +193,45 @@ databricks postgres list-projects # Get project details databricks postgres get-project projects/my-app -# Create a branch +# Delete a project +databricks postgres delete-project projects/my-app +``` + +### Branch Management + +```bash +# Create a branch with TTL +databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": {"seconds": 604800}}}' + +# Create a branch with no expiry databricks postgres create-branch projects/my-app development \ --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' # List branches databricks postgres list-branches projects/my-app +# Delete a branch +databricks postgres delete-branch projects/my-app/branches/development +``` + +### Endpoint Management + +```bash # Get endpoint details databricks postgres get-endpoint projects/my-app/branches/production/endpoints/ep-primary -# Delete a project -databricks postgres delete-project projects/my-app +# Update endpoint autoscaling limits +databricks postgres update-endpoint projects/my-app/branches/production/endpoints/ep-primary \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' +``` + +### OAuth Credentials + +```bash +# Generate database credential (for connections) +databricks postgres generate-database-credential \ + --endpoint projects/my-app/branches/production/endpoints/ep-primary ``` ## Key Differences from Lakebase Provisioned diff --git a/databricks-skills/databricks-lakebase-provisioned/SKILL.md b/databricks-skills/databricks-lakebase-provisioned/SKILL.md index 7548219c..2dacbaa2 100644 --- a/databricks-skills/databricks-lakebase-provisioned/SKILL.md +++ b/databricks-skills/databricks-lakebase-provisioned/SKILL.md @@ -221,76 +221,14 @@ mlflow.langchain.log_model( ) ``` -## MCP Tools - -The following MCP tools are available for managing Lakebase infrastructure. Use `type="provisioned"` for Lakebase Provisioned. 
- -### manage_lakebase_database - Database Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create or update a database | name | -| `get` | Get database details | name | -| `list` | List all databases | (none, optional type filter) | -| `delete` | Delete database and resources | name | - -**Example usage:** -```python -# Create a provisioned database -manage_lakebase_database( - action="create_or_update", - name="my-lakebase-instance", - type="provisioned", - capacity="CU_1" -) - -# Get database details -manage_lakebase_database(action="get", name="my-lakebase-instance", type="provisioned") - -# List all databases -manage_lakebase_database(action="list") - -# Delete with cascade -manage_lakebase_database(action="delete", name="my-lakebase-instance", type="provisioned", force=True) -``` - -### manage_lakebase_sync - Reverse ETL - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Set up reverse ETL from Delta to Lakebase | instance_name, source_table_name, target_table_name | -| `delete` | Remove synced table (and optionally catalog) | table_name | - -**Example usage:** -```python -# Set up reverse ETL -manage_lakebase_sync( - action="create_or_update", - instance_name="my-lakebase-instance", - source_table_name="catalog.schema.delta_table", - target_table_name="lakebase_catalog.schema.postgres_table", - scheduling_policy="TRIGGERED" # or SNAPSHOT, CONTINUOUS -) - -# Delete synced table -manage_lakebase_sync(action="delete", table_name="lakebase_catalog.schema.postgres_table") -``` - -### generate_lakebase_credential - OAuth Tokens - -Generate OAuth token (~1hr) for PostgreSQL connections. Use as password with `sslmode=require`. 
- -```python -# For provisioned instances -generate_lakebase_credential(instance_names=["my-lakebase-instance"]) -``` - ## Reference Files - [connection-patterns.md](connection-patterns.md) - Detailed connection patterns for different use cases - [reverse-etl.md](reverse-etl.md) - Syncing data from Delta Lake to Lakebase -## CLI Quick Reference +## CLI Commands + +### Instance Management ```bash # Create instance @@ -301,11 +239,6 @@ databricks database create-database-instance \ # Get instance details databricks database get-database-instance --name my-lakebase-instance -# Generate credentials -databricks database generate-database-credential \ - --request-id $(uuidgen) \ - --json '{"instance_names": ["my-lakebase-instance"]}' - # List instances databricks database list-database-instances @@ -314,6 +247,35 @@ databricks database stop-database-instance --name my-lakebase-instance # Start instance databricks database start-database-instance --name my-lakebase-instance + +# Delete instance +databricks database delete-database-instance --name my-lakebase-instance +``` + +### OAuth Credentials + +```bash +# Generate credentials for connection +databricks database generate-database-credential \ + --request-id $(uuidgen) \ + --json '{"instance_names": ["my-lakebase-instance"]}' +``` + +### Reverse ETL (Synced Tables) + +Synced tables are managed via Unity Catalog SQL commands: + +```sql +-- Create synced table from Delta to Lakebase +CREATE TABLE lakebase_catalog.schema.target_table +SYNC FROM catalog.schema.source_delta_table +SCHEDULE TRIGGERED; + +-- List synced tables +SHOW TABLES IN lakebase_catalog.schema; + +-- Drop synced table +DROP TABLE lakebase_catalog.schema.target_table; ``` ## Common Issues From c2377dd5a735138aee53ccadf3459c696241bc36 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:14:53 +0200 Subject: [PATCH 07/66] Replace MCP tools with CLI commands in model-serving, unity-catalog, and dbsql skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - databricks-model-serving: Use databricks CLI for endpoints and workspace ops - databricks-unity-catalog: Use databricks fs CLI for volume operations - databricks-dbsql: Update guideline to use CLI instead of MCP πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-dbsql/SKILL.md | 2 +- .../databricks-model-serving/SKILL.md | 149 +++++++----------- .../databricks-unity-catalog/SKILL.md | 43 ++--- 3 files changed, 78 insertions(+), 116 deletions(-) diff --git a/databricks-skills/databricks-dbsql/SKILL.md b/databricks-skills/databricks-dbsql/SKILL.md index 24bf2694..4def6b57 100644 --- a/databricks-skills/databricks-dbsql/SKILL.md +++ b/databricks-skills/databricks-dbsql/SKILL.md @@ -297,4 +297,4 @@ Load these for detailed syntax, full parameter lists, and advanced patterns: - **Star schema in Gold layer** for BI; OBT acceptable in Silver - **Define PK/FK constraints** on dimensional models for query optimization - **Use `COLLATE UTF8_LCASE`** for user-facing string columns that need case-insensitive search -- **Use MCP tools** (`execute_sql`, `execute_sql_multi`) to test and validate all SQL before deploying +- **Test SQL via CLI** (`databricks sql execute`) or notebooks before deploying diff --git a/databricks-skills/databricks-model-serving/SKILL.md b/databricks-skills/databricks-model-serving/SKILL.md index 74160298..59f5b330 100644 --- a/databricks-skills/databricks-model-serving/SKILL.md +++ 
b/databricks-skills/databricks-model-serving/SKILL.md @@ -92,49 +92,30 @@ ALWAYS use exact endpoint names from this table. NEVER guess or abbreviate. ## Quick Start: Deploy a GenAI Agent -### Step 1: Install Packages (in notebook or via MCP) +### Step 1: Install Packages (in notebook) ```python %pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic dbutils.library.restartPython() ``` -Or via MCP: -``` -execute_code(code="%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic") -``` - ### Step 2: Create Agent File Create `agent.py` locally with `ResponsesAgent` pattern (see [3-genai-agents.md](3-genai-agents.md)). ### Step 3: Upload to Workspace -``` -manage_workspace_files( - action="upload", - local_path="./my_agent", - workspace_path="/Workspace/Users/you@company.com/my_agent" -) +```bash +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent ``` ### Step 4: Test Agent -``` -execute_code( - file_path="./my_agent/test_agent.py", - cluster_id="" -) -``` +Run `test_agent.py` on a cluster to validate the agent works. ### Step 5: Log Model -``` -execute_code( - file_path="./my_agent/log_model.py", - cluster_id="" -) -``` +Run `log_model.py` on a cluster to register the model in Unity Catalog. ### Step 6: Deploy (Async via Job) @@ -142,12 +123,10 @@ See [7-deployment.md](7-deployment.md) for job-based deployment that doesn't tim ### Step 7: Query Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "Hello!"}] -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "Hello!"}] +}' ``` --- @@ -174,55 +153,50 @@ Then deploy via UI or SDK. See [1-classical-ml.md](1-classical-ml.md). --- -## MCP Tools +## CLI Commands -> **If MCP tools are not available**, use the SDK/CLI examples in the reference files below. 
+### Endpoint Management -### Development & Testing +```bash +# List all serving endpoints +databricks serving-endpoints list -| Tool | Purpose | -|------|---------| -| `manage_workspace_files` (action="upload") | Upload agent files to workspace | -| `execute_code` | Install packages, test agent, log model | +# Get endpoint details and status +databricks serving-endpoints get my-agent-endpoint -### Deployment +# Query a chat/agent endpoint +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 500 +}' -| Tool | Purpose | -|------|---------| -| `manage_jobs` (action="create") | Create deployment job (one-time) | -| `manage_job_runs` (action="run_now") | Kick off deployment (async) | -| `manage_job_runs` (action="get") | Check deployment job status | +# Query a traditional ML endpoint +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [{"age": 25, "income": 50000, "credit_score": 720}] +}' +``` -### manage_serving_endpoint - Querying +### Workspace File Operations -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `get` | Check endpoint status (READY/NOT_READY/NOT_FOUND) | name | -| `list` | List all endpoints | (none, optional limit) | -| `query` | Send requests to endpoint | name + one of: messages, inputs, dataframe_records | +```bash +# Upload agent files to workspace +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent -**Example usage:** -```python -# Check endpoint status -manage_serving_endpoint(action="get", name="my-agent-endpoint") +# List workspace files +databricks workspace ls /Workspace/Users/you@company.com/my_agent +``` -# List all endpoints -manage_serving_endpoint(action="list") +### Jobs for Deployment -# Query a chat/agent endpoint -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "Hello!"}], - max_tokens=500 -) +```bash +# Create a deployment job +databricks jobs create --json @deploy_job.json -# Query a traditional ML endpoint -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[{"age": 25, "income": 50000, "credit_score": 720}] -) +# Run the deployment job +databricks jobs run-now --job-id JOB_ID + +# Check job run status +databricks jobs get-run --run-id RUN_ID ``` --- @@ -231,42 +205,27 @@ manage_serving_endpoint( ### Check Endpoint Status After Deployment -``` -manage_serving_endpoint(action="get", name="my-agent-endpoint") +```bash +databricks serving-endpoints get my-agent-endpoint ``` -Returns: -```json -{ - "name": "my-agent-endpoint", - "state": "READY", - "served_entities": [...] -} -``` +Returns JSON with endpoint status (`READY`, `NOT_READY`, etc.). 
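+
+If you need to block until the endpoint is up, a minimal polling sketch is shown below. It assumes `jq` is available and that the CLI's JSON output exposes the readiness value at `.state.ready`; adjust the field path if your CLI version reports it differently.
+
+```bash
+# Poll until the endpoint reports READY (field path .state.ready is an assumption)
+until [ "$(databricks serving-endpoints get my-agent-endpoint --output json | jq -r '.state.ready')" = "READY" ]; do
+  echo "Endpoint not ready yet, retrying in 30s..."
+  sleep 30
+done
+echo "Endpoint is READY"
+```
+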
### Query a Chat/Agent Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[ - {"role": "user", "content": "What is Databricks?"} - ], - max_tokens=500 -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500 +}' ``` ### Query a Traditional ML Endpoint -``` -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[ - {"age": 25, "income": 50000, "credit_score": 720} - ] -) +```bash +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [{"age": 25, "income": 50000, "credit_score": 720}] +}' ``` --- diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 2e3d05fa..5b9da187 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -29,15 +29,21 @@ Use this skill when: ## Quick Start -### Volume File Operations (MCP Tools) +### Volume File Operations (CLI) -| Tool | Usage | -|------|-------| -| `list_volume_files` | `list_volume_files(volume_path="/Volumes/catalog/schema/volume/path/")` | -| `get_volume_folder_details` | `get_volume_folder_details(volume_path="catalog/schema/volume/path", format="parquet")` - schema, row counts, stats | -| `upload_to_volume` | `upload_to_volume(local_path="/tmp/data/*", volume_path="/Volumes/.../dest")` | -| `download_from_volume` | `download_from_volume(volume_path="/Volumes/.../file.csv", local_path="/tmp/file.csv")` | -| `create_volume_directory` | `create_volume_directory(volume_path="/Volumes/.../new_folder")` | +```bash +# List files in a volume +databricks fs ls /Volumes/catalog/schema/volume/path/ + +# Upload files to a volume +databricks fs cp /tmp/data/* /Volumes/catalog/schema/volume/dest/ --recursive + +# Download files from a volume +databricks fs cp /Volumes/catalog/schema/volume/file.csv /tmp/file.csv + +# Create a directory in a volume +databricks fs mkdirs /Volumes/catalog/schema/volume/new_folder +``` ### Enable System Tables Access @@ -71,20 +77,17 @@ WHERE usage_date >= current_date() - 30 GROUP BY workspace_id, sku_name; ``` -## MCP Tool Integration +## SQL Queries via CLI -Use `mcp__databricks__execute_sql` for system table queries: +Use `databricks sql` for system table queries: -```python -# Query lineage -mcp__databricks__execute_sql( - sql_query=""" - SELECT source_table_full_name, target_table_full_name - FROM system.access.table_lineage - WHERE event_date >= current_date() - 7 - """, - catalog="system" -) +```bash +# Query lineage via CLI +databricks sql execute --warehouse-id WAREHOUSE_ID --query " + SELECT source_table_full_name, target_table_full_name + FROM system.access.table_lineage + WHERE event_date >= current_date() - 7 +" ``` ## Best Practices From 290395e943eea6f738fa257b010426fe2359a351 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:17:05 +0200 Subject: [PATCH 08/66] refactor(vector-search): replace MCP tools with CLI commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove MCP Tools section from SKILL.md (manage_vs_endpoint, manage_vs_index, query_vs_index, manage_vs_data) - Update Common Issues to remove MCP-specific truncation issue - Update Notes section to reference CLI/SDK instead of MCP - Update end-to-end-rag.md: replace MCP tools table with CLI commands - Update 
troubleshooting-and-operations.md: replace MCP tool references with CLI πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-vector-search/SKILL.md | 110 +----------------- .../end-to-end-rag.md | 26 ++--- .../troubleshooting-and-operations.md | 4 +- 3 files changed, 17 insertions(+), 123 deletions(-) diff --git a/databricks-skills/databricks-vector-search/SKILL.md b/databricks-skills/databricks-vector-search/SKILL.md index 72068ec5..18cc7679 100644 --- a/databricks-skills/databricks-vector-search/SKILL.md +++ b/databricks-skills/databricks-vector-search/SKILL.md @@ -302,7 +302,7 @@ databricks vector-search indexes delete-index \ | **Embedding dimension mismatch** | Ensure query and index dimensions match | | **Index not updating** | Check pipeline_type; use sync_index() for TRIGGERED | | **Out of capacity** | Upgrade to Storage-Optimized (1B+ vectors) | -| **`query_vector` truncated by MCP tool** | MCP tool calls serialize arrays as JSON and can truncate large vectors (e.g. 1024-dim). Use `query_text` instead (for managed embedding indexes), or use the Databricks SDK/CLI to pass raw vectors | +| **`query_vector` truncated** | Large vectors (e.g. 1024-dim) can be truncated when serialized as JSON. Use `query_text` instead (for managed embedding indexes), or use the Databricks SDK to pass raw vectors | ## Embedding Models @@ -323,112 +323,6 @@ embedding_source_columns=[ ] ``` -## MCP Tools - -The following MCP tools are available for managing Vector Search infrastructure. For a full end-to-end walkthrough, see [end-to-end-rag.md](end-to-end-rag.md). - -### manage_vs_endpoint - Endpoint Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create endpoint (STANDARD or STORAGE_OPTIMIZED). Idempotent | name | -| `get` | Get endpoint details | name | -| `list` | List all endpoints | (none) | -| `delete` | Delete endpoint (indexes must be deleted first) | name | - -```python -# Create or update an endpoint -result = manage_vs_endpoint(action="create_or_update", name="my-vs-endpoint", endpoint_type="STANDARD") -# Returns {"name": "my-vs-endpoint", "endpoint_type": "STANDARD", "created": True} - -# List all endpoints -endpoints = manage_vs_endpoint(action="list") - -# Get specific endpoint -endpoint = manage_vs_endpoint(action="get", name="my-vs-endpoint") -``` - -### manage_vs_index - Index Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create index. Idempotent, auto-triggers sync for DELTA_SYNC | name, endpoint_name, primary_key | -| `get` | Get index details | name | -| `list` | List indexes. 
Optional endpoint_name filter | (none) | -| `delete` | Delete index | name | - -```python -# Create a Delta Sync index with managed embeddings -result = manage_vs_index( - action="create_or_update", - name="catalog.schema.my_index", - endpoint_name="my-vs-endpoint", - primary_key="id", - index_type="DELTA_SYNC", - delta_sync_index_spec={ - "source_table": "catalog.schema.docs", - "embedding_source_columns": [{"name": "content", "embedding_model_endpoint_name": "databricks-gte-large-en"}], - "pipeline_type": "TRIGGERED" - } -) - -# Get a specific index -index = manage_vs_index(action="get", name="catalog.schema.my_index") - -# List all indexes on an endpoint -indexes = manage_vs_index(action="list", endpoint_name="my-vs-endpoint") - -# List all indexes across all endpoints -all_indexes = manage_vs_index(action="list") -``` - -### query_vs_index - Query (Hot Path) - -Query index with `query_text`, `query_vector`, or hybrid (`query_type="HYBRID"`). Prefer `query_text` over `query_vector` β€” MCP tool calls can truncate large embedding arrays (1024-dim). - -```python -# Query an index -results = query_vs_index( - index_name="catalog.schema.my_index", - columns=["id", "content"], - query_text="machine learning best practices", - num_results=5 -) - -# Hybrid search (combines vector + keyword) -results = query_vs_index( - index_name="catalog.schema.my_index", - columns=["id", "content"], - query_text="SPARK-12345 memory error", - query_type="HYBRID", - num_results=10 -) -``` - -### manage_vs_data - Data Operations - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `upsert` | Insert/update records | index_name, inputs_json | -| `delete` | Delete by primary key | index_name, primary_keys | -| `scan` | Scan index contents | index_name | -| `sync` | Trigger sync for TRIGGERED indexes | index_name | - -```python -# Upsert data into a Direct Access index -manage_vs_data( - action="upsert", - index_name="catalog.schema.my_index", - inputs_json=[{"id": "doc1", "content": "...", "embedding": [0.1, 0.2, ...]}] -) - -# Trigger manual sync for a TRIGGERED pipeline index -manage_vs_data(action="sync", index_name="catalog.schema.my_index") - -# Scan index contents -manage_vs_data(action="scan", index_name="catalog.schema.my_index", num_results=100) -``` - ## Notes - **Storage-Optimized is newer** β€” better for most use cases unless you need <100ms latency @@ -436,7 +330,7 @@ manage_vs_data(action="scan", index_name="catalog.schema.my_index", num_results= - **Hybrid search** β€” available for both Delta Sync and Direct Access indexes - **`columns_to_sync` matters** β€” only synced columns are available in query results; include all columns you need - **Filter syntax differs by endpoint** β€” Standard uses dict-format filters, Storage-Optimized uses SQL-like string filters. 
Use the `databricks-vectorsearch` package's `filters` parameter which accepts both formats -- **Management vs runtime** β€” MCP tools above handle lifecycle management; for agent tool-calling at runtime, use `VectorSearchRetrieverTool` or the Databricks managed Vector Search MCP server +- **Management vs runtime** β€” CLI and SDK handle lifecycle management; for agent tool-calling at runtime, use `VectorSearchRetrieverTool` ## Related Skills diff --git a/databricks-skills/databricks-vector-search/end-to-end-rag.md b/databricks-skills/databricks-vector-search/end-to-end-rag.md index a3808d1b..4d020c83 100644 --- a/databricks-skills/databricks-vector-search/end-to-end-rag.md +++ b/databricks-skills/databricks-vector-search/end-to-end-rag.md @@ -2,16 +2,16 @@ Build a complete Retrieval-Augmented Generation pipeline: prepare documents, create a vector index, query it, and wire it into an agent. -## MCP Tools Used +## CLI Commands Used -| Tool | Step | -|------|------| -| `execute_sql` | Create source table, insert documents | -| `manage_vs_endpoint(action="create")` | Create compute endpoint | -| `manage_vs_index(action="create")` | Create Delta Sync index with managed embeddings | -| `manage_vs_index(action="sync")` | Trigger index sync | -| `manage_vs_index(action="get")` | Check index status | -| `query_vs_index` | Test similarity search | +| Command | Step | +|---------|------| +| `databricks sql execute` | Create source table, insert documents | +| `databricks vector-search endpoints create` | Create compute endpoint | +| `databricks vector-search indexes create-index` | Create Delta Sync index with managed embeddings | +| `databricks vector-search indexes sync-index` | Trigger index sync | +| `databricks vector-search indexes get-index` | Check index status | +| `databricks vector-search indexes query-index` | Test similarity search | --- @@ -34,10 +34,10 @@ INSERT INTO catalog.schema.knowledge_base VALUES ('doc-003', 'Delta Lake', 'Delta Lake is an open-source storage layer...', 'storage', current_timestamp()); ``` -Or via MCP: +Or via CLI: -```python -execute_sql(sql_query=""" +```bash +databricks sql execute --warehouse-id WAREHOUSE_ID --query " CREATE TABLE IF NOT EXISTS catalog.schema.knowledge_base ( doc_id STRING, title STRING, @@ -45,7 +45,7 @@ execute_sql(sql_query=""" category STRING, updated_at TIMESTAMP DEFAULT current_timestamp() ) -""") +" ``` ## Step 2: Create Vector Search Endpoint diff --git a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md index 7dc4b8c9..23385adc 100644 --- a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md +++ b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md @@ -4,7 +4,7 @@ Operational guidance for monitoring, cost optimization, capacity planning, and m ## Monitoring Endpoint Status -Use `manage_vs_endpoint(action="get")` (MCP tool) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. +Use `databricks vector-search endpoints get` (CLI) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. ### Endpoint fields @@ -34,7 +34,7 @@ print(f"Indexes: {endpoint.num_indexes}") ## Monitoring Index Status -Use `manage_vs_index(action="get")` (MCP tool) or `w.vector_search_indexes.get_index()` (SDK) to check index health. +Use `databricks vector-search indexes get-index` (CLI) or `w.vector_search_indexes.get_index()` (SDK) to check index health. 
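+
+For example, a minimal CLI sketch (the index name is a placeholder, and the exact argument form may vary by CLI version — confirm with `--help`):
+
+```bash
+# Inspect index health; check the status/ready fields in the JSON output
+databricks vector-search indexes get-index --index-name catalog.schema.my_index
+```
+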
### Index fields From e10a2c20c91f44cf84ec58bd2b5205abbea165e9 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:18:51 +0200 Subject: [PATCH 09/66] refactor(spark-declarative-pipelines): replace MCP tools with CLI commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename Option C from "MCP Tools" to "CLI" approach - Replace references/2-mcp-approach.md with 2-cli-approach.md (full rewrite) - Update Post-Run Validation section to use `databricks pipelines` CLI - Update all workflow references from MCP to CLI/SDK - Update 1-project-initialization.md reference πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../SKILL.md | 48 +++-- .../references/1-project-initialization.md | 2 +- .../references/2-cli-approach.md | 167 ++++++++++++++++++ .../references/2-mcp-approach.md | 163 ----------------- 4 files changed, 197 insertions(+), 183 deletions(-) create mode 100644 databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md delete mode 100644 databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index a1bdd7c3..3fe77e2a 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -83,15 +83,14 @@ Use this when the pipeline is **part of an existing DAB project**: β†’ See [1-project-initialization.md](references/1-project-initialization.md) for adding pipelines to existing bundles -### Option C: Rapid Iteration with MCP Tools (no bundle management) +### Option C: Rapid Iteration with CLI (no bundle management) Use this when you need to **quickly create, test, and iterate** on a pipeline without managing bundle files: - User wants to "just run a pipeline and see if it works" - Part of a larger demo where bundle is managed separately, or the DAB bundle will be created at the end as you want to quickly test the project first - Prototyping or experimenting with pipeline logic -- User explicitly asks to use MCP tools -β†’ See [2-mcp-approach.md](references/2-mcp-approach.md) for MCP-based workflow +β†’ See [2-cli-approach.md](references/2-cli-approach.md) for CLI-based workflow --- @@ -101,7 +100,7 @@ Before writing pipeline code, make sure you have: ``` - [ ] Language selected: Python or SQL - [ ] Read the syntax basics: **SQL**: Always Read [sql/1-syntax-basics.md](references/sql/1-syntax-basics.md), **Python**: Always Read [python/1-syntax-basics.md](references/python/1-syntax-basics.md) -- [ ] Workflow chosen: Standalone DAB / Existing DAB / MCP iteration +- [ ] Workflow chosen: Standalone DAB / Existing DAB / CLI iteration - [ ] Compute type: serverless (default) or classic - [ ] Schema strategy: single schema with prefixes vs. 
multi-schema - [ ] Consider [Multi-Schema Patterns](#multi-schema-patterns) and [Modern Defaults](#modern-defaults) @@ -179,7 +178,7 @@ After choosing your workflow (see [Choose Your Workflow](#choose-your-workflow)) | Task | Guide | |------|-------| | **Setting up standalone pipeline project** | [1-project-initialization.md](references/1-project-initialization.md) | -| **Rapid iteration with MCP tools** | [2-mcp-approach.md](references/2-mcp-approach.md) | +| **Rapid iteration with CLI** | [2-cli-approach.md](references/2-cli-approach.md) | | **Advanced configuration** | [3-advanced-configuration.md](references/3-advanced-configuration.md) | | **Migrating from DLT** | [4-dlt-migration.md](references/4-dlt-migration.md) | @@ -248,7 +247,7 @@ For detailed syntax, see [sql/1-syntax-basics.md](references/sql/1-syntax-basics ### Project Structure - **Standalone pipeline projects**: Use `databricks pipelines init` for Asset Bundle with multi-environment support - **Pipeline in existing bundle**: Add to `resources/*.pipeline.yml` -- **Rapid iteration/prototyping**: Use MCP tools, formalize in bundle later +- **Rapid iteration/prototyping**: Use CLI/SDK, formalize in bundle later - See **[1-project-initialization.md](references/1-project-initialization.md)** for project setup details ### Minimal pipeline config pointers @@ -278,29 +277,40 @@ For detailed examples, see **[3-advanced-configuration.md](references/3-advanced ## Post-Run Validation (Required) -After running a pipeline (via DAB or MCP), you **MUST** validate both the execution status AND the actual data. +After running a pipeline (via DAB or CLI), you **MUST** validate both the execution status AND the actual data. ### Step 1: Check Pipeline Execution Status -**From MCP (`manage_pipeline(action="run")` or `manage_pipeline(action="create_or_update")`):** -- Check `result["success"]` and `result["state"]` -- If failed, check `result["message"]` and `result["errors"]` for details +```bash +# Get pipeline status and details +databricks pipelines get --pipeline-id + +# Get recent events/logs +databricks pipelines list-pipeline-events --pipeline-id +``` **From DAB (`databricks bundle run`):** - Check the command output for success/failure -- Use `manage_pipeline(action="get", pipeline_id=...)` to get detailed status and recent events +- Use `databricks pipelines get --pipeline-id ...` to get detailed status and recent events ### Step 2: Validate Output Data Even if the pipeline reports SUCCESS, you **MUST** verify the data is correct: -``` -# MCP Tool: get_table_stats_and_schema - validates schema, row counts, and stats -get_table_stats_and_schema( - catalog="my_catalog", - schema="my_schema", - table_names=["bronze_*", "silver_*", "gold_*"] # Use glob patterns -) +```bash +# Check table schema and stats +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +DESCRIBE TABLE EXTENDED my_catalog.my_schema.bronze_orders; +" + +# Check row counts +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +SELECT 'bronze_orders' as table_name, COUNT(*) as row_count FROM my_catalog.my_schema.bronze_orders +UNION ALL +SELECT 'silver_orders', COUNT(*) FROM my_catalog.my_schema.silver_orders +UNION ALL +SELECT 'gold_summary', COUNT(*) FROM my_catalog.my_schema.gold_summary; +" ``` **Check for:** @@ -314,7 +324,7 @@ get_table_stats_and_schema( If validation reveals problems, trace upstream to find the root cause: 1. **Start from the problematic table** - identify what's wrong (empty, wrong counts, bad data) -2. 
**Check its source table** - use `get_table_stats_and_schema` on the upstream table +2. **Check its source table** - run `DESCRIBE` and `COUNT(*)` on the upstream table 3. **Trace back to bronze** - continue until you find where the issue originates 4. **Common causes:** - Bronze empty β†’ source files missing or path incorrect diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md index fbab69b3..fcea0c45 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md @@ -576,7 +576,7 @@ For technical best practices (Liquid Clustering, serverless, etc.), see **[SKILL ## References -- **[SKILL.md](../SKILL.md)** - Main development workflow and MCP tools +- **[SKILL.md](../SKILL.md)** - Main development workflow and CLI commands - **[Declarative Automation Bundles (DABs) Documentation](https://docs.databricks.com/dev-tools/bundles/)** - Official bundle reference - **[Pipeline Configuration Reference](https://docs.databricks.com/aws/en/ldp/configure-pipeline)** - Pipeline settings - **[Databricks CLI Reference](https://docs.databricks.com/dev-tools/cli/)** - CLI commands and options diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md new file mode 100644 index 00000000..848cd2ea --- /dev/null +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md @@ -0,0 +1,167 @@ +# Rapid Pipeline Iteration with CLI + +Use CLI commands to create, run, and iterate on **SDP pipelines**. This is the fastest approach for prototyping without managing bundle files. + +**IMPORTANT: Default to serverless pipelines.** Only use classic clusters if user explicitly requires R language, Spark RDD APIs, or JAR libraries. + +### Step 1: Write Pipeline Files Locally + +Create `.sql` or `.py` files in a local folder. For syntax examples, see: +- [sql/1-syntax-basics.md](sql/1-syntax-basics.md) for SQL syntax +- [python/1-syntax-basics.md](python/1-syntax-basics.md) for Python syntax + +### Step 2: Upload to Databricks Workspace + +```bash +# Upload local folder to workspace +databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/my_pipeline +``` + +### Step 3: Create Pipeline + +```bash +# Create pipeline with JSON config +databricks pipelines create --json '{ + "name": "my_orders_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [ + {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql"}}, + {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/silver/clean_orders.sql"}}, + {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/gold/daily_summary.sql"}} + ], + "development": true +}' +``` + +Save the returned `pipeline_id` for subsequent operations. 
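+
+One way to capture it for the follow-up commands is shown below (assumes `jq` is installed; `pipeline.json` is a placeholder file containing the config above):
+
+```bash
+# Capture the new pipeline id from the create response
+PIPELINE_ID=$(databricks pipelines create --json @pipeline.json | jq -r '.pipeline_id')
+echo "Created pipeline: $PIPELINE_ID"
+```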
+ +### Step 4: Run Pipeline + +```bash +# Start a full refresh run +databricks pipelines start-update --pipeline-id --full-refresh + +# Check run status +databricks pipelines get --pipeline-id +``` + +### Step 5: Validate Results + +**On Success** - Verify tables were created with correct data: + +```bash +# Check table schemas and row counts +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +DESCRIBE TABLE EXTENDED my_catalog.my_schema.bronze_orders; +" + +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +SELECT COUNT(*) as row_count FROM my_catalog.my_schema.bronze_orders; +" +``` + +Or use Python for detailed stats: +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +# Get table info +table = w.tables.get("my_catalog.my_schema.bronze_orders") +print(f"Columns: {len(table.columns)}") +print(f"Created: {table.created_at}") +``` + +**On Failure** - Get pipeline events and errors: + +```bash +# Get pipeline details with recent events +databricks pipelines get --pipeline-id + +# Get specific run events +databricks pipelines list-pipeline-events --pipeline-id +``` + +### Step 6: Iterate Until Working + +1. Review errors from pipeline status or events +2. Fix issues in local files +3. Re-upload: `databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/my_pipeline --overwrite` +4. Update and run: `databricks pipelines update --pipeline-id --json '...'` then `databricks pipelines start-update --pipeline-id ` +5. Repeat until pipeline completes successfully + +--- + +## Quick Reference: CLI Commands + +### Pipeline Lifecycle + +| Command | Description | +|---------|-------------| +| `databricks pipelines create --json '{...}'` | Create new pipeline | +| `databricks pipelines get --pipeline-id ID` | Get pipeline details and status | +| `databricks pipelines update --pipeline-id ID --json '{...}'` | Update pipeline config | +| `databricks pipelines delete --pipeline-id ID` | Delete a pipeline | +| `databricks pipelines list` | List all pipelines | + +### Run Management + +| Command | Description | +|---------|-------------| +| `databricks pipelines start-update --pipeline-id ID` | Start pipeline update | +| `databricks pipelines start-update --pipeline-id ID --full-refresh` | Start with full refresh | +| `databricks pipelines stop --pipeline-id ID` | Stop running pipeline | +| `databricks pipelines list-pipeline-events --pipeline-id ID` | Get events/logs | +| `databricks pipelines list-updates --pipeline-id ID` | List recent runs | + +### Supporting Commands + +| Command | Description | +|---------|-------------| +| `databricks workspace import-dir` | Upload files/folders to workspace | +| `databricks workspace ls` | List workspace files | +| `databricks sql execute` | Run ad-hoc SQL to inspect data | + +--- + +## Python SDK Alternative + +For more programmatic control, use the Databricks SDK: + +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +# Create pipeline +pipeline = w.pipelines.create( + name="my_orders_pipeline", + catalog="my_catalog", + schema="my_schema", + serverless=True, + libraries=[ + {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql"}} + ], + development=True +) +print(f"Created pipeline: {pipeline.pipeline_id}") + +# Start update +update = w.pipelines.start_update( + pipeline_id=pipeline.pipeline_id, + full_refresh=True +) + +# Poll for completion +import time +while True: + status = 
w.pipelines.get(pipeline_id=pipeline.pipeline_id) + if status.state in ["IDLE", "FAILED"]: + print(f"Pipeline state: {status.state}") + break + time.sleep(10) +``` + +--- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md deleted file mode 100644 index 87e0ed70..00000000 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md +++ /dev/null @@ -1,163 +0,0 @@ -Use MCP tools to create, run, and iterate on **SDP pipelines**. The **primary tool is `manage_pipeline`** which handles the entire lifecycle. - -**IMPORTANT: Default to serverless pipelines.** Only use classic clusters if user explicitly requires R language, Spark RDD APIs, or JAR libraries. - -### Step 1: Write Pipeline Files Locally - -Create `.sql` or `.py` files in a local folder. For syntax examples, see: -- [sql/1-syntax-basics.md](sql/1-syntax-basics.md) for SQL syntax -- [python/1-syntax-basics.md](python/1-syntax-basics.md) for Python syntax - -### Step 2: Upload to Databricks Workspace - -``` -# MCP Tool: manage_workspace_files -manage_workspace_files( - action="upload", - local_path="/path/to/my_pipeline", - workspace_path="/Workspace/Users/user@example.com/my_pipeline" -) -``` - -### Step 3: Create/Update and Run Pipeline - -Use **`manage_pipeline`** with `action="create_or_update"` to manage the resource: - -``` -# MCP Tool: manage_pipeline -manage_pipeline( - action="create_or_update", - name="my_orders_pipeline", - root_path="/Workspace/Users/user@example.com/my_pipeline", - catalog="my_catalog", - schema="my_schema", - workspace_file_paths=[ - "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql", - "/Workspace/Users/user@example.com/my_pipeline/silver/clean_orders.sql", - "/Workspace/Users/user@example.com/my_pipeline/gold/daily_summary.sql" - ], - start_run=True, # Automatically run after create/update - wait_for_completion=True, # Wait for run to finish - full_refresh=True # Reprocess all data -) -``` - -**Result contains actionable information:** -```json -{ - "success": true, - "pipeline_id": "abc-123", - "pipeline_name": "my_orders_pipeline", - "created": true, - "state": "COMPLETED", - "catalog": "my_catalog", - "schema": "my_schema", - "duration_seconds": 45.2, - "message": "Pipeline created and completed successfully in 45.2s. 
Tables written to my_catalog.my_schema", - "error_message": null, - "errors": [] -} -``` - -### Alternative: Run Pipeline Separately - -If you want to run an existing pipeline or control the run separately: - -``` -# MCP Tool: manage_pipeline_run -manage_pipeline_run( - action="start", - pipeline_id="", - full_refresh=True, - wait=True, # Wait for completion - timeout=1800 # 30 minute timeout -) -``` - -### Step 4: Validate Results - -**On Success** - Use `get_table_stats_and_schema` to verify tables (NOT manual SQL COUNT queries): -``` -# MCP Tool: get_table_stats_and_schema -get_table_stats_and_schema( - catalog="my_catalog", - schema="my_schema", - table_names=["bronze_orders", "silver_orders", "gold_daily_summary"] -) -# Returns schema, row counts, and column stats for all tables in one call -``` - -**On Failure** - Check `run_result["message"]` for suggested next steps, then get detailed errors: -``` -# MCP Tool: manage_pipeline -manage_pipeline(action="get", pipeline_id="") -# Returns pipeline details enriched with recent events and error messages - -# Or get events/logs directly: -# MCP Tool: manage_pipeline_run -manage_pipeline_run( - action="get_events", - pipeline_id="", - event_log_level="ERROR", # ERROR, WARN, or INFO - max_results=10 -) -``` - -### Step 5: Iterate Until Working - -1. Review errors from run result or `manage_pipeline(action="get")` -2. Fix issues in local files -3. Re-upload with `manage_workspace_files(action="upload")` -4. Run `manage_pipeline(action="create_or_update", start_run=True)` again (it will update, not recreate) -5. Repeat until `result["success"] == True` - ---- - -## Quick Reference: MCP Tools - -### manage_pipeline - Pipeline Lifecycle - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create` | Create new pipeline | name, root_path, catalog, schema, workspace_file_paths | -| `create_or_update` | **Main entry point.** Idempotent create/update, optionally run | name, root_path, catalog, schema, workspace_file_paths | -| `get` | Get pipeline details by ID | pipeline_id | -| `update` | Update pipeline config | pipeline_id + fields to change | -| `delete` | Delete a pipeline | pipeline_id | -| `find_by_name` | Find pipeline by name | name | - -**create_or_update options:** -- `start_run=True`: Automatically run after create/update -- `wait_for_completion=True`: Block until run finishes -- `full_refresh=True`: Reprocess all data (default) -- `timeout=1800`: Max wait time in seconds - -### manage_pipeline_run - Run Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `start` | Start pipeline update | pipeline_id | -| `get` | Get run status | pipeline_id, update_id | -| `stop` | Stop running pipeline | pipeline_id | -| `get_events` | Get events/logs for debugging | pipeline_id | - -**start options:** -- `wait=True`: Block until complete (default) -- `full_refresh=True`: Reprocess all data -- `validate_only=True`: Dry run without writing data -- `refresh_selection=["table1", "table2"]`: Refresh specific tables only - -**get_events options:** -- `event_log_level`: "ERROR", "WARN" (default), "INFO" -- `max_results`: Number of events (default 5) -- `update_id`: Filter to specific run - -### Supporting Tools - -| Tool | Description | -|------|-------------| -| `manage_workspace_files(action="upload")` | Upload files/folders to workspace | -| `get_table_stats_and_schema` | **Use this to validate tables** - returns schema, row counts, and stats in one call | -| 
`execute_sql` | Run ad-hoc SQL to inspect actual data content (not for row counts) | - ---- From bee86412e78e4929495f440b5253cd1791c4758f Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:22:03 +0200 Subject: [PATCH 10/66] refactor(skills): replace MCP tools with CLI/SQL across remaining skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - databricks-config: Rewrite to use `databricks auth` CLI commands - databricks-docs: Update references from MCP to CLI/SDK - databricks-metric-views: Replace MCP tools with SQL CREATE/DESCRIBE commands - databricks-execution-compute: Replace MCP tools with CLI job commands - databricks-unity-catalog/6-volumes: Replace MCP tools with `databricks fs` CLI - databricks-unity-catalog/7-data-profiling: Replace MCP tools with SQL QUALITY MONITOR πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-config/SKILL.md | 98 +++++++++++-- databricks-skills/databricks-docs/SKILL.md | 10 +- .../databricks-execution-compute/SKILL.md | 54 +++++--- .../references/1-databricks-connect.md | 2 +- .../references/2-serverless-job.md | 2 +- .../references/3-interactive-cluster.md | 2 +- .../databricks-metric-views/SKILL.md | 130 ++++++++++-------- .../databricks-metric-views/patterns.md | 124 +++++++++-------- .../databricks-unity-catalog/6-volumes.md | 22 ++- .../7-data-profiling.md | 61 ++++---- 10 files changed, 304 insertions(+), 201 deletions(-) diff --git a/databricks-skills/databricks-config/SKILL.md b/databricks-skills/databricks-config/SKILL.md index 118713d1..58441f4a 100644 --- a/databricks-skills/databricks-config/SKILL.md +++ b/databricks-skills/databricks-config/SKILL.md @@ -3,20 +3,96 @@ name: databricks-config description: "Manage Databricks workspace connections: check current workspace, switch profiles, list available workspaces, or authenticate to a new workspace. Use when the user mentions \"switch workspace\", \"which workspace\", \"current profile\", \"databrickscfg\", \"connect to workspace\", or \"databricks auth\"." --- -Use the `manage_workspace` MCP tool for all workspace operations. Do NOT edit `~/.databrickscfg`, use Bash, or use the Databricks CLI. +Use the Databricks CLI for all workspace operations. -## Steps +## CLI Commands -1. Call `ToolSearch` with query `select:mcp__databricks__manage_workspace` to load the tool. +### Check Current Workspace -2. Map user intent to action: - - status / which workspace / current β†’ `action="status"` - - list / available workspaces β†’ `action="list"` - - switch to X β†’ call `list` first to find the profile name, then `action="switch", profile=""` (or `host=""` if a URL was given) - - login / connect / authenticate β†’ `action="login", host=""` +```bash +# Show current configuration status +databricks auth describe -3. Call `mcp__databricks__manage_workspace` with the action and any parameters. +# Show current workspace URL +databricks config get --key host -4. Present the result. For `status`/`switch`/`login`: show host, profile, username. For `list`: formatted table with the active profile marked. +# Show current profile +databricks config get --key profile +``` -> **Note:** The switch is session-scoped β€” it resets on MCP server restart. For permanent profile setup, use `databricks auth login -p ` and update `~/.databrickscfg` with `cluster_id` or `serverless_compute_id = auto`. 
+### List Available Profiles + +```bash +# List all configured profiles from ~/.databrickscfg +cat ~/.databrickscfg | grep '^\[' | tr -d '[]' +``` + +### Switch Workspace/Profile + +```bash +# Use a different profile for subsequent commands +databricks --profile auth describe + +# Or set environment variable for the session +export DATABRICKS_CONFIG_PROFILE= +``` + +### Authenticate to New Workspace + +```bash +# OAuth login (opens browser) +databricks auth login --host https://your-workspace.cloud.databricks.com + +# OAuth login with profile name +databricks auth login --host https://your-workspace.cloud.databricks.com --profile my-profile + +# Configure with PAT +databricks configure --profile my-profile +``` + +### Verify Authentication + +```bash +# Check auth status +databricks auth describe + +# Test by listing clusters +databricks clusters list +``` + +## ~/.databrickscfg Format + +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com +cluster_id = 0123-456789-abc123 +# or +serverless_compute_id = auto + +[production] +host = https://prod-workspace.cloud.databricks.com +token = dapi... + +[development] +host = https://dev-workspace.cloud.databricks.com +``` + +## Python SDK + +```python +from databricks.sdk import WorkspaceClient + +# Use default profile +w = WorkspaceClient() + +# Use specific profile +w = WorkspaceClient(profile="production") + +# Use specific host +w = WorkspaceClient(host="https://your-workspace.cloud.databricks.com") + +# Check current user +print(w.current_user.me().user_name) +``` + +> **Note:** Profile changes via environment variables or CLI flags are session-scoped. For permanent profile setup, use `databricks auth login -p ` and update `~/.databrickscfg` with `cluster_id` or `serverless_compute_id = auto`. diff --git a/databricks-skills/databricks-docs/SKILL.md b/databricks-skills/databricks-docs/SKILL.md index ceca11e0..8e9d68d5 100644 --- a/databricks-skills/databricks-docs/SKILL.md +++ b/databricks-skills/databricks-docs/SKILL.md @@ -5,7 +5,7 @@ description: "Databricks documentation reference via llms.txt index. Use when ot # Databricks Documentation Reference -This skill provides access to the complete Databricks documentation index via llms.txt - use it as a **reference resource** to supplement other skills and inform your use of MCP tools. +This skill provides access to the complete Databricks documentation index via llms.txt - use it as a **reference resource** to supplement other skills. ## Role of This Skill @@ -13,10 +13,10 @@ This is a **reference skill**, not an action skill. Use it to: - Look up documentation when other skills don't cover a topic - Get authoritative guidance on Databricks concepts and APIs -- Find detailed information to inform how you use MCP tools +- Find detailed information to inform CLI commands and SDK usage - Discover features and capabilities you may not know about -**Always prefer using MCP tools for actions** (execute_sql, manage_pipeline, etc.) and **load specific skills for workflows** (databricks-python-sdk, databricks-spark-declarative-pipelines, etc.). Use this skill when you need reference documentation. +**Always prefer using CLI/SDK for actions** and **load specific skills for workflows** (databricks-python-sdk, databricks-spark-declarative-pipelines, etc.). Use this skill when you need reference documentation. ## How to Use @@ -28,7 +28,7 @@ Use WebFetch to retrieve this index, then: 1. Search for relevant sections/links 2. Fetch specific documentation pages for detailed guidance -3. 
Apply what you learn using the appropriate MCP tools +3. Apply what you learn using the appropriate CLI commands or SDK ## Documentation Structure @@ -47,7 +47,7 @@ The llms.txt file is organized by category: 1. Load `databricks-spark-declarative-pipelines` skill for workflow patterns 2. Use this skill to fetch docs if you need clarification on specific DLT features -3. Use `manage_pipeline(action="create_or_update")` MCP tool to actually create the pipeline +3. Use `databricks pipelines create` CLI command to create the pipeline **Scenario:** User asks about an unfamiliar Databricks feature diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md index c3518385..770d139a 100644 --- a/databricks-skills/databricks-execution-compute/SKILL.md +++ b/databricks-skills/databricks-execution-compute/SKILL.md @@ -42,7 +42,7 @@ Scala/R? β†’ Interactive Cluster (list and ask which one to use) **Read the reference file for your chosen mode before proceeding.** -### Databricks Connect (no MCP tool, run locally) β†’ [reference](references/1-databricks-connect.md) +### Databricks Connect (run locally) β†’ [reference](references/1-databricks-connect.md) ```bash python my_spark_script.py @@ -50,30 +50,48 @@ python my_spark_script.py ### Serverless Job β†’ [reference](references/2-serverless-job.md) -```python -execute_code(file_path="/path/to/script.py") +```bash +# Create and run a job with serverless compute +databricks jobs create --json '{ + "name": "my-script-job", + "tasks": [{ + "task_key": "main", + "spark_python_task": {"python_file": "/Workspace/Users/me/script.py"}, + "environment_key": "default" + }], + "environments": [{"environment_key": "default", "spec": {"client": "4"}}] +}' + +# Run the job +databricks jobs run-now --job-id JOB_ID ``` ### Interactive Cluster β†’ [reference](references/3-interactive-cluster.md) -```python -# Check for running clusters first (or use the one instructed) -list_compute(resource="clusters") -# Ask the customer which one to use - -# Run code, reuse context_id for follow-up MCP call -result = execute_code(code="...", compute_type="cluster", cluster_id="...") -execute_code(code="...", context_id=result["context_id"], cluster_id=result["cluster_id"]) +```bash +# List running clusters +databricks clusters list --output json | jq '.[] | select(.state == "RUNNING")' + +# Run a notebook or script on a cluster +databricks workspace import /Workspace/Users/me/script.py --file ./script.py +databricks jobs create --json '{ + "name": "cluster-job", + "tasks": [{ + "task_key": "main", + "existing_cluster_id": "CLUSTER_ID", + "spark_python_task": {"python_file": "/Workspace/Users/me/script.py"} + }] +}' ``` -## MCP Tools +## CLI Commands -| Tool | For | Purpose | -|------|-----|---------| -| `execute_code` | Serverless, Interactive | Run code remotely | -| `list_compute` | Interactive | List clusters, check status, auto-select running cluster | -| `manage_cluster` | Interactive | Create, start, terminate, delete. **COSTLY:** `start` takes 3-8 minβ€”ask user | -| `manage_sql_warehouse` | SQL | Create, modify, delete SQL warehouses | +| Command | For | Purpose | +|---------|-----|---------| +| `databricks jobs create/run-now` | Serverless, Cluster | Run code remotely | +| `databricks clusters list` | Interactive | List clusters, check status | +| `databricks clusters create/start/delete` | Interactive | Manage clusters. 
**COSTLY:** `start` takes 3-8 min | +| `databricks sql warehouses create/list` | SQL | Manage SQL warehouses | ## Related Skills diff --git a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md index 838d2a7d..8b0d74ad 100644 --- a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md +++ b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md @@ -68,5 +68,5 @@ Switch to **[Serverless Job](2-serverless-job.md)** when: - Non-Spark Python code (pure sklearn, pytorch, etc.) Switch to **[Interactive Cluster](3-interactive-cluster.md)** when: -- Need state across multiple separate MCP tool calls +- Need state across multiple separate tool calls - Need Scala or R support diff --git a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md index 4be8801c..6cc29fd9 100644 --- a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md +++ b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md @@ -72,5 +72,5 @@ Switch to **[Databricks Connect](1-databricks-connect.md)** when: - Need local debugging with breakpoints Switch to **[Interactive Cluster](3-interactive-cluster.md)** when: -- Need state across multiple MCP tool calls +- Need state across multiple tool calls - Need Scala or R support diff --git a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md index aa73ea90..1ed6fd36 100644 --- a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md +++ b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md @@ -1,6 +1,6 @@ # Interactive Cluster Execution -**Use when:** You have an existing running cluster and need to preserve state across multiple MCP tool calls, or need Scala/R support. +**Use when:** You have an existing running cluster and need to preserve state across multiple tool calls, or need Scala/R support. 
## When to Choose Interactive Cluster diff --git a/databricks-skills/databricks-metric-views/SKILL.md b/databricks-skills/databricks-metric-views/SKILL.md index 3cc4b427..94db66d0 100644 --- a/databricks-skills/databricks-metric-views/SKILL.md +++ b/databricks-skills/databricks-metric-views/SKILL.md @@ -95,72 +95,88 @@ ORDER BY ALL | YAML Syntax | [yaml-reference.md](yaml-reference.md) | Complete YAML spec: dimensions, measures, joins, materialization | | Patterns & Examples | [patterns.md](patterns.md) | Common patterns: star schema, snowflake, filtered measures, window measures, ratios | -## MCP Tools - -Use the `manage_metric_views` tool for all metric view operations: - -| Action | Description | -|--------|-------------| -| `create` | Create a metric view with dimensions and measures | -| `alter` | Update a metric view's YAML definition | -| `describe` | Get the full definition and metadata | -| `query` | Query measures grouped by dimensions | -| `drop` | Drop a metric view | -| `grant` | Grant SELECT privileges to users/groups | - -### Create via MCP - -```python -manage_metric_views( - action="create", - full_name="catalog.schema.orders_metrics", - source="catalog.schema.orders", - or_replace=True, - comment="Orders KPIs for sales analysis", - filter_expr="order_date > '2020-01-01'", - dimensions=[ - {"name": "Order Month", "expr": "DATE_TRUNC('MONTH', order_date)", "comment": "Month of order"}, - {"name": "Order Status", "expr": "status"}, - ], - measures=[ - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Total Revenue", "expr": "SUM(total_price)", "comment": "Sum of total price"}, - ], -) +## SQL Operations + +### Create Metric View + +```sql +CREATE OR REPLACE VIEW catalog.schema.orders_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + comment: "Orders KPIs for sales analysis" + source: catalog.schema.orders + filter: order_date > '2020-01-01' + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + comment: "Month of order" + - name: Order Status + expr: status + measures: + - name: Order Count + expr: COUNT(1) + - name: Total Revenue + expr: SUM(total_price) + comment: "Sum of total price" +$$; ``` -### Query via MCP - -```python -manage_metric_views( - action="query", - full_name="catalog.schema.orders_metrics", - query_measures=["Total Revenue", "Order Count"], - query_dimensions=["Order Month"], - where="extract(year FROM `Order Month`) = 2024", - order_by="ALL", - limit=100, -) +### Query Metric View + +```sql +SELECT + `Order Month`, + MEASURE(`Total Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count +FROM catalog.schema.orders_metrics +WHERE extract(year FROM `Order Month`) = 2024 +GROUP BY ALL +ORDER BY ALL +LIMIT 100; ``` -### Describe via MCP +### Describe Metric View -```python -manage_metric_views( - action="describe", - full_name="catalog.schema.orders_metrics", -) +```sql +DESCRIBE TABLE EXTENDED catalog.schema.orders_metrics; + +-- Or get YAML definition +SHOW CREATE TABLE catalog.schema.orders_metrics; ``` ### Grant Access -```python -manage_metric_views( - action="grant", - full_name="catalog.schema.orders_metrics", - principal="data-consumers", - privileges=["SELECT"], -) +```sql +GRANT SELECT ON VIEW catalog.schema.orders_metrics TO `data-consumers`; +``` + +### Drop Metric View + +```sql +DROP VIEW IF EXISTS catalog.schema.orders_metrics; +``` + +### CLI Execution + +```bash +# Execute SQL via CLI +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +CREATE OR REPLACE VIEW 
catalog.schema.orders_metrics +WITH METRICS +LANGUAGE YAML +AS \$\$ + version: 1.1 + source: catalog.schema.orders + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Revenue + expr: SUM(total_price) +\$\$ +" ``` ## YAML Spec Quick Reference diff --git a/databricks-skills/databricks-metric-views/patterns.md b/databricks-skills/databricks-metric-views/patterns.md index 1f067f4c..c109abcc 100644 --- a/databricks-skills/databricks-metric-views/patterns.md +++ b/databricks-skills/databricks-metric-views/patterns.md @@ -579,73 +579,81 @@ GROUP BY ALL ORDER BY ALL ``` -## MCP Tool Examples +## SQL Examples ### Create with joins -```python -manage_metric_views( - action="create", - full_name="catalog.schema.sales_metrics", - source="catalog.schema.fact_sales", - or_replace=True, - joins=[ - { - "name": "customer", - "source": "catalog.schema.dim_customer", - "on": "source.customer_id = customer.id" - }, - { - "name": "product", - "source": "catalog.schema.dim_product", - "on": "source.product_id = product.id" - } - ], - dimensions=[ - {"name": "Customer Segment", "expr": "customer.segment"}, - {"name": "Product Category", "expr": "product.category"}, - {"name": "Sale Month", "expr": "DATE_TRUNC('MONTH', source.sale_date)"}, - ], - measures=[ - {"name": "Total Revenue", "expr": "SUM(source.amount)"}, - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Unique Customers", "expr": "COUNT(DISTINCT source.customer_id)"}, - ], -) +```sql +CREATE OR REPLACE VIEW catalog.schema.sales_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + source: catalog.schema.fact_sales + joins: + - name: customer + source: catalog.schema.dim_customer + on: source.customer_id = customer.id + - name: product + source: catalog.schema.dim_product + on: source.product_id = product.id + dimensions: + - name: Customer Segment + expr: customer.segment + - name: Product Category + expr: product.category + - name: Sale Month + expr: DATE_TRUNC('MONTH', source.sale_date) + measures: + - name: Total Revenue + expr: SUM(source.amount) + - name: Order Count + expr: COUNT(1) + - name: Unique Customers + expr: COUNT(DISTINCT source.customer_id) +$$; ``` ### Alter to add a new measure -```python -manage_metric_views( - action="alter", - full_name="catalog.schema.sales_metrics", - source="catalog.schema.fact_sales", - joins=[ - {"name": "customer", "source": "catalog.schema.dim_customer", "on": "source.customer_id = customer.id"}, - ], - dimensions=[ - {"name": "Customer Segment", "expr": "customer.segment"}, - {"name": "Sale Month", "expr": "DATE_TRUNC('MONTH', source.sale_date)"}, - ], - measures=[ - {"name": "Total Revenue", "expr": "SUM(source.amount)"}, - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Average Order Value", "expr": "AVG(source.amount)"}, # New measure - ], -) +```sql +-- Use CREATE OR REPLACE to update the metric view +CREATE OR REPLACE VIEW catalog.schema.sales_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + source: catalog.schema.fact_sales + joins: + - name: customer + source: catalog.schema.dim_customer + on: source.customer_id = customer.id + dimensions: + - name: Customer Segment + expr: customer.segment + - name: Sale Month + expr: DATE_TRUNC('MONTH', source.sale_date) + measures: + - name: Total Revenue + expr: SUM(source.amount) + - name: Order Count + expr: COUNT(1) + - name: Average Order Value + expr: AVG(source.amount) +$$; ``` ### Query with filters -```python -manage_metric_views( - action="query", - 
full_name="catalog.schema.sales_metrics", - query_measures=["Total Revenue", "Order Count"], - query_dimensions=["Customer Segment", "Sale Month"], - where="`Customer Segment` = 'Enterprise'", - order_by="ALL", - limit=50, -) +```sql +SELECT + `Customer Segment`, + `Sale Month`, + MEASURE(`Total Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count +FROM catalog.schema.sales_metrics +WHERE `Customer Segment` = 'Enterprise' +GROUP BY ALL +ORDER BY ALL +LIMIT 50; ``` diff --git a/databricks-skills/databricks-unity-catalog/6-volumes.md b/databricks-skills/databricks-unity-catalog/6-volumes.md index 497b6090..179baa67 100644 --- a/databricks-skills/databricks-unity-catalog/6-volumes.md +++ b/databricks-skills/databricks-unity-catalog/6-volumes.md @@ -37,18 +37,16 @@ All volume operations use the path format: --- -## MCP Tools - -| Tool | Usage | -|------|-------| -| `list_volume_files` | `list_volume_files(volume_path="/Volumes/catalog/schema/volume/path/")` | -| `get_volume_folder_details` | `get_volume_folder_details(volume_path="catalog/schema/volume/path", format="parquet")` - schema, row counts, stats | -| `upload_to_volume` | `upload_to_volume(local_path="/tmp/data/*", volume_path="/Volumes/.../dest")` - supports files, folders, globs | -| `download_from_volume` | `download_from_volume(volume_path="/Volumes/.../file.csv", local_path="/tmp/file.csv")` | -| `create_volume_directory` | `create_volume_directory(volume_path="/Volumes/.../new_folder")` - creates parents like `mkdir -p` | -| `delete_volume_file` | `delete_volume_file(volume_path="/Volumes/.../file.csv")` | -| `delete_volume_directory` | `delete_volume_directory(volume_path="/Volumes/.../folder")` - directory must be empty | -| `get_volume_file_info` | `get_volume_file_info(volume_path="/Volumes/.../file.csv")` - returns size, modified date | +## CLI Commands + +| Command | Description | +|---------|-------------| +| `databricks fs ls /Volumes/catalog/schema/volume/path/` | List files in a volume | +| `databricks fs cp /tmp/data/* /Volumes/.../dest --recursive` | Upload files/folders to volume | +| `databricks fs cp /Volumes/.../file.csv /tmp/file.csv` | Download files from volume | +| `databricks fs mkdirs /Volumes/.../new_folder` | Create directory (like `mkdir -p`) | +| `databricks fs rm /Volumes/.../file.csv` | Delete file | +| `databricks fs rm /Volumes/.../folder --recursive` | Delete directory recursively | --- diff --git a/databricks-skills/databricks-unity-catalog/7-data-profiling.md b/databricks-skills/databricks-unity-catalog/7-data-profiling.md index 23a2b62f..3103adf0 100644 --- a/databricks-skills/databricks-unity-catalog/7-data-profiling.md +++ b/databricks-skills/databricks-unity-catalog/7-data-profiling.md @@ -36,55 +36,42 @@ Supported `AggregationGranularity` values: `AGGREGATION_GRANULARITY_5_MINUTES`, --- -## MCP Tools +## CLI & SQL Commands -Use the `manage_uc_monitors` tool for all monitor operations: +### Create a Monitor (SQL) -| Action | Description | -|--------|-------------| -| `create` | Create a quality monitor on a table | -| `get` | Get monitor details and status | -| `run_refresh` | Trigger a metric refresh | -| `list_refreshes` | List refresh history | -| `delete` | Delete the monitor (assets are not deleted) | - -### Create a Monitor +```sql +CREATE OR REPLACE QUALITY MONITOR catalog.schema.my_table +OPTIONS ( + OUTPUT_SCHEMA 'catalog.schema' +); +``` -> **Note:** The MCP tool currently only creates **snapshot** monitors. 
For TimeSeries or InferenceLog monitors, use the Python SDK directly (see below). +### Get Monitor Status (SQL) -```python -manage_uc_monitors( - action="create", - table_name="catalog.schema.my_table", - output_schema_name="catalog.schema", -) +```sql +DESCRIBE QUALITY MONITOR catalog.schema.my_table; ``` -### Get Monitor Status +### Trigger a Refresh (SQL) -```python -manage_uc_monitors( - action="get", - table_name="catalog.schema.my_table", -) +```sql +REFRESH QUALITY MONITOR catalog.schema.my_table; ``` -### Trigger a Refresh +### Delete a Monitor (SQL) -```python -manage_uc_monitors( - action="run_refresh", - table_name="catalog.schema.my_table", -) +```sql +DROP QUALITY MONITOR catalog.schema.my_table; ``` -### Delete a Monitor +### Execute via CLI -```python -manage_uc_monitors( - action="delete", - table_name="catalog.schema.my_table", -) +```bash +databricks sql execute --warehouse-id WAREHOUSE_ID --query " +CREATE OR REPLACE QUALITY MONITOR catalog.schema.my_table +OPTIONS (OUTPUT_SCHEMA 'catalog.schema') +" ``` --- @@ -300,7 +287,7 @@ LIMIT 100; --- > **Note:** Data profiling was formerly known as Lakehouse Monitoring. The legacy SDK accessor -> `w.lakehouse_monitors` and the MCP tool `manage_uc_monitors` still use the previous API. +> `w.lakehouse_monitors` still uses the previous API. Use `w.data_quality` for the new API. ## Resources From cff9805419a271e0ce9b1079f87d2938ad2a4adb Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:23:33 +0200 Subject: [PATCH 11/66] refactor(model-serving): replace MCP tools with CLI commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 5-development-testing.md: Update workflow from MCP to CLI - 8-querying-endpoints.md: Replace MCP tools section with CLI commands - SKILL.md: Update reference table descriptions πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../5-development-testing.md | 39 ++++----- .../8-querying-endpoints.md | 82 ++++--------------- .../databricks-model-serving/SKILL.md | 4 +- 3 files changed, 36 insertions(+), 89 deletions(-) diff --git a/databricks-skills/databricks-model-serving/5-development-testing.md b/databricks-skills/databricks-model-serving/5-development-testing.md index 2a3806cf..71970aa9 100644 --- a/databricks-skills/databricks-model-serving/5-development-testing.md +++ b/databricks-skills/databricks-model-serving/5-development-testing.md @@ -1,8 +1,6 @@ # Development & Testing Workflow -MCP-based workflow for developing and testing agents on Databricks. - -> **If MCP tools are not available**, use Databricks CLI or the Python SDK directly. See [Databricks CLI docs](https://docs.databricks.com/dev-tools/cli/) for `databricks workspace import` and `databricks clusters spark-submit` commands. +CLI-based workflow for developing and testing agents on Databricks. ## Overview @@ -13,17 +11,17 @@ MCP-based workflow for developing and testing agents on Databricks. 
β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ Step 2: Upload to workspace β”‚ -β”‚ β†’ manage_workspace_files MCP tool β”‚ +β”‚ β†’ databricks workspace import-dir β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ Step 3: Install packages β”‚ -β”‚ β†’ execute_code MCP tool β”‚ +β”‚ β†’ databricks jobs (serverless with pip requirements) β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β–Ό β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”‚ Step 4: Test agent (iterate) β”‚ -β”‚ β†’ execute_code MCP tool (with file_path) β”‚ +β”‚ β†’ databricks jobs run-now β”‚ β”‚ β†’ If error: fix locally, re-upload, re-run β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` @@ -85,17 +83,13 @@ print("Response:", result.model_dump(exclude_none=True)) ## Step 2: Upload to Workspace -Use the `manage_workspace_files` MCP tool: +Use the Databricks CLI: -``` -manage_workspace_files( - action="upload", - local_path="./my_agent", - workspace_path="/Workspace/Users/you@company.com/my_agent" -) +```bash +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent ``` -This uploads all files in parallel. +This uploads all files recursively. ## Step 3: Install Packages @@ -135,8 +129,8 @@ execute_code( 1. Read the error from the output 2. Fix the local file (`agent.py` or `test_agent.py`) -3. Re-upload: `manage_workspace_files(action="upload", ...)` -4. Re-run: `execute_code(file_path=...)` +3. Re-upload: `databricks workspace import-dir ./my_agent /Workspace/.../my_agent` +4. 
Re-run the job ### Iteration Tips @@ -188,13 +182,12 @@ print(response.content) ## Workflow Summary -| Step | MCP Tool | Purpose | -|------|----------|---------| -| Upload files | `manage_workspace_files` (action="upload") | Sync local files to workspace | -| Install packages | `execute_code` | Set up dependencies | -| Restart Python | `execute_code` | Apply package changes | -| Test agent | `execute_code` (with `file_path`) | Run test script | -| Debug | `execute_code` | Quick checks | +| Step | CLI Command | Purpose | +|------|-------------|---------| +| Upload files | `databricks workspace import-dir` | Sync local files to workspace | +| Install packages | Job with pip requirements | Set up dependencies | +| Test agent | `databricks jobs run-now` | Run test script | +| Debug | Run notebook or script | Quick checks | ## Next Steps diff --git a/databricks-skills/databricks-model-serving/8-querying-endpoints.md b/databricks-skills/databricks-model-serving/8-querying-endpoints.md index 4dfa2f91..2cebb0c1 100644 --- a/databricks-skills/databricks-model-serving/8-querying-endpoints.md +++ b/databricks-skills/databricks-model-serving/8-querying-endpoints.md @@ -2,87 +2,41 @@ Send requests to deployed Model Serving endpoints. -> **If MCP tools are not available**, use the Python SDK or REST API examples below. - -## MCP Tools +## CLI Commands ### Check Endpoint Status Before querying, verify the endpoint is ready: -``` -manage_serving_endpoint(action="get", name="my-agent-endpoint") -``` - -Response: -```json -{ - "name": "my-agent-endpoint", - "state": "READY", - "served_entities": [ - {"name": "my_agent-1", "entity_name": "main.agents.my_agent", "deployment_state": "READY"} - ] -} +```bash +databricks serving-endpoints get my-agent-endpoint ``` ### Query Chat/Agent Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[ - {"role": "user", "content": "What is Databricks?"} - ], - max_tokens=500, - temperature=0.7 -) -``` - -Response: -```json -{ - "choices": [ - { - "message": { - "role": "assistant", - "content": "Databricks is a unified data intelligence platform..." - }, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 10, - "completion_tokens": 150, - "total_tokens": 160 - } -} +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500, + "temperature": 0.7 +}' ``` ### Query ML Model Endpoint -``` -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[ - {"age": 25, "income": 50000, "credit_score": 720}, - {"age": 35, "income": 75000, "credit_score": 680} - ] -) -``` - -Response: -```json -{ - "predictions": [0.85, 0.72] -} +```bash +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [ + {"age": 25, "income": 50000, "credit_score": 720}, + {"age": 35, "income": 75000, "credit_score": 680} + ] +}' ``` ### List All Endpoints -``` -manage_serving_endpoint(action="list", limit=20) +```bash +databricks serving-endpoints list ``` ## Python SDK diff --git a/databricks-skills/databricks-model-serving/SKILL.md b/databricks-skills/databricks-model-serving/SKILL.md index 59f5b330..aed60ee6 100644 --- a/databricks-skills/databricks-model-serving/SKILL.md +++ b/databricks-skills/databricks-model-serving/SKILL.md @@ -82,10 +82,10 @@ ALWAYS use exact endpoint names from this table. NEVER guess or abbreviate. 
| Custom PyFunc | [2-custom-pyfunc.md](2-custom-pyfunc.md) | Custom preprocessing, signatures | | GenAI Agents | [3-genai-agents.md](3-genai-agents.md) | ResponsesAgent, LangGraph | | Tools Integration | [4-tools-integration.md](4-tools-integration.md) | UC Functions, Vector Search | -| Development & Testing | [5-development-testing.md](5-development-testing.md) | MCP workflow, iteration | +| Development & Testing | [5-development-testing.md](5-development-testing.md) | CLI workflow, iteration | | Logging & Registration | [6-logging-registration.md](6-logging-registration.md) | mlflow.pyfunc.log_model | | Deployment | [7-deployment.md](7-deployment.md) | Job-based async deployment | -| Querying Endpoints | [8-querying-endpoints.md](8-querying-endpoints.md) | SDK, REST, MCP tools | +| Querying Endpoints | [8-querying-endpoints.md](8-querying-endpoints.md) | CLI, SDK, REST | | Package Requirements | [9-package-requirements.md](9-package-requirements.md) | DBR versions, pip | --- From 94f318cd3080dac007677a13e93e3fc79ab66605 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:28:46 +0200 Subject: [PATCH 12/66] Replace MCP tool references with CLI/SDK across remaining skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - README.md: Update description and diagram to reference CLI/SDK - install_skills.sh: Update comment describing skills - databricks-app-python: Rename 6-mcp-approach.md to 6-cli-approach.md - databricks-jobs/task-types.md: Remove MCP tool note - databricks-model-serving: Replace MCP tools with CLI commands - 1-classical-ml.md: CLI for querying endpoints - 3-genai-agents.md: CLI for testing and querying - 6-logging-registration.md: CLI for running scripts - 7-deployment.md: CLI for job creation and management - 9-package-requirements.md: Notebook commands instead of MCP - databricks-unstructured-pdf-generation: Python script pattern - databricks-zerobus-ingest: CLI workflow instead of MCP execute_code Note: MCP references in databricks-agent-bricks (External MCP Server feature) and databricks-mlflow-evaluation (MLflow MCP server) are legitimate product features and remain unchanged. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/README.md | 28 +++--- .../databricks-app-python/4-deployment.md | 2 +- .../{6-mcp-approach.md => 6-cli-approach.md} | 0 .../databricks-app-python/SKILL.md | 4 +- .../databricks-jobs/task-types.md | 1 - .../1-classical-ml.md | 18 ++-- .../3-genai-agents.md | 22 ++--- .../6-logging-registration.md | 10 +- .../databricks-model-serving/7-deployment.md | 85 +++++++++-------- .../9-package-requirements.md | 33 +++---- .../SKILL.md | 94 +++++++++---------- .../databricks-zerobus-ingest/SKILL.md | 80 ++++++++-------- databricks-skills/install_skills.sh | 4 +- 13 files changed, 185 insertions(+), 196 deletions(-) rename databricks-skills/databricks-app-python/{6-mcp-approach.md => 6-cli-approach.md} (100%) diff --git a/databricks-skills/README.md b/databricks-skills/README.md index a81730a2..08a77d32 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -1,6 +1,6 @@ # Databricks Skills for Claude Code -Skills that teach Claude Code how to work effectively with Databricks - providing patterns, best practices, and code examples that work with Databricks MCP tools. 
+Skills that teach Claude Code how to work effectively with Databricks - providing patterns, best practices, and code examples using the Databricks CLI, Python SDK, and REST APIs. ## Installation @@ -113,22 +113,21 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ ## How It Works ``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ .claude/skills/ + .claude/mcp.json β”‚ -β”‚ (Knowledge) (Actions) β”‚ -β”‚ β”‚ -β”‚ Skills teach HOW + MCP does it β”‚ -β”‚ ↓ ↓ β”‚ -β”‚ Claude Code learns patterns and executes β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ .claude/skills/ + Databricks CLI/SDK β”‚ +β”‚ (Knowledge) (Actions) β”‚ +β”‚ β”‚ +β”‚ Skills teach HOW + CLI/SDK executes β”‚ +β”‚ ↓ ↓ β”‚ +β”‚ Claude Code learns patterns and executes β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ ``` **Example:** User says "Create a sales dashboard" 1. Claude loads `databricks-aibi-dashboards` skill β†’ learns validation workflow -2. Calls `get_table_stats_and_schema()` β†’ gets schemas -3. Calls `execute_sql()` β†’ tests queries -4. Calls `manage_dashboard(action="create_or_update")` β†’ deploys -5. Returns working dashboard URL +2. Runs `databricks sql execute` β†’ tests queries +3. Uses Python SDK to create dashboard via REST API +4. Returns working dashboard URL ## Custom Skills @@ -158,6 +157,7 @@ description: "What this teaches" ## Related - [databricks-tools-core](../databricks-tools-core/) - Python library -- [databricks-mcp-server](../databricks-mcp-server/) - MCP server +- [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) - Official CLI +- [Databricks SDK](https://docs.databricks.com/en/dev-tools/sdk-python.html) - Python SDK - [Databricks Docs](https://docs.databricks.com/) - Official documentation - [MLflow Skills](https://github.com/mlflow/skills) - Upstream MLflow skills repository diff --git a/databricks-skills/databricks-app-python/4-deployment.md b/databricks-skills/databricks-app-python/4-deployment.md index d7b138e1..384c82ac 100644 --- a/databricks-skills/databricks-app-python/4-deployment.md +++ b/databricks-skills/databricks-app-python/4-deployment.md @@ -109,7 +109,7 @@ For complete DABs guidance, use the **databricks-bundles** skill. ## Option 3: CLI Commands -For CLI-based app lifecycle management, see [6-mcp-approach.md](6-mcp-approach.md). +For CLI-based app lifecycle management, see [6-cli-approach.md](6-cli-approach.md). 
--- diff --git a/databricks-skills/databricks-app-python/6-mcp-approach.md b/databricks-skills/databricks-app-python/6-cli-approach.md similarity index 100% rename from databricks-skills/databricks-app-python/6-mcp-approach.md rename to databricks-skills/databricks-app-python/6-cli-approach.md diff --git a/databricks-skills/databricks-app-python/SKILL.md b/databricks-skills/databricks-app-python/SKILL.md index 2604e8a6..7b34b74b 100644 --- a/databricks-skills/databricks-app-python/SKILL.md +++ b/databricks-skills/databricks-app-python/SKILL.md @@ -72,7 +72,7 @@ Copy this checklist and verify each item: **Lakebase**: Use [5-lakebase.md](5-lakebase.md) when using Lakebase (PostgreSQL) as your app's data layer β€” covers auto-injected env vars, psycopg2/asyncpg patterns, and when to choose Lakebase vs SQL warehouse. (Keywords: Lakebase, PostgreSQL, psycopg2, asyncpg, transactional, PGHOST) -**CLI commands**: Use [6-mcp-approach.md](6-mcp-approach.md) for managing app lifecycle via CLI β€” covers creating, deploying, monitoring, and deleting apps. (Keywords: CLI, create app, deploy app, app logs) +**CLI commands**: Use [6-cli-approach.md](6-cli-approach.md) for managing app lifecycle via CLI β€” covers creating, deploying, monitoring, and deleting apps. (Keywords: CLI, create app, deploy app, app logs) **Foundation Models**: See [examples/llm_config.py](examples/llm_config.py) for calling Databricks foundation model APIs β€” covers OAuth M2M auth, OpenAI-compatible client wiring, and token caching. (Keywords: foundation model, LLM, OpenAI client, chat completions) @@ -87,7 +87,7 @@ Copy this checklist and verify each item: **Connecting to data/resources?** β†’ Read [2-app-resources.md](2-app-resources.md) **Using Lakebase (PostgreSQL)?** β†’ Read [5-lakebase.md](5-lakebase.md) **Deploying to Databricks?** β†’ Read [4-deployment.md](4-deployment.md) - **Using CLI for app lifecycle?** β†’ Read [6-mcp-approach.md](6-mcp-approach.md) + **Using CLI for app lifecycle?** β†’ Read [6-cli-approach.md](6-cli-approach.md) **Calling foundation model/LLM APIs?** β†’ See [examples/llm_config.py](examples/llm_config.py) 2. Follow the instructions in the relevant guide diff --git a/databricks-skills/databricks-jobs/task-types.md b/databricks-skills/databricks-jobs/task-types.md index c5b06fbe..f7c3e043 100644 --- a/databricks-skills/databricks-jobs/task-types.md +++ b/databricks-skills/databricks-jobs/task-types.md @@ -618,7 +618,6 @@ Define reusable Python environments for serverless tasks with custom pip depende > **IMPORTANT:** The `client` field is **required** in the environment `spec`. It specifies the > base serverless environment version. Use `"4"` as the value. Without it, the API returns: > `"Either base environment or version must be provided for environment"`. -> The MCP `manage_jobs` tool (action="create") auto-injects `client: "4"` if omitted, but CLI/SDK calls require it explicitly. 
### DABs YAML diff --git a/databricks-skills/databricks-model-serving/1-classical-ml.md b/databricks-skills/databricks-model-serving/1-classical-ml.md index 4b973e0a..42b6a016 100644 --- a/databricks-skills/databricks-model-serving/1-classical-ml.md +++ b/databricks-skills/databricks-model-serving/1-classical-ml.md @@ -140,16 +140,14 @@ endpoint = w.serving_endpoints.create_and_wait( ## Query the Endpoint -### Via MCP Tool - -``` -manage_serving_endpoint( - action="query", - name="diabetes-predictor", - dataframe_records=[ - {"age": 45, "bmi": 25.3, "bp": 120, "s1": 200} - ] -) +### Via CLI + +```bash +databricks serving-endpoints query diabetes-predictor --json '{ + "dataframe_records": [ + {"age": 45, "bmi": 25.3, "bp": 120, "s1": 200} + ] +}' ``` ### Via Python SDK diff --git a/databricks-skills/databricks-model-serving/3-genai-agents.md b/databricks-skills/databricks-model-serving/3-genai-agents.md index 4061dbab..66647687 100644 --- a/databricks-skills/databricks-model-serving/3-genai-agents.md +++ b/databricks-skills/databricks-model-serving/3-genai-agents.md @@ -221,10 +221,12 @@ for event in AGENT.predict_stream(request): print(event) ``` -Run via MCP: +Run via CLI: -``` -execute_code(file_path="./my_agent/test_agent.py") +```bash +# Upload and run on Databricks +databricks workspace import-dir ./my_agent /Workspace/Users//my_agent +databricks jobs run-now --job-id # Job configured to run test_agent.py ``` ## Logging the Agent @@ -267,18 +269,16 @@ from databricks import agents agents.deploy( "main.agents.my_agent", version="1", - tags={"source": "mcp"} + tags={"source": "cli"} ) # Takes ~15 minutes ``` ## Query Deployed Agent -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "What is Databricks?"}], - max_tokens=500 -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500 +}' ``` diff --git a/databricks-skills/databricks-model-serving/6-logging-registration.md b/databricks-skills/databricks-model-serving/6-logging-registration.md index cd687358..bfa643b9 100644 --- a/databricks-skills/databricks-model-serving/6-logging-registration.md +++ b/databricks-skills/databricks-model-serving/6-logging-registration.md @@ -60,10 +60,12 @@ uc_model_info = mlflow.register_model( print(f"Registered: {uc_model_info.name} version {uc_model_info.version}") ``` -Run via MCP: +Run via CLI: -``` -execute_code(file_path="./my_agent/log_model.py") +```bash +# Upload and run on Databricks +databricks workspace import-dir ./my_agent /Workspace/Users//my_agent +databricks jobs run-now --job-id # Job configured to run log_model.py ``` ## Resources for Auto Authentication @@ -141,7 +143,7 @@ mlflow.models.predict( ) ``` -Run via MCP (in log_model.py or separate file): +Run validation (in log_model.py or separate file): ```python # validate_model.py diff --git a/databricks-skills/databricks-model-serving/7-deployment.md b/databricks-skills/databricks-model-serving/7-deployment.md index 666cb168..2f503112 100644 --- a/databricks-skills/databricks-model-serving/7-deployment.md +++ b/databricks-skills/databricks-model-serving/7-deployment.md @@ -2,7 +2,7 @@ Deploy models to serving endpoints. Uses async job-based approach for agents (deployment takes ~15 min). 
-> **If MCP tools are not available**, use `databricks.agents.deploy()` directly in a notebook, or create jobs via CLI: `databricks jobs create --json @job.json` +> Use `databricks.agents.deploy()` directly in a notebook, or create jobs via CLI: `databricks jobs create --json @job.json` ## Deployment Options @@ -13,7 +13,7 @@ Deploy models to serving endpoints. Uses async job-based approach for agents (de ## GenAI Agent Deployment (Job-Based) -Since agent deployment takes ~15 minutes, use a job to avoid MCP timeouts. +Since agent deployment takes ~15 minutes, use a job for async deployment. ### Step 1: Create Deployment Script @@ -32,7 +32,7 @@ print(f"Deploying {model_name} version {version}...") deployment = agents.deploy( model_name, version, - tags={"source": "mcp", "environment": "dev"} + tags={"source": "cli", "environment": "dev"} ) print(f"Deployment complete!") @@ -41,40 +41,39 @@ print(f"Endpoint: {deployment.endpoint_name}") ### Step 2: Create Deployment Job (One-Time) -Use the `manage_jobs` MCP tool with action="create": +Use the Databricks CLI: -``` -manage_jobs( - action="create", - name="deploy-agent-job", - tasks=[ - { - "task_key": "deploy", - "spark_python_task": { - "python_file": "/Workspace/Users/you@company.com/my_agent/deploy_agent.py", - "parameters": ["{{job.parameters.model_name}}", "{{job.parameters.version}}"] - } - } - ], - parameters=[ - {"name": "model_name", "default": "main.agents.my_agent"}, - {"name": "version", "default": "1"} - ] -) +```bash +databricks jobs create --json '{ + "name": "deploy-agent-job", + "tasks": [{ + "task_key": "deploy", + "spark_python_task": { + "python_file": "/Workspace/Users/you@company.com/my_agent/deploy_agent.py", + "parameters": ["{{job.parameters.model_name}}", "{{job.parameters.version}}"] + }, + "new_cluster": { + "spark_version": "16.1.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0 + } + }], + "parameters": [ + {"name": "model_name", "default": "main.agents.my_agent"}, + {"name": "version", "default": "1"} + ] +}' ``` Save the returned `job_id`. ### Step 3: Run Deployment (Async) -Use `manage_job_runs` with action="run_now" - returns immediately: +Run the job - returns immediately: -``` -manage_job_runs( - action="run_now", - job_id="", - job_parameters={"model_name": "main.agents.my_agent", "version": "1"} -) +```bash +databricks jobs run-now --job-id \ + --params '{"model_name": "main.agents.my_agent", "version": "1"}' ``` Save the returned `run_id`. @@ -83,14 +82,14 @@ Save the returned `run_id`. Check job run status: -``` -manage_job_runs(action="get", run_id="") +```bash +databricks jobs get-run --run-id ``` Or check endpoint directly: -``` -manage_serving_endpoint(action="get", name="") +```bash +databricks serving-endpoints get ``` ## Classical ML Deployment @@ -163,7 +162,7 @@ deployment = agents.deploy( "main.agents.my_agent", "1", endpoint_name="my-agent-endpoint", # Control the name - tags={"source": "mcp", "environment": "dev"} + tags={"source": "cli", "environment": "dev"} ) ``` @@ -172,7 +171,7 @@ deployment = agents.deploy( Endpoints created via `agents.deploy()` appear under **Serving** in the Databricks UI. If you don't see your endpoint: 1. **Check the filter** - The Serving page defaults to "Owned by me". If the deployment ran as a service principal (e.g., via a job), switch to "All" to see it. -2. **Verify via API** - Use `manage_serving_endpoint(action="list")` or `manage_serving_endpoint(action="get", name="...")` to confirm the endpoint exists and check its state. +2. 
**Verify via CLI** - Use `databricks serving-endpoints list` or `databricks serving-endpoints get ` to confirm the endpoint exists and check its state. 3. **Check the name** - The auto-generated name may not be what you expect. Print `deployment.endpoint_name` in the deploy script or check the job run output. ### Deployment Script with Explicit Naming @@ -261,18 +260,18 @@ client.update_endpoint( ## Workflow Summary -| Step | MCP Tool | Waits? | -|------|----------|--------| -| Upload deploy script | `manage_workspace_files` (action="upload") | Yes | -| Create job (one-time) | `manage_jobs` (action="create") | Yes | -| Run deployment | `manage_job_runs` (action="run_now") | **No** - returns immediately | -| Check job status | `manage_job_runs` (action="get") | Yes | -| Check endpoint status | `manage_serving_endpoint` (action="get") | Yes | +| Step | CLI Command | Waits? | +|------|-------------|--------| +| Upload deploy script | `databricks workspace import-dir` | Yes | +| Create job (one-time) | `databricks jobs create` | Yes | +| Run deployment | `databricks jobs run-now` | **No** - returns immediately | +| Check job status | `databricks jobs get-run` | Yes | +| Check endpoint status | `databricks serving-endpoints get` | Yes | ## After Deployment Once endpoint is READY: -1. **Test with MCP**: `manage_serving_endpoint(action="query", name="...", messages=[...])` +1. **Test with CLI**: `databricks serving-endpoints query --json '{"messages": [...]}'` 2. **Share with team**: Endpoint URL in Databricks UI 3. **Integrate in apps**: Use REST API or SDK diff --git a/databricks-skills/databricks-model-serving/9-package-requirements.md b/databricks-skills/databricks-model-serving/9-package-requirements.md index f9ceb7a9..e5508b6a 100644 --- a/databricks-skills/databricks-model-serving/9-package-requirements.md +++ b/databricks-skills/databricks-model-serving/9-package-requirements.md @@ -137,24 +137,23 @@ export DATABRICKS_TOKEN="your-token" export DATABRICKS_CONFIG_PROFILE="your-profile" ``` -## Installing Packages via MCP +## Installing Packages -Use `execute_code`: +In a notebook or Python script on Databricks: -``` -execute_code( - code="%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic" -) +```python +%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic +dbutils.library.restartPython() ``` -Then restart Python: +Or via job libraries configuration: -``` -execute_code( - code="dbutils.library.restartPython()", - cluster_id="", - context_id="" -) +```json +"libraries": [ + {"pypi": {"package": "mlflow==3.6.0"}}, + {"pypi": {"package": "databricks-langchain"}}, + {"pypi": {"package": "langgraph==0.3.4"}} +] ``` ## Checking Installed Versions @@ -171,17 +170,13 @@ for pkg in packages: print(f"{pkg}: NOT INSTALLED") ``` -Via MCP: +In a notebook: -``` -execute_code( - code=""" +```python import pkg_resources for pkg in ['mlflow', 'langchain', 'langgraph', 'pydantic', 'databricks-langchain']: try: print(f"{pkg}: {pkg_resources.get_distribution(pkg).version}") except: print(f"{pkg}: NOT INSTALLED") - """ -) ``` diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 92322fd0..bb3d262d 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -9,19 +9,23 @@ Convert HTML content to PDF documents and upload them to Unity 
Catalog Volumes. ## Overview -The `generate_and_upload_pdf` MCP tool converts HTML to PDF and uploads to a Unity Catalog Volume. You (the LLM) generate the HTML content, and the tool handles conversion and upload. +Generate PDFs from HTML using the `databricks-tools-core` library. You (the LLM) generate the HTML content, and the Python script handles conversion and upload. -## Tool Signature +## Python Script Pattern -``` -generate_and_upload_pdf( - html_content: str, # Complete HTML document - filename: str, # PDF filename (e.g., "report.pdf") - catalog: str, # Unity Catalog name - schema: str, # Schema name - volume: str = "raw_data", # Volume name (default: "raw_data") - folder: str = None, # Optional subfolder +```python +# generate_pdf.py +from databricks_tools_core.pdf_generator import generate_and_upload_pdf + +result = generate_and_upload_pdf( + html_content=html_content, # Complete HTML document + filename="report.pdf", # PDF filename + catalog="my_catalog", # Unity Catalog name + schema="my_schema", # Schema name + volume="raw_data", # Volume name (default: "raw_data") + folder=None, # Optional subfolder ) +print(f"Uploaded to: {result['volume_path']}") ``` **Returns:** @@ -37,7 +41,9 @@ generate_and_upload_pdf( Generate a simple PDF: -``` +```python +from databricks_tools_core.pdf_generator import generate_and_upload_pdf + generate_and_upload_pdf( html_content=''' @@ -64,47 +70,37 @@ generate_and_upload_pdf( ## Performance: Generate Multiple PDFs in Parallel -**IMPORTANT**: PDF generation and upload can take 2-5 seconds per document. When generating multiple PDFs, **call the tool in parallel** to maximize throughput. +**IMPORTANT**: PDF generation and upload can take 2-5 seconds per document. When generating multiple PDFs, use concurrent execution to maximize throughput. 
### Example: Generate 5 PDFs in Parallel -Make 5 simultaneous `generate_and_upload_pdf` calls: - -``` -# Call 1 -generate_and_upload_pdf( - html_content="...Employee Handbook content...", - filename="employee_handbook.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 2 (parallel) -generate_and_upload_pdf( - html_content="...Leave Policy content...", - filename="leave_policy.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 3 (parallel) -generate_and_upload_pdf( - html_content="...Code of Conduct content...", - filename="code_of_conduct.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 4 (parallel) -generate_and_upload_pdf( - html_content="...Benefits Guide content...", - filename="benefits_guide.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 5 (parallel) -generate_and_upload_pdf( - html_content="...Remote Work Policy content...", - filename="remote_work_policy.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) +```python +import concurrent.futures +from databricks_tools_core.pdf_generator import generate_and_upload_pdf + +pdfs_to_generate = [ + {"html_content": "...Employee Handbook content...", "filename": "employee_handbook.pdf"}, + {"html_content": "...Leave Policy content...", "filename": "leave_policy.pdf"}, + {"html_content": "...Code of Conduct content...", "filename": "code_of_conduct.pdf"}, + {"html_content": "...Benefits Guide content...", "filename": "benefits_guide.pdf"}, + {"html_content": "...Remote Work Policy content...", "filename": "remote_work_policy.pdf"}, +] + +def generate_pdf(pdf_config): + return generate_and_upload_pdf( + html_content=pdf_config["html_content"], + filename=pdf_config["filename"], + catalog="hr_catalog", + schema="policies", + folder="2024" + ) + +# Generate in parallel +with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + results = list(executor.map(generate_pdf, pdfs_to_generate)) + +for result in results: + print(f"Uploaded: {result['volume_path']}") ``` By calling these in parallel (not sequentially), 5 PDFs that would take 15-25 seconds sequentially complete in 3-5 seconds total. diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index 22f90c55..668a4be2 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -120,54 +120,54 @@ You must always follow all the steps in the Workflow ## Workflow 0. **Display the plan of your execution** -1. **Determinate the type of client** -2. **Get schema** Always use 4-protobuf-schema.md. Execute using the `execute_code` MCP tool -3. **Write Python code to a local file follow the instructions in the relevant guide to ingest with zerobus** in the project (e.g., `scripts/zerobus_ingest.py`). -4. **Execute on Databricks** using the `execute_code` MCP tool (with `file_path` parameter) -5. **If execution fails**: Edit the local file to fix the error, then re-execute -6. **Reuse the context** for follow-up executions by passing the returned `cluster_id` and `context_id` +1. **Determine the type of client** +2. **Get schema** Always use 4-protobuf-schema.md +3. **Write Python code to a local file** following the instructions in the relevant guide (e.g., `scripts/zerobus_ingest.py`) +4. **Upload to workspace**: `databricks workspace import-dir ./scripts /Workspace/Users//scripts` +5. **Execute on Databricks** using a job or notebook +6. 
**If execution fails**: Edit the local file, re-upload, and re-execute --- ## Important - Never install local packages -- Always validate MCP server requirement before execution - **Serverless limitation**: The Zerobus SDK cannot pip-install on serverless compute. Use classic compute clusters, or use the [Zerobus REST API](https://docs.databricks.com/aws/en/ingestion/zerobus-rest-api) (Beta) for notebook-based ingestion without the SDK. - **Explicit table grants**: Service principals need explicit `MODIFY` and `SELECT` grants on the target table. Schema-level inherited permissions may not be sufficient for the `authorization_details` OAuth flow. --- -### Context Reuse Pattern +### Execution Workflow -The first execution auto-selects a running cluster and creates an execution context. **Reuse this context for follow-up calls** - it's much faster (~1s vs ~15s) and shares variables/imports: - -**First execution** - use `execute_code` tool: -- `file_path`: "scripts/zerobus_ingest.py" - -Returns: `{ success, output, error, cluster_id, context_id, ... }` +**Step 1: Upload code to workspace** +```bash +databricks workspace import-dir ./scripts /Workspace/Users//scripts +``` -Save `cluster_id` and `context_id` for follow-up calls. +**Step 2: Create and run a job** +```bash +databricks jobs create --json '{ + "name": "zerobus-ingest", + "tasks": [{ + "task_key": "ingest", + "spark_python_task": { + "python_file": "/Workspace/Users//scripts/zerobus_ingest.py" + }, + "new_cluster": { + "spark_version": "16.1.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0 + } + }] +}' + +databricks jobs run-now --job-id +``` **If execution fails:** -1. Read the error from the result +1. Read the error from the job run output 2. Edit the local Python file to fix the issue -3. Re-execute with same context using `execute_code` tool: - - `file_path`: "scripts/zerobus_ingest.py" - - `cluster_id`: "" - - `context_id`: "" - -**Follow-up executions** reuse the context (faster, shares state): -- `file_path`: "scripts/validate_ingestion.py" -- `cluster_id`: "" -- `context_id`: "" - -### Handling Failures - -When execution fails: -1. Read the error from the result -2. **Edit the local Python file** to fix the issue -3. Re-execute using the same `cluster_id` and `context_id` (faster, keeps installed libraries) -4. If the context is corrupted, omit `context_id` to create a fresh one +3. Re-upload: `databricks workspace import-dir ./scripts /Workspace/Users//scripts` +4. Re-run: `databricks jobs run-now --job-id ` --- @@ -175,14 +175,14 @@ When execution fails: Databricks provides Spark, pandas, numpy, and common data libraries by default. **Only install a library if you get an import error.** -Use `execute_code` tool: -- `code`: "%pip install databricks-zerobus-ingest-sdk>=1.0.0" -- `cluster_id`: "" -- `context_id`: "" - -The library is immediately available in the same context. +Add to the job configuration: +```json +"libraries": [ + {"pypi": {"package": "databricks-zerobus-ingest-sdk>=1.0.0"}} +] +``` -**Note:** Keeping the same `context_id` means installed libraries persist across calls. +Or use init scripts in the cluster configuration. ## 🚨 Critical Learning: Timestamp Format Fix diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 7630615c..3613b00c 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -3,7 +3,7 @@ # Databricks Skills Installer # # Installs Databricks skills for Claude Code into your project. 
-# These skills teach Claude how to work with Databricks using MCP tools. +# These skills teach Claude how to work with Databricks using CLI, SDK, and REST APIs. # # Usage: # # Install all skills (Databricks + MLflow + APX) @@ -119,7 +119,7 @@ get_skill_extra_files() { "databricks-bundles") echo "alerts_guidance.md SDP_guidance.md" ;; "databricks-iceberg") echo "1-managed-iceberg-tables.md 2-uniform-and-compatibility.md 3-iceberg-rest-catalog.md 4-snowflake-interop.md 5-external-engine-interop.md" ;; "databricks-app-apx") echo "backend-patterns.md best-practices.md frontend-patterns.md" ;; - "databricks-app-python") echo "1-authorization.md 2-app-resources.md 3-frameworks.md 4-deployment.md 5-lakebase.md 6-mcp-approach.md examples/llm_config.py examples/fm-minimal-chat.py examples/fm-parallel-calls.py examples/fm-structured-outputs.py" ;; + "databricks-app-python") echo "1-authorization.md 2-app-resources.md 3-frameworks.md 4-deployment.md 5-lakebase.md 6-cli-approach.md examples/llm_config.py examples/fm-minimal-chat.py examples/fm-parallel-calls.py examples/fm-structured-outputs.py" ;; "databricks-jobs") echo "task-types.md triggers-schedules.md notifications-monitoring.md examples.md" ;; "databricks-python-sdk") echo "doc-index.md examples/1-authentication.py examples/2-clusters-and-jobs.py examples/3-sql-and-warehouses.py examples/4-unity-catalog.py examples/5-serving-and-vector-search.py" ;; "databricks-unity-catalog") echo "5-system-tables.md" ;; From 8bd86b3e91d7eef5ca03a1ea5506b4ed0b8326bc Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 10:50:59 +0200 Subject: [PATCH 13/66] test(skills): add integration test infrastructure for databricks-skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Added test infrastructure for Python scripts in databricks-skills: - .tests/conftest.py: Pytest fixtures for Databricks connection - workspace_client: Session-scoped WorkspaceClient - warehouse_id: Finds running SQL warehouse - Custom markers for integration tests - .tests/test_agent_bricks_manager.py: Tests for supervisor agent CLI - Unit tests for _build_agent_list helper (all agent types) - Integration tests for MAS lifecycle (list, find, get) - .tests/test_genie_conversation.py: Tests for Genie conversation CLI - Unit tests with mocks for ask_genie function - Tests for timeout, failure handling, conversation tracking - Integration tests for live Genie Space queries - .tests/run_tests.py: Test runner script - Supports --unit and --integration flags - HTML and JUnit XML report generation - Colored terminal output with summary Tests cover the remaining Python scripts in skills: - databricks-agent-bricks/manager.py - databricks-genie/conversation.py All 11 unit tests pass. Integration tests require Databricks connection. 
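To reproduce locally (usage taken from the run_tests.py docstring and argparse
flags added in this patch; run from databricks-skills/.tests):

    # unit tests only (mocked, no Databricks workspace needed)
    python run_tests.py --unit -v

    # integration tests against a live workspace, with an HTML report
    python run_tests.py --integration --html
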
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .gitignore | 1 + databricks-skills/.tests/__init__.py | 1 + databricks-skills/.tests/conftest.py | 78 +++++++ databricks-skills/.tests/run_tests.py | 156 ++++++++++++++ .../.tests/test_agent_bricks_manager.py | 158 ++++++++++++++ .../.tests/test_genie_conversation.py | 204 ++++++++++++++++++ 6 files changed, 598 insertions(+) create mode 100644 databricks-skills/.tests/__init__.py create mode 100644 databricks-skills/.tests/conftest.py create mode 100755 databricks-skills/.tests/run_tests.py create mode 100644 databricks-skills/.tests/test_agent_bricks_manager.py create mode 100644 databricks-skills/.tests/test_genie_conversation.py diff --git a/.gitignore b/.gitignore index d7ae2881..f33db3f3 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,7 @@ claude_agent_settings.json .coverage htmlcov/ .pytest_cache/ +.test-results/ # Skill test run results (detailed per-task logs with full responses) .test/skills/*/runs/ diff --git a/databricks-skills/.tests/__init__.py b/databricks-skills/.tests/__init__.py new file mode 100644 index 00000000..22366876 --- /dev/null +++ b/databricks-skills/.tests/__init__.py @@ -0,0 +1 @@ +"""databricks-skills integration tests.""" diff --git a/databricks-skills/.tests/conftest.py b/databricks-skills/.tests/conftest.py new file mode 100644 index 00000000..f5612394 --- /dev/null +++ b/databricks-skills/.tests/conftest.py @@ -0,0 +1,78 @@ +""" +Pytest fixtures for databricks-skills integration tests. + +These fixtures set up test resources in Databricks for testing the Python scripts +in databricks-skills that use databricks-tools-core functionality. + +Requires a valid Databricks connection (via env vars or ~/.databrickscfg). +""" + +import logging +import os +from pathlib import Path + +import pytest +from databricks.sdk import WorkspaceClient + +# Load .env.test file if it exists +_env_file = Path(__file__).parent.parent.parent / "databricks-tools-core" / ".env.test" +if _env_file.exists(): + from dotenv import load_dotenv + + load_dotenv(_env_file) + logging.getLogger(__name__).info(f"Loaded environment from {_env_file}") + +logger = logging.getLogger(__name__) + + +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line( + "markers", "integration: mark test as integration test requiring Databricks" + ) + + +@pytest.fixture(scope="session") +def workspace_client() -> WorkspaceClient: + """ + Create a WorkspaceClient for the test session. + + Uses standard Databricks authentication: + 1. DATABRICKS_HOST + DATABRICKS_TOKEN env vars + 2. ~/.databrickscfg profile + """ + try: + client = WorkspaceClient() + # Verify connection works + client.current_user.me() + logger.info(f"Connected to Databricks: {client.config.host}") + return client + except Exception as e: + pytest.skip(f"Could not connect to Databricks: {e}") + + +@pytest.fixture(scope="session") +def warehouse_id(workspace_client: WorkspaceClient) -> str: + """ + Get a running SQL warehouse for tests. + + Prefers shared endpoints, falls back to any running warehouse. 
+ """ + from databricks.sdk.service.sql import State + + warehouses = list(workspace_client.warehouses.list()) + + # Priority: running shared endpoint + for w in warehouses: + if w.state == State.RUNNING and "shared" in (w.name or "").lower(): + logger.info(f"Using warehouse: {w.name} ({w.id})") + return w.id + + # Fallback: any running warehouse + for w in warehouses: + if w.state == State.RUNNING: + logger.info(f"Using warehouse: {w.name} ({w.id})") + return w.id + + # No running warehouse found + pytest.skip("No running SQL warehouse available for tests") diff --git a/databricks-skills/.tests/run_tests.py b/databricks-skills/.tests/run_tests.py new file mode 100755 index 00000000..cae0da56 --- /dev/null +++ b/databricks-skills/.tests/run_tests.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Test runner for databricks-skills. + +Runs unit tests (mocked, no Databricks connection required) and integration tests +(require Databricks connection). Generates HTML and terminal reports. + +Usage: + python run_tests.py # Run all tests + python run_tests.py --unit # Run only unit tests + python run_tests.py --integration # Run only integration tests + python run_tests.py -v # Verbose output + python run_tests.py --html # Generate HTML report +""" + +import argparse +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser( + description="Run databricks-skills tests with reports" + ) + parser.add_argument( + "--unit", + action="store_true", + help="Run only unit tests (mocked, no Databricks connection)", + ) + parser.add_argument( + "--integration", + action="store_true", + help="Run only integration tests (requires Databricks connection)", + ) + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Verbose output", + ) + parser.add_argument( + "--html", + action="store_true", + help="Generate HTML report", + ) + parser.add_argument( + "--xml", + action="store_true", + help="Generate JUnit XML report for CI", + ) + parser.add_argument( + "-k", + metavar="EXPRESSION", + help="Only run tests matching the given expression", + ) + + args = parser.parse_args() + + # Determine test directory + tests_dir = Path(__file__).parent + skills_dir = tests_dir.parent + repo_root = skills_dir.parent + + # Results directory for reports + results_dir = tests_dir / ".test-results" + results_dir.mkdir(exist_ok=True) + + # Build pytest command + pytest_args = [ + sys.executable, + "-m", "pytest", + str(tests_dir), + ] + + # Filter by test type + if args.unit and not args.integration: + # Unit tests: exclude integration marker + pytest_args.extend(["-m", "not integration"]) + elif args.integration and not args.unit: + # Integration tests only + pytest_args.extend(["-m", "integration"]) + # If both or neither specified, run all tests + + # Add verbosity + if args.verbose: + pytest_args.append("-v") + else: + pytest_args.append("-q") + + # Add expression filter + if args.k: + pytest_args.extend(["-k", args.k]) + + # Add HTML report + if args.html: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + html_path = results_dir / f"report_{timestamp}.html" + pytest_args.extend(["--html", str(html_path), "--self-contained-html"]) + print(f"HTML report will be saved to: {html_path}") + + # Add XML report + if args.xml: + xml_path = results_dir / "junit.xml" + pytest_args.extend(["--junitxml", str(xml_path)]) + print(f"JUnit XML report will be saved to: {xml_path}") + + # Add color output + 
pytest_args.append("--color=yes") + + # Show captured output on failure + pytest_args.append("-rA") + + # Set PYTHONPATH to include skills directory + env = os.environ.copy() + pythonpath = env.get("PYTHONPATH", "") + env["PYTHONPATH"] = f"{skills_dir}:{repo_root / 'databricks-tools-core'}:{pythonpath}" + + # Print test configuration + print("=" * 60) + print("databricks-skills Test Runner") + print("=" * 60) + print(f"Tests directory: {tests_dir}") + print(f"Results directory: {results_dir}") + test_type = "all" + if args.unit and not args.integration: + test_type = "unit only" + elif args.integration and not args.unit: + test_type = "integration only" + print(f"Test type: {test_type}") + print("=" * 60) + print() + + # Run pytest + result = subprocess.run(pytest_args, env=env) + + # Print summary + print() + print("=" * 60) + if result.returncode == 0: + print("All tests PASSED") + else: + print(f"Tests FAILED (exit code: {result.returncode})") + + if args.html: + print(f"HTML report: {html_path}") + if args.xml: + print(f"JUnit XML: {xml_path}") + print("=" * 60) + + return result.returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/databricks-skills/.tests/test_agent_bricks_manager.py b/databricks-skills/.tests/test_agent_bricks_manager.py new file mode 100644 index 00000000..ad8941ed --- /dev/null +++ b/databricks-skills/.tests/test_agent_bricks_manager.py @@ -0,0 +1,158 @@ +""" +Integration tests for databricks-agent-bricks/manager.py + +Tests the Supervisor Agent (MAS) CLI interface functions. +Requires databricks-tools-core.agent_bricks module. +""" + +import json +import sys +from pathlib import Path + +import pytest + +# Add the skills directory to the path +SKILLS_DIR = Path(__file__).parent.parent +sys.path.insert(0, str(SKILLS_DIR / "databricks-agent-bricks")) + +from manager import ( + create_mas, + get_mas, + find_mas, + update_mas, + delete_mas, + list_mas, + _build_agent_list, +) + + +@pytest.fixture +def sample_agent_config(): + """Sample agent configuration for testing.""" + return { + "name": "Test Agent", + "description": "A test agent for unit testing", + "endpoint_name": "test-endpoint", + } + + +@pytest.fixture +def sample_genie_agent(): + """Sample Genie agent configuration.""" + return { + "name": "Genie Agent", + "description": "A Genie-based agent", + "genie_space_id": "test-space-123", + } + + +@pytest.fixture +def sample_uc_function_agent(): + """Sample UC Function agent configuration.""" + return { + "name": "UC Function Agent", + "description": "A UC function agent", + "uc_function_name": "catalog.schema.function_name", + } + + +class TestBuildAgentList: + """Tests for _build_agent_list helper function.""" + + def test_build_serving_endpoint_agent(self, sample_agent_config): + """Should build serving endpoint agent config.""" + result = _build_agent_list([sample_agent_config]) + + assert len(result) == 1 + agent = result[0] + assert agent["name"] == "Test Agent" + assert agent["description"] == "A test agent for unit testing" + assert agent["agent_type"] == "serving_endpoint" + assert agent["serving_endpoint"]["name"] == "test-endpoint" + + def test_build_genie_agent(self, sample_genie_agent): + """Should build Genie agent config.""" + result = _build_agent_list([sample_genie_agent]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "genie" + assert agent["genie_space"]["id"] == "test-space-123" + + def test_build_uc_function_agent(self, sample_uc_function_agent): + """Should build UC function agent 
config.""" + result = _build_agent_list([sample_uc_function_agent]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "unity_catalog_function" + assert agent["unity_catalog_function"]["uc_path"]["catalog"] == "catalog" + assert agent["unity_catalog_function"]["uc_path"]["schema"] == "schema" + assert agent["unity_catalog_function"]["uc_path"]["name"] == "function_name" + + def test_build_mcp_connection_agent(self): + """Should build external MCP server agent config.""" + agent_config = { + "name": "MCP Agent", + "description": "External MCP server", + "connection_name": "my-mcp-connection", + } + result = _build_agent_list([agent_config]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "external_mcp_server" + assert agent["external_mcp_server"]["connection_name"] == "my-mcp-connection" + + def test_build_multiple_agents(self, sample_agent_config, sample_genie_agent): + """Should build multiple agent configs.""" + result = _build_agent_list([sample_agent_config, sample_genie_agent]) + + assert len(result) == 2 + assert result[0]["agent_type"] == "serving_endpoint" + assert result[1]["agent_type"] == "genie" + + +@pytest.mark.integration +class TestMASLifecycle: + """Integration tests for MAS CRUD operations. + + Note: These tests require a Databricks workspace with Agent Bricks enabled. + They are marked as integration tests and may be skipped if connection fails. + """ + + @pytest.fixture + def test_mas_name(self): + """Unique name for test MAS.""" + import uuid + return f"test-mas-{uuid.uuid4().hex[:8]}" + + def test_list_mas(self, workspace_client): + """Should list existing MAS tiles.""" + try: + result = list_mas() + assert isinstance(result, list) + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise + + def test_find_mas_not_found(self, workspace_client): + """Should return not found for non-existent MAS.""" + try: + result = find_mas("nonexistent-mas-name-xyz-123") + assert result["found"] is False + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise + + def test_get_mas_not_found(self, workspace_client): + """Should return error for non-existent tile ID.""" + try: + result = get_mas("00000000-0000-0000-0000-000000000000") + assert "error" in result or result.get("tile_id") == "" + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise diff --git a/databricks-skills/.tests/test_genie_conversation.py b/databricks-skills/.tests/test_genie_conversation.py new file mode 100644 index 00000000..2ec4f281 --- /dev/null +++ b/databricks-skills/.tests/test_genie_conversation.py @@ -0,0 +1,204 @@ +""" +Integration tests for databricks-genie/conversation.py + +Tests the Genie Conversation API CLI interface. +Requires databricks.sdk for Genie Space operations. 
+""" + +import json +import os +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# Add the skills directory to the path +SKILLS_DIR = Path(__file__).parent.parent +sys.path.insert(0, str(SKILLS_DIR / "databricks-genie")) + +from conversation import ask_genie, _print_json + + +class TestAskGenieFunction: + """Tests for the ask_genie function structure and error handling.""" + + def test_ask_genie_returns_dict(self): + """Should return a dictionary result.""" + # Test with a mock to verify return structure + with patch("conversation.WorkspaceClient") as mock_client: + # Setup mock + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "COMPLETED" + mock_message.attachments = [] + mock_message.query_result = None + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=5, + ) + + assert isinstance(result, dict) + assert "question" in result + assert "conversation_id" in result + assert "message_id" in result + assert "status" in result + + def test_ask_genie_with_conversation_id(self): + """Should pass conversation_id for follow-up questions.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "COMPLETED" + mock_message.attachments = [] + mock_message.query_result = None + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Follow-up question", + conversation_id="existing-conv-id", + timeout_seconds=5, + ) + + # Verify the conversation_id was passed + call_args = mock_instance.genie.start_conversation_and_wait.call_args + assert call_args.kwargs.get("conversation_id") == "existing-conv-id" + + def test_ask_genie_handles_timeout(self): + """Should return timeout status when query exceeds timeout.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "EXECUTING_QUERY" # Never completes + mock_message.attachments = [] + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + # Very short timeout to trigger timeout path + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=0.1, # Will timeout immediately + ) + + assert result["status"] == "TIMEOUT" + assert "error" in result + + def test_ask_genie_handles_failure(self): + """Should return failure status when query fails.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + 
mock_message.status = MagicMock() + mock_message.status.value = "FAILED" + mock_message.attachments = [] + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=5, + ) + + assert result["status"] == "FAILED" + + +class TestPrintJson: + """Tests for the _print_json helper function.""" + + def test_print_json_dict(self, capsys): + """Should print dict as formatted JSON.""" + _print_json({"key": "value", "number": 42}) + captured = capsys.readouterr() + assert '"key": "value"' in captured.out + assert '"number": 42' in captured.out + + def test_print_json_list(self, capsys): + """Should print list as formatted JSON.""" + _print_json([1, 2, 3]) + captured = capsys.readouterr() + assert "1" in captured.out + assert "2" in captured.out + assert "3" in captured.out + + +@pytest.mark.integration +class TestGenieConversationIntegration: + """Integration tests for Genie Conversation API. + + Note: These tests require a Databricks workspace with Genie enabled + and a valid Genie Space ID configured via environment variable. + """ + + @pytest.fixture + def genie_space_id(self): + """Get Genie Space ID from environment.""" + space_id = os.environ.get("TEST_GENIE_SPACE_ID") + if not space_id: + pytest.skip("TEST_GENIE_SPACE_ID not set - skipping Genie integration tests") + return space_id + + def test_ask_genie_simple_question(self, workspace_client, genie_space_id): + """Should be able to ask a simple question to Genie.""" + result = ask_genie( + space_id=genie_space_id, + question="How many rows are in the table?", + timeout_seconds=120, + ) + + # Should return a valid result + assert result["conversation_id"] is not None + assert result["status"] in ["COMPLETED", "FAILED", "TIMEOUT"] + + def test_ask_genie_follow_up(self, workspace_client, genie_space_id): + """Should be able to ask follow-up questions.""" + # First question + result1 = ask_genie( + space_id=genie_space_id, + question="Show me the first 5 rows", + timeout_seconds=120, + ) + + if result1["status"] != "COMPLETED": + pytest.skip("First query did not complete - skipping follow-up test") + + # Follow-up question + result2 = ask_genie( + space_id=genie_space_id, + question="Now show me the count", + conversation_id=result1["conversation_id"], + timeout_seconds=120, + ) + + # Should use same conversation + assert result2["conversation_id"] == result1["conversation_id"] From a02f1ce2192191af60ab6183e4d65e4990a81765 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 11:11:59 +0200 Subject: [PATCH 14/66] refactor(agent-bricks): rename manager.py to mas_manager.py and add example queue support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Changes: - Renamed manager.py β†’ mas_manager.py for clearer naming - Added example question management functions: - add_examples(): Add examples to ONLINE MAS - add_examples_queued(): Queue examples for when MAS becomes ONLINE - list_examples(): List all examples for a MAS - Integrated with TileExampleQueue from databricks-tools-core - Updated all documentation references to use mas_manager.py - Updated test imports to use mas_manager module This allows users to add example questions immediately after creating a MAS, even before it finishes provisioning. Examples are automatically added when the endpoint becomes ONLINE. 
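For illustration, a minimal invocation of the queued variant (the tile ID and the example content below are placeholders):

```bash
python mas_manager.py add_examples_queued TILE_ID '[
  {"question": "How many vacation days do I have?", "guideline": "Should be routed to hr_agent"}
]'
```

Unlike add_examples, which requires the endpoint to already be ONLINE, the queued variant is safe to call right after create_mas.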
πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../.tests/test_agent_bricks_manager.py | 7 +- .../1-knowledge-assistants.md | 4 +- .../2-supervisor-agents.md | 54 +++--- .../databricks-agent-bricks/SKILL.md | 27 ++- .../{manager.py => mas_manager.py} | 159 ++++++++++++++++-- 5 files changed, 200 insertions(+), 51 deletions(-) rename databricks-skills/databricks-agent-bricks/{manager.py => mas_manager.py} (66%) diff --git a/databricks-skills/.tests/test_agent_bricks_manager.py b/databricks-skills/.tests/test_agent_bricks_manager.py index ad8941ed..a7b24aaa 100644 --- a/databricks-skills/.tests/test_agent_bricks_manager.py +++ b/databricks-skills/.tests/test_agent_bricks_manager.py @@ -1,5 +1,5 @@ """ -Integration tests for databricks-agent-bricks/manager.py +Integration tests for databricks-agent-bricks/mas_manager.py Tests the Supervisor Agent (MAS) CLI interface functions. Requires databricks-tools-core.agent_bricks module. @@ -15,13 +15,16 @@ SKILLS_DIR = Path(__file__).parent.parent sys.path.insert(0, str(SKILLS_DIR / "databricks-agent-bricks")) -from manager import ( +from mas_manager import ( create_mas, get_mas, find_mas, update_mas, delete_mas, list_mas, + add_examples, + add_examples_queued, + list_examples, _build_agent_list, ) diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index f4872332..fa28eb9f 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -167,8 +167,8 @@ The endpoint name follows this pattern: `ka-{tile_id}-endpoint` # First, list KAs to find the tile_id databricks knowledge-assistants list-knowledge-assistants -# Then use the tile_id to create a Supervisor Agent with manager.py -python manager.py create_mas "Support_MAS" '{ +# Then use the tile_id to create a Supervisor Agent with mas_manager.py +python mas_manager.py create_mas "Support_MAS" '{ "agents": [ { "name": "hr_agent", diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index eb1f4ddc..f2c6cf74 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -135,10 +135,10 @@ Reference the UC Connection using the `connection_name` field: ### Complete Example: Multi-System Supervisor -Example showing integration of Genie, KA, and external MCP using `manager.py`: +Example showing integration of Genie, KA, and external MCP using `mas_manager.py`: ```bash -python manager.py create_mas "AP_Invoice_Supervisor" '{ +python mas_manager.py create_mas "AP_Invoice_Supervisor" '{ "description": "AP automation assistant with analytics, policy guidance, and operational actions", "instructions": "Route queries as follows:\n- Data questions (invoice counts, spend analysis, vendor metrics) β†’ billing_analyst\n- Policy questions (thresholds, SLAs, compliance rules) β†’ policy_expert\n- Actions (approve, reject, flag, search, workflows) β†’ ap_operations\n\nWhen a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations.", "agents": [ @@ -181,10 +181,10 @@ SELECT http_request( ## Creating a Supervisor Agent -**NO CLI AVAILABLE** - Use the `manager.py` script in this skill folder: +**NO CLI AVAILABLE** - Use the `mas_manager.py` script in this skill folder: ```bash -python 
manager.py create_mas "Customer Support MAS" '{ +python mas_manager.py create_mas "Customer Support MAS" '{ "description": "Routes customer queries to specialized support agents", "instructions": "Analyze the user'\''s question and route to the most appropriate agent. If unclear, ask for clarification.", "agents": [ @@ -252,32 +252,32 @@ After creation, the Supervisor Agent endpoint needs to provision: | `ONLINE` | Ready to route queries | - | | `OFFLINE` | Not currently running | - | -Use `python manager.py get_mas TILE_ID` to check the status. +Use `python mas_manager.py get_mas TILE_ID` to check the status. ## Adding Example Questions -Example questions help with evaluation and can guide routing optimization: +Example questions help with evaluation and can guide routing optimization. -```json -{ - "examples": [ - { - "question": "I haven't received my invoice for this month", - "guideline": "Should be routed to billing_agent" - }, - { - "question": "The API is returning a 500 error", - "guideline": "Should be routed to technical_agent" - }, - { - "question": "How many vacation days do I have?", - "guideline": "Should be routed to hr_agent" - } - ] -} +### Using the CLI + +```bash +# Add examples immediately (MAS must be ONLINE) +python mas_manager.py add_examples TILE_ID '[ + {"question": "I haven'\''t received my invoice for this month", "guideline": "Should be routed to billing_agent"}, + {"question": "The API is returning a 500 error", "guideline": "Should be routed to technical_agent"}, + {"question": "How many vacation days do I have?", "guideline": "Should be routed to hr_agent"} +]' + +# Add examples with queuing (waits for ONLINE if provisioning) +python mas_manager.py add_examples_queued TILE_ID '[ + {"question": "I haven'\''t received my invoice for this month", "guideline": "Should be routed to billing_agent"} +]' + +# List existing examples +python mas_manager.py list_examples TILE_ID ``` -If the Supervisor Agent is not yet `ONLINE`, examples are queued and added automatically when ready. +If the Supervisor Agent is not yet `ONLINE`, use `add_examples_queued` - examples are queued and added automatically when the endpoint becomes ready. ## Best Practices @@ -332,14 +332,14 @@ Consider adding a general-purpose agent for queries that don't fit elsewhere: ## Updating a Supervisor Agent -To update an existing Supervisor Agent, use `manager.py`: +To update an existing Supervisor Agent, use `mas_manager.py`: ```bash # Get current state -python manager.py get_mas TILE_ID +python mas_manager.py get_mas TILE_ID # Update with new configuration -python manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' +python mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' ``` 1. 
**Add/remove agents**: Include updated `agents` list diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index dc4a6e87..21e30a78 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -105,26 +105,35 @@ See `databricks-genie` skill for: ### Supervisor Agent (MAS) -**NO CLI AVAILABLE** - Supervisor Agents are managed via the `manager.py` script in this skill folder: +**NO CLI AVAILABLE** - Supervisor Agents are managed via the `mas_manager.py` script in this skill folder: ```bash # List all Supervisor Agents -python manager.py list_mas +python mas_manager.py list_mas # Create a Supervisor Agent -python manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' +python mas_manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' # Get a Supervisor Agent by tile ID -python manager.py get_mas TILE_ID +python mas_manager.py get_mas TILE_ID # Find a Supervisor Agent by name -python manager.py find_mas "My Supervisor" +python mas_manager.py find_mas "My Supervisor" # Update a Supervisor Agent -python manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' +python mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' # Delete a Supervisor Agent -python manager.py delete_mas TILE_ID +python mas_manager.py delete_mas TILE_ID + +# Add examples (must be ONLINE) +python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' + +# Add examples (queued - waits for ONLINE) +python mas_manager.py add_examples_queued TILE_ID '[{"question": "...", "guideline": "..."}]' + +# List examples +python mas_manager.py list_examples TILE_ID ``` Agent configuration options (provide exactly one per agent): @@ -177,10 +186,10 @@ For KA, if `add_examples_from_volume=true`, examples are automatically extracted ## Example: Multi-Modal Supervisor Agent -Use `manager.py` to create a Supervisor Agent: +Use `mas_manager.py` to create a Supervisor Agent: ```bash -python manager.py create_mas "Enterprise Support Supervisor" '{ +python mas_manager.py create_mas "Enterprise Support Supervisor" '{ "description": "Comprehensive enterprise support agent", "instructions": "Route queries as follows:\n1. Policy/procedure questions β†’ knowledge_base\n2. Data analysis requests β†’ analytics_engine\n3. Ticket classification β†’ ml_classifier", "agents": [ diff --git a/databricks-skills/databricks-agent-bricks/manager.py b/databricks-skills/databricks-agent-bricks/mas_manager.py similarity index 66% rename from databricks-skills/databricks-agent-bricks/manager.py rename to databricks-skills/databricks-agent-bricks/mas_manager.py index 719e57bc..e9317321 100644 --- a/databricks-skills/databricks-agent-bricks/manager.py +++ b/databricks-skills/databricks-agent-bricks/mas_manager.py @@ -3,11 +3,15 @@ Supervisor Agent (MAS) Manager - CLI interface for MAS operations. 
Usage: - python manager.py create_mas "Name" '{"agents": [...], "description": "...", "instructions": "..."}' - python manager.py get_mas TILE_ID - python manager.py find_mas "Name" - python manager.py delete_mas TILE_ID - python manager.py list_mas + python mas_manager.py create_mas "Name" '{"agents": [...], "description": "...", "instructions": "..."}' + python mas_manager.py get_mas TILE_ID + python mas_manager.py find_mas "Name" + python mas_manager.py update_mas TILE_ID '{"name": ..., "agents": [...], ...}' + python mas_manager.py delete_mas TILE_ID + python mas_manager.py list_mas + python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' + python mas_manager.py add_examples_queued TILE_ID '[{"question": "...", "guideline": "..."}]' + python mas_manager.py list_examples TILE_ID Requires: databricks-tools-core package """ @@ -16,7 +20,11 @@ import sys from typing import Any, Dict, List, Optional -from databricks_tools_core.agent_bricks import AgentBricksManager, EndpointStatus +from databricks_tools_core.agent_bricks import ( + AgentBricksManager, + EndpointStatus, + get_tile_example_queue, +) def _get_manager() -> AgentBricksManager: @@ -274,6 +282,110 @@ def list_mas() -> List[Dict[str, Any]]: return results +def add_examples( + tile_id: str, + examples: List[Dict[str, Any]], +) -> Dict[str, Any]: + """Add example questions to a Supervisor Agent. + + Args: + tile_id: The MAS tile ID + examples: List of example dicts with: + - question: The example question + - guideline: Optional routing guideline (e.g., "Should route to billing_agent") + + Returns: + Dict with added_count and any errors + """ + manager = _get_manager() + + # Check if MAS is online + status = get_mas(tile_id) + if "error" in status: + return status + + if status.get("endpoint_status") != "ONLINE": + return { + "error": f"MAS is not ONLINE (status: {status.get('endpoint_status')}). " + "Use add_examples_queued to queue examples for when it's ready.", + "tile_id": tile_id, + } + + # Add examples directly + created = manager.mas_add_examples_batch(tile_id, examples) + return { + "tile_id": tile_id, + "added_count": len(created), + "total_requested": len(examples), + } + + +def add_examples_queued( + tile_id: str, + examples: List[Dict[str, Any]], +) -> Dict[str, Any]: + """Queue example questions to be added when MAS becomes ONLINE. + + If the MAS is already ONLINE, examples are added immediately. + If not, they're queued and added automatically when the endpoint is ready. 
+ + Args: + tile_id: The MAS tile ID + examples: List of example dicts with: + - question: The example question + - guideline: Optional routing guideline + + Returns: + Dict with status (queued or added) and count + """ + manager = _get_manager() + + # Check current status + status = get_mas(tile_id) + if "error" in status: + return status + + if status.get("endpoint_status") == "ONLINE": + # Add immediately + created = manager.mas_add_examples_batch(tile_id, examples) + return { + "tile_id": tile_id, + "status": "added", + "added_count": len(created), + "total_requested": len(examples), + } + else: + # Queue for later + queue = get_tile_example_queue() + queue.start() + queue.enqueue(tile_id, manager, examples, tile_type="mas") + return { + "tile_id": tile_id, + "status": "queued", + "queued_count": len(examples), + "endpoint_status": status.get("endpoint_status"), + "message": "Examples will be added automatically when endpoint becomes ONLINE", + } + + +def list_examples(tile_id: str) -> Dict[str, Any]: + """List all examples for a Supervisor Agent. + + Args: + tile_id: The MAS tile ID + + Returns: + Dict with examples list + """ + manager = _get_manager() + result = manager.mas_list_examples(tile_id) + return { + "tile_id": tile_id, + "examples": result.get("examples", []), + "count": len(result.get("examples", [])), + } + + def _print_json(data: Any) -> None: """Print data as formatted JSON.""" print(json.dumps(data, indent=2)) @@ -289,7 +401,7 @@ def main(): if command == "create_mas": if len(sys.argv) < 4: - print("Usage: python manager.py create_mas NAME '{\"agents\": [...], ...}'") + print("Usage: python mas_manager.py create_mas NAME '{\"agents\": [...], ...}'") sys.exit(1) name = sys.argv[2] config = json.loads(sys.argv[3]) @@ -303,21 +415,21 @@ def main(): elif command == "get_mas": if len(sys.argv) < 3: - print("Usage: python manager.py get_mas TILE_ID") + print("Usage: python mas_manager.py get_mas TILE_ID") sys.exit(1) result = get_mas(sys.argv[2]) _print_json(result) elif command == "find_mas": if len(sys.argv) < 3: - print("Usage: python manager.py find_mas NAME") + print("Usage: python mas_manager.py find_mas NAME") sys.exit(1) result = find_mas(sys.argv[2]) _print_json(result) elif command == "update_mas": if len(sys.argv) < 4: - print("Usage: python manager.py update_mas TILE_ID '{\"name\": ..., \"agents\": [...], ...}'") + print("Usage: python mas_manager.py update_mas TILE_ID '{\"name\": ..., \"agents\": [...], ...}'") sys.exit(1) tile_id = sys.argv[2] config = json.loads(sys.argv[3]) @@ -332,7 +444,7 @@ def main(): elif command == "delete_mas": if len(sys.argv) < 3: - print("Usage: python manager.py delete_mas TILE_ID") + print("Usage: python mas_manager.py delete_mas TILE_ID") sys.exit(1) result = delete_mas(sys.argv[2]) _print_json(result) @@ -341,6 +453,31 @@ def main(): result = list_mas() _print_json(result) + elif command == "add_examples": + if len(sys.argv) < 4: + print("Usage: python mas_manager.py add_examples TILE_ID '[{\"question\": \"...\", \"guideline\": \"...\"}]'") + sys.exit(1) + tile_id = sys.argv[2] + examples = json.loads(sys.argv[3]) + result = add_examples(tile_id, examples) + _print_json(result) + + elif command == "add_examples_queued": + if len(sys.argv) < 4: + print("Usage: python mas_manager.py add_examples_queued TILE_ID '[{\"question\": \"...\", \"guideline\": \"...\"}]'") + sys.exit(1) + tile_id = sys.argv[2] + examples = json.loads(sys.argv[3]) + result = add_examples_queued(tile_id, examples) + _print_json(result) + + elif command == 
"list_examples": + if len(sys.argv) < 3: + print("Usage: python mas_manager.py list_examples TILE_ID") + sys.exit(1) + result = list_examples(sys.argv[2]) + _print_json(result) + else: print(f"Unknown command: {command}") print(__doc__) From 0e34e8abc4a361b0020c240128ae5c92ee3a0651 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 11:54:49 +0200 Subject: [PATCH 15/66] Add installation instructions to PDF generation skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add installation section with uv (preferred) and pip fallback for installing databricks-tools-core library. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-unstructured-pdf-generation/SKILL.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index bb3d262d..5c1e2802 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -11,6 +11,18 @@ Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. Generate PDFs from HTML using the `databricks-tools-core` library. You (the LLM) generate the HTML content, and the Python script handles conversion and upload. +## Installation + +Install the `databricks-tools-core` library before using PDF generation: + +```bash +# Preferred: use uv for faster installation +uv pip install databricks-tools-core + +# Fallback: use pip if uv is not available +pip install databricks-tools-core +``` + ## Python Script Pattern ```python From 3a817761ef18ab93a64430f213efcd121e5456f6 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 12:02:35 +0200 Subject: [PATCH 16/66] Make skills self-contained without core dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - mas_manager.py: Inline all agent_bricks functionality, use raw HTTP with WorkspaceClient for auth only (no core imports) - pdf_generator.py: New self-contained script using CLI for uploads (databricks fs cp) instead of SDK-based volume operations - Update SKILL.md files to reflect self-contained scripts - Update tests to work with new modules Skills now only require: - databricks-sdk (for auth in mas_manager) - requests (for HTTP in mas_manager) - plutoprint (for PDF generation) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../.tests/test_agent_bricks_manager.py | 2 +- .../databricks-agent-bricks/SKILL.md | 9 +- .../databricks-agent-bricks/mas_manager.py | 540 +++++++++++++----- .../SKILL.md | 52 +- .../pdf_generator.py | 284 +++++++++ 5 files changed, 734 insertions(+), 153 deletions(-) create mode 100644 databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py diff --git a/databricks-skills/.tests/test_agent_bricks_manager.py b/databricks-skills/.tests/test_agent_bricks_manager.py index a7b24aaa..fc5ded42 100644 --- a/databricks-skills/.tests/test_agent_bricks_manager.py +++ b/databricks-skills/.tests/test_agent_bricks_manager.py @@ -2,7 +2,7 @@ Integration tests for databricks-agent-bricks/mas_manager.py Tests the Supervisor Agent (MAS) CLI interface functions. -Requires databricks-tools-core.agent_bricks module. +The mas_manager.py is self-contained - requires only databricks-sdk and requests. 
""" import json diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 21e30a78..f9d98d67 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -105,7 +105,14 @@ See `databricks-genie` skill for: ### Supervisor Agent (MAS) -**NO CLI AVAILABLE** - Supervisor Agents are managed via the `mas_manager.py` script in this skill folder: +**NO CLI AVAILABLE** - Supervisor Agents are managed via the self-contained `mas_manager.py` script in this skill folder. + +Install requirements first: +```bash +pip install databricks-sdk requests +``` + +Usage: ```bash # List all Supervisor Agents diff --git a/databricks-skills/databricks-agent-bricks/mas_manager.py b/databricks-skills/databricks-agent-bricks/mas_manager.py index e9317321..27b6b424 100644 --- a/databricks-skills/databricks-agent-bricks/mas_manager.py +++ b/databricks-skills/databricks-agent-bricks/mas_manager.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Supervisor Agent (MAS) Manager - CLI interface for MAS operations. +Supervisor Agent (MAS) Manager - Self-contained CLI for MAS operations. Usage: python mas_manager.py create_mas "Name" '{"agents": [...], "description": "...", "instructions": "..."}' @@ -13,23 +13,380 @@ python mas_manager.py add_examples_queued TILE_ID '[{"question": "...", "guideline": "..."}]' python mas_manager.py list_examples TILE_ID -Requires: databricks-tools-core package +Requires: databricks-sdk, requests + pip install databricks-sdk requests """ import json +import logging +import re import sys -from typing import Any, Dict, List, Optional +import threading +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Optional, Tuple -from databricks_tools_core.agent_bricks import ( - AgentBricksManager, - EndpointStatus, - get_tile_example_queue, -) +import requests +from databricks.sdk import WorkspaceClient +logger = logging.getLogger(__name__) -def _get_manager() -> AgentBricksManager: - """Get AgentBricksManager instance.""" - return AgentBricksManager() + +# ============================================================================ +# Models +# ============================================================================ + + +class TileType(Enum): + """Tile types.""" + UNSPECIFIED = 0 + KIE = 1 + T2T = 2 + KA = 3 + MAO = 4 + MAS = 5 + + +class EndpointStatus(Enum): + """Endpoint status values.""" + ONLINE = "ONLINE" + OFFLINE = "OFFLINE" + PROVISIONING = "PROVISIONING" + NOT_READY = "NOT_READY" + + +@dataclass(frozen=True) +class MASIds: + """Supervisor Agent identifiers.""" + tile_id: str + name: str + + +# ============================================================================ +# MAS Manager Class +# ============================================================================ + + +class MASManager: + """Manager for Supervisor Agent (MAS) operations. + + Uses raw HTTP API calls since there's no CLI for MAS operations. + Authentication is handled via databricks-sdk WorkspaceClient. + """ + + def __init__(self, client: Optional[WorkspaceClient] = None): + """Initialize the MAS Manager. 
+ + Args: + client: Optional WorkspaceClient (creates new one if not provided) + """ + self.w: WorkspaceClient = client or WorkspaceClient() + + @staticmethod + def sanitize_name(name: str) -> str: + """Sanitize a name to ensure it's alphanumeric with only hyphens and underscores.""" + sanitized = name.replace(" ", "_") + sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", sanitized) + sanitized = re.sub(r"[_-]{2,}", "_", sanitized) + sanitized = sanitized.strip("_-") + if not sanitized: + sanitized = "supervisor_agent" + return sanitized + + # ======================================================================== + # MAS CRUD Operations + # ======================================================================== + + def create( + self, + name: str, + agents: List[Dict[str, Any]], + description: Optional[str] = None, + instructions: Optional[str] = None, + ) -> Dict[str, Any]: + """Create a Supervisor Agent with specified agents.""" + payload = {"name": self.sanitize_name(name), "agents": agents} + if description: + payload["description"] = description + if instructions: + payload["instructions"] = instructions + return self._post("/api/2.0/multi-agent-supervisors", payload) + + def get(self, tile_id: str) -> Optional[Dict[str, Any]]: + """Get MAS by tile_id.""" + try: + return self._get(f"/api/2.0/multi-agent-supervisors/{tile_id}") + except Exception as e: + if "does not exist" in str(e).lower() or "not found" in str(e).lower(): + return None + raise + + def update( + self, + tile_id: str, + name: Optional[str] = None, + description: Optional[str] = None, + instructions: Optional[str] = None, + agents: Optional[List[Dict[str, Any]]] = None, + ) -> Dict[str, Any]: + """Update a Supervisor Agent.""" + payload = {"tile_id": tile_id} + if name: + payload["name"] = self.sanitize_name(name) + if description: + payload["description"] = description + if instructions: + payload["instructions"] = instructions + if agents: + payload["agents"] = agents + return self._patch(f"/api/2.0/multi-agent-supervisors/{tile_id}", payload) + + def delete(self, tile_id: str) -> None: + """Delete a Supervisor Agent.""" + self._delete(f"/api/2.0/tiles/{tile_id}") + + def find_by_name(self, name: str) -> Optional[MASIds]: + """Find a MAS by exact display name.""" + sanitized_name = self.sanitize_name(name) + filter_q = f"name_contains={sanitized_name}&&tile_type=MAS" + page_token = None + while True: + params = {"filter": filter_q} + if page_token: + params["page_token"] = page_token + resp = self._get("/api/2.0/tiles", params=params) + for t in resp.get("tiles", []): + if t.get("name") == sanitized_name: + return MASIds(tile_id=t["tile_id"], name=sanitized_name) + page_token = resp.get("next_page_token") + if not page_token: + break + return None + + def list_all(self, page_size: int = 100) -> List[Dict[str, Any]]: + """List all Supervisor Agents.""" + all_tiles = [] + filter_q = "tile_type=MAS" + page_token = None + + while True: + params = {"page_size": page_size, "filter": filter_q} + if page_token: + params["page_token"] = page_token + + resp = self._get("/api/2.0/tiles", params=params) + for tile in resp.get("tiles", []): + tile_type = tile.get("tile_type") + if tile_type in ("MAS", "5"): + all_tiles.append(tile) + + page_token = resp.get("next_page_token") + if not page_token: + break + + return all_tiles + + def get_endpoint_status(self, tile_id: str) -> Optional[str]: + """Get the endpoint status of a MAS.""" + mas = self.get(tile_id) + if not mas: + return None + return mas.get("multi_agent_supervisor", 
{}).get("status", {}).get("endpoint_status") + + # ======================================================================== + # Examples Management + # ======================================================================== + + def create_example(self, tile_id: str, question: str, guidelines: Optional[List[str]] = None) -> Dict[str, Any]: + """Create an example question for the MAS.""" + payload = {"tile_id": tile_id, "question": question} + if guidelines: + payload["guidelines"] = guidelines + return self._post(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples", payload) + + def list_examples(self, tile_id: str, page_size: int = 100) -> Dict[str, Any]: + """List all examples for a MAS.""" + return self._get(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples", params={"page_size": page_size}) + + def delete_example(self, tile_id: str, example_id: str) -> None: + """Delete an example from the MAS.""" + self._delete(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples/{example_id}") + + def add_examples_batch(self, tile_id: str, questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Add multiple example questions in parallel.""" + created_examples = [] + + def create_example(q: Dict[str, Any]) -> Optional[Dict[str, Any]]: + question_text = q.get("question", "") + guidelines = q.get("guideline") + if guidelines and isinstance(guidelines, str): + guidelines = [guidelines] + + if not question_text: + return None + try: + return self.create_example(tile_id, question_text, guidelines) + except Exception as e: + logger.error(f"Failed to add MAS example '{question_text[:50]}...': {e}") + return None + + max_workers = min(2, len(questions)) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_q = {executor.submit(create_example, q): q for q in questions} + for future in as_completed(future_to_q): + result = future.result() + if result: + created_examples.append(result) + + return created_examples + + # ======================================================================== + # HTTP Helpers + # ======================================================================== + + def _handle_response_error(self, response: requests.Response, method: str, path: str) -> None: + """Extract detailed error from response and raise.""" + if response.status_code >= 400: + try: + error_data = response.json() + error_msg = error_data.get("message", error_data.get("error", str(error_data))) + raise Exception(f"{method} {path} failed: {error_msg}") + except ValueError: + raise Exception(f"{method} {path} failed with status {response.status_code}: {response.text}") + + def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + headers = self.w.config.authenticate() + url = f"{self.w.config.host}{path}" + response = requests.get(url, headers=headers, params=params or {}, timeout=20) + self._handle_response_error(response, "GET", path) + return response.json() + + def _post(self, path: str, body: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + headers = self.w.config.authenticate() + headers["Content-Type"] = "application/json" + url = f"{self.w.config.host}{path}" + response = requests.post(url, headers=headers, json=body, timeout=timeout) + self._handle_response_error(response, "POST", path) + return response.json() + + def _patch(self, path: str, body: Dict[str, Any]) -> Dict[str, Any]: + headers = self.w.config.authenticate() + headers["Content-Type"] = "application/json" + url = f"{self.w.config.host}{path}" + response = requests.patch(url, 
headers=headers, json=body, timeout=20) + self._handle_response_error(response, "PATCH", path) + return response.json() + + def _delete(self, path: str) -> Dict[str, Any]: + headers = self.w.config.authenticate() + url = f"{self.w.config.host}{path}" + response = requests.delete(url, headers=headers, timeout=20) + self._handle_response_error(response, "DELETE", path) + return response.json() + + +# ============================================================================ +# Example Queue (for adding examples when MAS becomes ONLINE) +# ============================================================================ + + +class TileExampleQueue: + """Background queue for adding examples to tiles that aren't ready yet.""" + + def __init__(self, poll_interval: float = 30.0, max_attempts: int = 120): + self.queue: Dict[str, Tuple[MASManager, List[Dict[str, Any]], float, int]] = {} + self.lock = threading.Lock() + self.running = False + self.thread: Optional[threading.Thread] = None + self.poll_interval = poll_interval + self.max_attempts = max_attempts + + def enqueue(self, tile_id: str, manager: MASManager, questions: List[Dict[str, Any]]) -> None: + """Add a tile and its questions to the processing queue.""" + with self.lock: + self.queue[tile_id] = (manager, questions, time.time(), 0) + logger.info(f"Enqueued {len(questions)} examples for MAS {tile_id}") + + if not self.running: + self.start() + + def start(self) -> None: + """Start the background processing thread.""" + if not self.running: + self.running = True + self.thread = threading.Thread(target=self._process_loop, daemon=True) + self.thread.start() + + def stop(self) -> None: + """Stop the background processing thread.""" + self.running = False + if self.thread: + self.thread.join(timeout=5) + + def _process_loop(self) -> None: + """Background loop that checks tile status and adds examples when ready.""" + while self.running: + try: + with self.lock: + items_to_process = list(self.queue.items()) + + for tile_id, (manager, questions, enqueue_time, attempt_count) in items_to_process: + try: + if attempt_count >= self.max_attempts: + logger.error(f"MAS {tile_id} exceeded max attempts. 
Removing from queue.") + with self.lock: + self.queue.pop(tile_id, None) + continue + + with self.lock: + if tile_id in self.queue: + self.queue[tile_id] = (manager, questions, enqueue_time, attempt_count + 1) + + status = manager.get_endpoint_status(tile_id) + + if status == EndpointStatus.ONLINE.value: + logger.info(f"MAS {tile_id} is ONLINE, adding {len(questions)} examples...") + created = manager.add_examples_batch(tile_id, questions) + logger.info(f"Added {len(created)} examples to MAS {tile_id}") + with self.lock: + self.queue.pop(tile_id, None) + + except Exception as e: + logger.error(f"Error processing MAS {tile_id}: {e}") + with self.lock: + self.queue.pop(tile_id, None) + + except Exception as e: + logger.error(f"Error in queue processor: {e}") + + time.sleep(self.poll_interval) + + +# Global singleton queue instance +_tile_example_queue: Optional[TileExampleQueue] = None +_queue_lock = threading.Lock() + + +def get_tile_example_queue() -> TileExampleQueue: + """Get or create the global tile example queue instance.""" + global _tile_example_queue + if _tile_example_queue is None: + with _queue_lock: + if _tile_example_queue is None: + _tile_example_queue = TileExampleQueue() + return _tile_example_queue + + +# ============================================================================ +# CLI Functions +# ============================================================================ + + +def _get_manager() -> MASManager: + """Get MASManager instance.""" + return MASManager() def _build_agent_list(agents: List[Dict[str, str]]) -> List[Dict[str, Any]]: @@ -80,24 +437,11 @@ def create_mas( description: str = None, instructions: str = None, ) -> Dict[str, Any]: - """Create a new Supervisor Agent. - - Args: - name: Display name for the MAS - agents: List of agent configs, each with: - - name: Agent identifier - - description: What this agent handles (critical for routing) - - ONE OF: endpoint_name, genie_space_id, ka_tile_id, uc_function_name, connection_name - description: What this MAS does - instructions: Routing rules for the supervisor - - Returns: - Dict with tile_id, name, endpoint_status - """ + """Create a new Supervisor Agent.""" manager = _get_manager() agent_list = _build_agent_list(agents) - result = manager.mas_create( + result = manager.create( name=name, agents=agent_list, description=description, @@ -117,16 +461,9 @@ def create_mas( def get_mas(tile_id: str) -> Dict[str, Any]: - """Get a Supervisor Agent by tile ID. - - Args: - tile_id: The MAS tile ID - - Returns: - Dict with tile_id, name, description, endpoint_status, agents, instructions - """ + """Get a Supervisor Agent by tile ID.""" manager = _get_manager() - result = manager.mas_get(tile_id) + result = manager.get(tile_id) if not result: return {"error": f"Supervisor Agent {tile_id} not found"} @@ -146,21 +483,14 @@ def get_mas(tile_id: str) -> Dict[str, Any]: def find_mas(name: str) -> Dict[str, Any]: - """Find a Supervisor Agent by name. 
- - Args: - name: The display name to search for - - Returns: - Dict with found, tile_id, name, endpoint_status if found - """ + """Find a Supervisor Agent by name.""" manager = _get_manager() - result = manager.mas_find_by_name(name) + result = manager.find_by_name(name) if result is None: return {"found": False, "name": name} - full_details = manager.mas_get(result.tile_id) + full_details = manager.get(result.tile_id) if full_details: mas_data = full_details.get("multi_agent_supervisor", {}) status_data = mas_data.get("status", {}) @@ -186,29 +516,16 @@ def update_mas( description: str = None, instructions: str = None, ) -> Dict[str, Any]: - """Update an existing Supervisor Agent. - - Args: - tile_id: The MAS tile ID to update - name: New display name (optional) - agents: New agent list (optional) - description: New description (optional) - instructions: New routing instructions (optional) - - Returns: - Dict with tile_id, name, endpoint_status - """ + """Update an existing Supervisor Agent.""" manager = _get_manager() - # Get existing to merge - existing = manager.mas_get(tile_id) + existing = manager.get(tile_id) if not existing: return {"error": f"Supervisor Agent {tile_id} not found"} mas_data = existing.get("multi_agent_supervisor", {}) tile_data = mas_data.get("tile", {}) - # Use existing values if not provided final_name = name or tile_data.get("name", "") final_description = description or tile_data.get("description", "") final_instructions = instructions or mas_data.get("instructions", "") @@ -218,7 +535,7 @@ def update_mas( else: agent_list = mas_data.get("agents", []) - result = manager.mas_update( + result = manager.update( tile_id=tile_id, name=final_name, description=final_description, @@ -238,14 +555,7 @@ def update_mas( def delete_mas(tile_id: str) -> Dict[str, Any]: - """Delete a Supervisor Agent. - - Args: - tile_id: The MAS tile ID to delete - - Returns: - Dict with success status - """ + """Delete a Supervisor Agent.""" manager = _get_manager() try: manager.delete(tile_id) @@ -255,51 +565,32 @@ def delete_mas(tile_id: str) -> Dict[str, Any]: def list_mas() -> List[Dict[str, Any]]: - """List all Supervisor Agents. - - Returns: - List of MAS summaries with tile_id, name, endpoint_status - """ + """List all Supervisor Agents.""" manager = _get_manager() results = [] - # List all tiles and filter to MAS type - tiles = manager.list_tiles() + tiles = manager.list_all() for tile in tiles: - if tile.tile_type == "MULTI_AGENT_SUPERVISOR": - details = manager.mas_get(tile.tile_id) - if details: - mas_data = details.get("multi_agent_supervisor", {}) - tile_data = mas_data.get("tile", {}) - status_data = mas_data.get("status", {}) - results.append({ - "tile_id": tile.tile_id, - "name": tile_data.get("name", ""), - "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), - "agents_count": len(mas_data.get("agents", [])), - }) + tile_id = tile.get("tile_id") + details = manager.get(tile_id) + if details: + mas_data = details.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + results.append({ + "tile_id": tile_id, + "name": tile_data.get("name", ""), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(mas_data.get("agents", [])), + }) return results -def add_examples( - tile_id: str, - examples: List[Dict[str, Any]], -) -> Dict[str, Any]: - """Add example questions to a Supervisor Agent. 
- - Args: - tile_id: The MAS tile ID - examples: List of example dicts with: - - question: The example question - - guideline: Optional routing guideline (e.g., "Should route to billing_agent") - - Returns: - Dict with added_count and any errors - """ +def add_examples(tile_id: str, examples: List[Dict[str, Any]]) -> Dict[str, Any]: + """Add example questions to a Supervisor Agent.""" manager = _get_manager() - # Check if MAS is online status = get_mas(tile_id) if "error" in status: return status @@ -311,8 +602,7 @@ def add_examples( "tile_id": tile_id, } - # Add examples directly - created = manager.mas_add_examples_batch(tile_id, examples) + created = manager.add_examples_batch(tile_id, examples) return { "tile_id": tile_id, "added_count": len(created), @@ -320,34 +610,16 @@ def add_examples( } -def add_examples_queued( - tile_id: str, - examples: List[Dict[str, Any]], -) -> Dict[str, Any]: - """Queue example questions to be added when MAS becomes ONLINE. - - If the MAS is already ONLINE, examples are added immediately. - If not, they're queued and added automatically when the endpoint is ready. - - Args: - tile_id: The MAS tile ID - examples: List of example dicts with: - - question: The example question - - guideline: Optional routing guideline - - Returns: - Dict with status (queued or added) and count - """ +def add_examples_queued(tile_id: str, examples: List[Dict[str, Any]]) -> Dict[str, Any]: + """Queue example questions to be added when MAS becomes ONLINE.""" manager = _get_manager() - # Check current status status = get_mas(tile_id) if "error" in status: return status if status.get("endpoint_status") == "ONLINE": - # Add immediately - created = manager.mas_add_examples_batch(tile_id, examples) + created = manager.add_examples_batch(tile_id, examples) return { "tile_id": tile_id, "status": "added", @@ -355,10 +627,9 @@ def add_examples_queued( "total_requested": len(examples), } else: - # Queue for later queue = get_tile_example_queue() queue.start() - queue.enqueue(tile_id, manager, examples, tile_type="mas") + queue.enqueue(tile_id, manager, examples) return { "tile_id": tile_id, "status": "queued", @@ -369,16 +640,9 @@ def add_examples_queued( def list_examples(tile_id: str) -> Dict[str, Any]: - """List all examples for a Supervisor Agent. - - Args: - tile_id: The MAS tile ID - - Returns: - Dict with examples list - """ + """List all examples for a Supervisor Agent.""" manager = _get_manager() - result = manager.mas_list_examples(tile_id) + result = manager.list_examples(tile_id) return { "tile_id": tile_id, "examples": result.get("examples", []), diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 5c1e2802..1be5a38b 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -9,25 +9,25 @@ Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. ## Overview -Generate PDFs from HTML using the `databricks-tools-core` library. You (the LLM) generate the HTML content, and the Python script handles conversion and upload. +Generate PDFs from HTML using the self-contained `pdf_generator.py` script. You (the LLM) generate the HTML content, and the Python script handles conversion and upload using the Databricks CLI. 
## Installation -Install the `databricks-tools-core` library before using PDF generation: +Install plutoprint for HTML to PDF conversion: ```bash # Preferred: use uv for faster installation -uv pip install databricks-tools-core +uv pip install plutoprint # Fallback: use pip if uv is not available -pip install databricks-tools-core +pip install plutoprint ``` ## Python Script Pattern ```python -# generate_pdf.py -from databricks_tools_core.pdf_generator import generate_and_upload_pdf +# Import from the skill's pdf_generator.py +from pdf_generator import generate_and_upload_pdf result = generate_and_upload_pdf( html_content=html_content, # Complete HTML document @@ -37,7 +37,7 @@ result = generate_and_upload_pdf( volume="raw_data", # Volume name (default: "raw_data") folder=None, # Optional subfolder ) -print(f"Uploaded to: {result['volume_path']}") +print(f"Uploaded to: {result.volume_path}") ``` **Returns:** @@ -49,12 +49,30 @@ print(f"Uploaded to: {result['volume_path']}") } ``` +## CLI Usage + +The script can also be run directly from command line: + +```bash +# Generate from inline HTML +python pdf_generator.py generate --html '

<html><body>Hello</body></html>
' \ + --filename hello.pdf --catalog my_catalog --schema my_schema + +# Generate from HTML file +python pdf_generator.py generate --html-file input.html \ + --filename report.pdf --catalog my_catalog --schema my_schema --folder reports + +# Get JSON output +python pdf_generator.py generate --html '...' --filename test.pdf \ + --catalog my_catalog --schema my_schema --json +``` + ## Quick Start Generate a simple PDF: ```python -from databricks_tools_core.pdf_generator import generate_and_upload_pdf +from pdf_generator import generate_and_upload_pdf generate_and_upload_pdf( html_content=''' @@ -88,7 +106,7 @@ generate_and_upload_pdf( ```python import concurrent.futures -from databricks_tools_core.pdf_generator import generate_and_upload_pdf +from pdf_generator import generate_and_upload_pdf pdfs_to_generate = [ {"html_content": "...Employee Handbook content...", "filename": "employee_handbook.pdf"}, @@ -112,7 +130,7 @@ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: results = list(executor.map(generate_pdf, pdfs_to_generate)) for result in results: - print(f"Uploaded: {result['volume_path']}") + print(f"Uploaded: {result.volume_path}") ``` By calling these in parallel (not sequentially), 5 PDFs that would take 15-25 seconds sequentially complete in 3-5 seconds total. @@ -231,7 +249,9 @@ PlutoPrint supports modern CSS3: Generate API documentation, user guides, or technical specs: -``` +```python +from pdf_generator import generate_and_upload_pdf + generate_and_upload_pdf( html_content=''' @@ -260,7 +280,9 @@ Content-Type: application/json ### Pattern 2: Business Reports -``` +```python +from pdf_generator import generate_and_upload_pdf + generate_and_upload_pdf( html_content=''' @@ -291,7 +313,9 @@ generate_and_upload_pdf( ### Pattern 3: HR Policies -``` +```python +from pdf_generator import generate_and_upload_pdf + generate_and_upload_pdf( html_content=''' @@ -334,6 +358,7 @@ When asked to generate multiple PDFs: - Unity Catalog schema must exist - Volume must exist (default: `raw_data`) - User must have WRITE permission on the volume +- Databricks CLI must be configured and authenticated ## Troubleshooting @@ -343,3 +368,4 @@ When asked to generate multiple PDFs: | "Schema does not exist" | Create the schema or check the name | | PDF looks wrong | Check HTML/CSS syntax, use supported CSS features | | Slow generation | Call multiple PDFs in parallel, not sequentially | +| CLI not found | Ensure `databricks` CLI is installed and in PATH | diff --git a/databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py b/databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py new file mode 100644 index 00000000..97ce0725 --- /dev/null +++ b/databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py @@ -0,0 +1,284 @@ +#!/usr/bin/env python3 +""" +PDF Generator - Self-contained HTML to PDF generation and upload to Unity Catalog volumes. + +Usage: + python pdf_generator.py generate --html '...' 
--filename report.pdf --catalog my_catalog --schema my_schema + python pdf_generator.py generate --html-file input.html --filename report.pdf --catalog my_catalog --schema my_schema --volume raw_data --folder docs + +Requires: plutoprint + pip install plutoprint +""" + +import argparse +import json +import logging +import subprocess +import sys +import tempfile +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +logger = logging.getLogger(__name__) + + +@dataclass +class PDFResult: + """Result from generating a PDF.""" + success: bool + volume_path: Optional[str] = None + error: Optional[str] = None + + def to_dict(self) -> dict: + return { + "success": self.success, + "volume_path": self.volume_path, + "error": self.error, + } + + +def _convert_html_to_pdf(html_content: str, output_path: str) -> bool: + """Convert HTML content to PDF using PlutoPrint. + + Args: + html_content: HTML string to convert + output_path: Path where PDF should be saved + + Returns: + True if successful, False otherwise + """ + output_dir = Path(output_path).parent + output_dir.mkdir(parents=True, exist_ok=True) + + try: + import plutoprint + + logger.debug(f"Converting HTML to PDF using PlutoPrint: {output_path}") + + book = plutoprint.Book(plutoprint.PAGE_SIZE_A4) + book.load_html(html_content) + book.write_to_pdf(output_path) + + if Path(output_path).exists(): + file_size = Path(output_path).stat().st_size + logger.info(f"PDF saved: {output_path} (size: {file_size:,} bytes)") + return True + else: + logger.error("PlutoPrint conversion failed - file not created") + return False + + except ImportError: + logger.error("PlutoPrint is not installed. Install with: pip install plutoprint") + return False + except Exception as e: + logger.error(f"Failed to convert HTML to PDF: {str(e)}", exc_info=True) + return False + + +def _run_cli(args: list[str], check: bool = True) -> subprocess.CompletedProcess: + """Run a databricks CLI command. + + Args: + args: Command arguments (without 'databricks' prefix) + check: Whether to raise on non-zero exit code + + Returns: + CompletedProcess with stdout/stderr + """ + cmd = ["databricks"] + args + logger.debug(f"Running: {' '.join(cmd)}") + return subprocess.run(cmd, capture_output=True, text=True, check=check) + + +def _validate_volume_exists(catalog: str, schema: str, volume: str) -> Optional[str]: + """Validate that the volume exists using CLI. + + Args: + catalog: Catalog name + schema: Schema name + volume: Volume name + + Returns: + Error message if validation fails, None if successful + """ + # Check volume exists + result = _run_cli(["volumes", "read", f"{catalog}.{schema}.{volume}"], check=False) + if result.returncode != 0: + return f"Volume '{catalog}.{schema}.{volume}' does not exist or is not accessible: {result.stderr}" + return None + + +def _upload_to_volume(local_path: str, volume_path: str) -> Optional[str]: + """Upload a file to Unity Catalog volume using CLI. + + Args: + local_path: Local file path + volume_path: Volume path (e.g., /Volumes/catalog/schema/volume/file.pdf) + + Returns: + Error message if upload fails, None if successful + """ + result = _run_cli(["fs", "cp", local_path, volume_path, "--overwrite"], check=False) + if result.returncode != 0: + return f"Failed to upload to {volume_path}: {result.stderr}" + return None + + +def _create_volume_directory(volume_path: str) -> None: + """Create a directory in the volume using CLI (best effort). 
+ + Args: + volume_path: Volume directory path + """ + # Use fs mkdirs - it's idempotent + _run_cli(["fs", "mkdirs", volume_path], check=False) + + +def generate_and_upload_pdf( + html_content: str, + filename: str, + catalog: str, + schema: str, + volume: str = "raw_data", + folder: Optional[str] = None, +) -> PDFResult: + """Convert HTML to PDF and upload to a Unity Catalog volume. + + Args: + html_content: Complete HTML document (including , , , + ...

<html><body>Hello World</body></html>
+ ... + ... ''' + >>> result = generate_and_upload_pdf( + ... html_content=html, + ... filename="hello.pdf", + ... catalog="my_catalog", + ... schema="my_schema", + ... ) + >>> print(result.volume_path) + /Volumes/my_catalog/my_schema/raw_data/hello.pdf + """ + # Ensure filename ends with .pdf + if not filename.lower().endswith(".pdf"): + filename = f"{filename}.pdf" + + # Validate volume exists + error = _validate_volume_exists(catalog, schema, volume) + if error: + return PDFResult(success=False, error=error) + + # Build volume path + if folder: + volume_path = f"/Volumes/{catalog}/{schema}/{volume}/{folder}/{filename}" + else: + volume_path = f"/Volumes/{catalog}/{schema}/{volume}/{filename}" + + try: + with tempfile.TemporaryDirectory() as temp_dir: + local_pdf_path = str(Path(temp_dir) / filename) + + # Convert HTML to PDF + if not _convert_html_to_pdf(html_content, local_pdf_path): + return PDFResult(success=False, error="Failed to convert HTML to PDF") + + # Create folder if needed + if folder: + folder_path = f"/Volumes/{catalog}/{schema}/{volume}/{folder}" + _create_volume_directory(folder_path) + + # Upload to volume + error = _upload_to_volume(local_pdf_path, volume_path) + if error: + return PDFResult(success=False, error=error) + + logger.info(f"PDF uploaded to {volume_path}") + return PDFResult(success=True, volume_path=volume_path) + + except Exception as e: + error_msg = f"Error generating PDF: {str(e)}" + logger.error(error_msg, exc_info=True) + return PDFResult(success=False, error=error_msg) + + +def main(): + """CLI entry point.""" + parser = argparse.ArgumentParser( + description="Generate PDFs from HTML and upload to Unity Catalog volumes", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Generate from inline HTML + python pdf_generator.py generate --html '

<html><body>Hello</body></html>
' \\ + --filename hello.pdf --catalog my_catalog --schema my_schema + + # Generate from HTML file + python pdf_generator.py generate --html-file input.html \\ + --filename report.pdf --catalog my_catalog --schema my_schema --folder reports + """, + ) + + subparsers = parser.add_subparsers(dest="command", help="Commands") + + # Generate command + gen_parser = subparsers.add_parser("generate", help="Generate PDF from HTML") + gen_parser.add_argument("--html", help="HTML content as string") + gen_parser.add_argument("--html-file", help="Path to HTML file") + gen_parser.add_argument("--filename", required=True, help="Output PDF filename") + gen_parser.add_argument("--catalog", required=True, help="Unity Catalog name") + gen_parser.add_argument("--schema", required=True, help="Schema name") + gen_parser.add_argument("--volume", default="raw_data", help="Volume name (default: raw_data)") + gen_parser.add_argument("--folder", help="Optional folder within volume") + gen_parser.add_argument("--json", action="store_true", help="Output result as JSON") + + args = parser.parse_args() + + if args.command == "generate": + # Get HTML content + if args.html: + html_content = args.html + elif args.html_file: + with open(args.html_file, "r") as f: + html_content = f.read() + else: + print("Error: Either --html or --html-file is required") + sys.exit(1) + + result = generate_and_upload_pdf( + html_content=html_content, + filename=args.filename, + catalog=args.catalog, + schema=args.schema, + volume=args.volume, + folder=args.folder, + ) + + if args.json: + print(json.dumps(result.to_dict(), indent=2)) + else: + if result.success: + print(f"Success: PDF uploaded to {result.volume_path}") + else: + print(f"Error: {result.error}") + sys.exit(1) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() From 095ddc040fe66203551fdbafd5d748b0d07dc893 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 15:18:23 +0200 Subject: [PATCH 17/66] Move skill Python scripts to scripts/ subfolders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move mas_manager.py to databricks-agent-bricks/scripts/ - Move conversation.py to databricks-genie/scripts/ - Move pdf_generator.py to databricks-unstructured-pdf-generation/scripts/ - Update all markdown references to use scripts/ path πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../.tests/test_agent_bricks_manager.py | 2 +- .../.tests/test_genie_conversation.py | 2 +- .../1-knowledge-assistants.md | 2 +- .../2-supervisor-agents.md | 40 ++++++++++------- .../databricks-agent-bricks/SKILL.md | 28 ++++++------ .../{ => scripts}/mas_manager.py | 0 databricks-skills/databricks-genie/SKILL.md | 10 ++--- .../databricks-genie/conversation.md | 44 +++++++++---------- .../{ => scripts}/conversation.py | 0 .../SKILL.md | 8 ++-- .../{ => scripts}/pdf_generator.py | 0 11 files changed, 71 insertions(+), 65 deletions(-) rename databricks-skills/databricks-agent-bricks/{ => scripts}/mas_manager.py (100%) rename databricks-skills/databricks-genie/{ => scripts}/conversation.py (100%) rename databricks-skills/databricks-unstructured-pdf-generation/{ => scripts}/pdf_generator.py (100%) diff --git a/databricks-skills/.tests/test_agent_bricks_manager.py b/databricks-skills/.tests/test_agent_bricks_manager.py index fc5ded42..d6c7d62f 100644 --- a/databricks-skills/.tests/test_agent_bricks_manager.py +++ b/databricks-skills/.tests/test_agent_bricks_manager.py @@ -1,5 +1,5 
@@ """ -Integration tests for databricks-agent-bricks/mas_manager.py +Integration tests for databricks-agent-bricks/scripts/mas_manager.py Tests the Supervisor Agent (MAS) CLI interface functions. The mas_manager.py is self-contained - requires only databricks-sdk and requests. diff --git a/databricks-skills/.tests/test_genie_conversation.py b/databricks-skills/.tests/test_genie_conversation.py index 2ec4f281..0ada389f 100644 --- a/databricks-skills/.tests/test_genie_conversation.py +++ b/databricks-skills/.tests/test_genie_conversation.py @@ -1,5 +1,5 @@ """ -Integration tests for databricks-genie/conversation.py +Integration tests for databricks-genie/scripts/conversation.py Tests the Genie Conversation API CLI interface. Requires databricks.sdk for Genie Space operations. diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index fa28eb9f..90015dbf 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -168,7 +168,7 @@ The endpoint name follows this pattern: `ka-{tile_id}-endpoint` databricks knowledge-assistants list-knowledge-assistants # Then use the tile_id to create a Supervisor Agent with mas_manager.py -python mas_manager.py create_mas "Support_MAS" '{ +python scripts/mas_manager.py create_mas "Support_MAS" '{ "agents": [ { "name": "hr_agent", diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index f2c6cf74..893c49be 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -135,10 +135,10 @@ Reference the UC Connection using the `connection_name` field: ### Complete Example: Multi-System Supervisor -Example showing integration of Genie, KA, and external MCP using `mas_manager.py`: +Example showing integration of Genie, KA, and external MCP using `scripts/mas_manager.py`: ```bash -python mas_manager.py create_mas "AP_Invoice_Supervisor" '{ +python scripts/mas_manager.py create_mas "AP_Invoice_Supervisor" '{ "description": "AP automation assistant with analytics, policy guidance, and operational actions", "instructions": "Route queries as follows:\n- Data questions (invoice counts, spend analysis, vendor metrics) β†’ billing_analyst\n- Policy questions (thresholds, SLAs, compliance rules) β†’ policy_expert\n- Actions (approve, reject, flag, search, workflows) β†’ ap_operations\n\nWhen a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations.", "agents": [ @@ -181,10 +181,10 @@ SELECT http_request( ## Creating a Supervisor Agent -**NO CLI AVAILABLE** - Use the `mas_manager.py` script in this skill folder: +**NO CLI AVAILABLE** - Use the `scripts/mas_manager.py` script in this skill folder: ```bash -python mas_manager.py create_mas "Customer Support MAS" '{ +python scripts/mas_manager.py create_mas "Customer Support MAS" '{ "description": "Routes customer queries to specialized support agents", "instructions": "Analyze the user'\''s question and route to the most appropriate agent. If unclear, ask for clarification.", "agents": [ @@ -252,7 +252,7 @@ After creation, the Supervisor Agent endpoint needs to provision: | `ONLINE` | Ready to route queries | - | | `OFFLINE` | Not currently running | - | -Use `python mas_manager.py get_mas TILE_ID` to check the status. 
+Use `python scripts/mas_manager.py get_mas TILE_ID` to check the status. ## Adding Example Questions @@ -261,23 +261,29 @@ Example questions help with evaluation and can guide routing optimization. ### Using the CLI ```bash -# Add examples immediately (MAS must be ONLINE) -python mas_manager.py add_examples TILE_ID '[ +# Add examples (MAS must be ONLINE) +python scripts/mas_manager.py add_examples TILE_ID '[ {"question": "I haven'\''t received my invoice for this month", "guideline": "Should be routed to billing_agent"}, {"question": "The API is returning a 500 error", "guideline": "Should be routed to technical_agent"}, {"question": "How many vacation days do I have?", "guideline": "Should be routed to hr_agent"} ]' -# Add examples with queuing (waits for ONLINE if provisioning) -python mas_manager.py add_examples_queued TILE_ID '[ - {"question": "I haven'\''t received my invoice for this month", "guideline": "Should be routed to billing_agent"} -]' - # List existing examples -python mas_manager.py list_examples TILE_ID +python scripts/mas_manager.py list_examples TILE_ID +``` + +### Adding Examples in Jobs + +When creating a MAS in a Databricks job and immediately adding examples, use `add_examples_wait` to wait for the endpoint to become ONLINE: + +```bash +# Waits up to 10 minutes for ONLINE, then adds examples +python scripts/mas_manager.py add_examples_wait TILE_ID '[ + {"question": "...", "guideline": "..."} +]' ``` -If the Supervisor Agent is not yet `ONLINE`, use `add_examples_queued` - examples are queued and added automatically when the endpoint becomes ready. +This is useful in automated pipelines where you create a MAS and need to add examples in the same job without manual intervention. ## Best Practices @@ -332,14 +338,14 @@ Consider adding a general-purpose agent for queries that don't fit elsewhere: ## Updating a Supervisor Agent -To update an existing Supervisor Agent, use `mas_manager.py`: +To update an existing Supervisor Agent, use `scripts/mas_manager.py`: ```bash # Get current state -python mas_manager.py get_mas TILE_ID +python scripts/mas_manager.py get_mas TILE_ID # Update with new configuration -python mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' +python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' ``` 1. **Add/remove agents**: Include updated `agents` list diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index f9d98d67..94f3d68a 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -105,7 +105,7 @@ See `databricks-genie` skill for: ### Supervisor Agent (MAS) -**NO CLI AVAILABLE** - Supervisor Agents are managed via the self-contained `mas_manager.py` script in this skill folder. +**NO CLI AVAILABLE** - Supervisor Agents are managed via the self-contained `scripts/mas_manager.py` script in this skill folder. 
Install requirements first: ```bash @@ -116,31 +116,31 @@ Usage: ```bash # List all Supervisor Agents -python mas_manager.py list_mas +python scripts/mas_manager.py list_mas # Create a Supervisor Agent -python mas_manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' +python scripts/mas_manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' # Get a Supervisor Agent by tile ID -python mas_manager.py get_mas TILE_ID +python scripts/mas_manager.py get_mas TILE_ID # Find a Supervisor Agent by name -python mas_manager.py find_mas "My Supervisor" +python scripts/mas_manager.py find_mas "My Supervisor" # Update a Supervisor Agent -python mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' +python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' # Delete a Supervisor Agent -python mas_manager.py delete_mas TILE_ID +python scripts/mas_manager.py delete_mas TILE_ID -# Add examples (must be ONLINE) -python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' +# Add examples (MAS must be ONLINE) +python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' -# Add examples (queued - waits for ONLINE) -python mas_manager.py add_examples_queued TILE_ID '[{"question": "...", "guideline": "..."}]' +# Add examples with wait (for jobs - waits up to 10 min for ONLINE) +python scripts/mas_manager.py add_examples_wait TILE_ID '[{"question": "...", "guideline": "..."}]' # List examples -python mas_manager.py list_examples TILE_ID +python scripts/mas_manager.py list_examples TILE_ID ``` Agent configuration options (provide exactly one per agent): @@ -193,10 +193,10 @@ For KA, if `add_examples_from_volume=true`, examples are automatically extracted ## Example: Multi-Modal Supervisor Agent -Use `mas_manager.py` to create a Supervisor Agent: +Use `scripts/mas_manager.py` to create a Supervisor Agent: ```bash -python mas_manager.py create_mas "Enterprise Support Supervisor" '{ +python scripts/mas_manager.py create_mas "Enterprise Support Supervisor" '{ "description": "Comprehensive enterprise support agent", "instructions": "Route queries as follows:\n1. Policy/procedure questions β†’ knowledge_base\n2. Data analysis requests β†’ analytics_engine\n3. Ticket classification β†’ ml_classifier", "agents": [ diff --git a/databricks-skills/databricks-agent-bricks/mas_manager.py b/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py similarity index 100% rename from databricks-skills/databricks-agent-bricks/mas_manager.py rename to databricks-skills/databricks-agent-bricks/scripts/mas_manager.py diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index 39c0ab00..8f3e9975 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -69,18 +69,18 @@ databricks genie import-space --json '{ ### Conversation API (Query) -Use the `conversation.py` script in this skill folder to ask questions: +Use the `scripts/conversation.py` script in this skill folder to ask questions: ```bash # Ask a question to a Genie Space -python conversation.py ask SPACE_ID "What were total sales last month?" +python scripts/conversation.py ask SPACE_ID "What were total sales last month?" 
# Returns: {question, conversation_id, message_id, status, sql, columns, data, row_count} # Follow-up question in same conversation -python conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID +python scripts/conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID # With custom timeout (default: 60 seconds) -python conversation.py ask SPACE_ID "Complex analysis query" --timeout 120 +python scripts/conversation.py ask SPACE_ID "Complex analysis query" --timeout 120 ``` ### Table Inspection @@ -124,7 +124,7 @@ databricks genie create-space --json '{ ### 3. Ask Questions (Conversation API) ```bash -python conversation.py ask YOUR_SPACE_ID "What were total sales last month?" +python scripts/conversation.py ask YOUR_SPACE_ID "What were total sales last month?" # Returns: SQL, columns, data, row_count ``` diff --git a/databricks-skills/databricks-genie/conversation.md b/databricks-skills/databricks-genie/conversation.md index 4be5aa64..760c00d7 100644 --- a/databricks-skills/databricks-genie/conversation.md +++ b/databricks-skills/databricks-genie/conversation.md @@ -4,7 +4,7 @@ Use the Genie Conversation API to ask natural language questions to a curated Ge ## Overview -The `conversation.py` script in this skill folder allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. +The `scripts/conversation.py` script in this skill folder allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. ## When to Use the Conversation API @@ -29,10 +29,10 @@ The `conversation.py` script in this skill folder allows you to programmatically ## CLI Usage -Use the `conversation.py` script to ask questions: +Use the `scripts/conversation.py` script to ask questions: ```bash -python conversation.py ask SPACE_ID "Your question here" +python scripts/conversation.py ask SPACE_ID "Your question here" ``` ## Basic Usage @@ -40,7 +40,7 @@ python conversation.py ask SPACE_ID "Your question here" ### Ask a Question ```bash -python conversation.py ask 01abc123... "What were total sales last month?" +python scripts/conversation.py ask 01abc123... "What were total sales last month?" ``` **Response:** @@ -63,11 +63,11 @@ Use the `conversation_id` from the first response to ask follow-up questions wit ```bash # First question - capture the conversation_id from output -python conversation.py ask 01abc123... "What were total sales last month?" +python scripts/conversation.py ask 01abc123... "What were total sales last month?" # Output includes: "conversation_id": "conv_xyz789" # Follow-up (uses context from first question) -python conversation.py ask 01abc123... "Break that down by region" --conversation-id conv_xyz789 +python scripts/conversation.py ask 01abc123... "Break that down by region" --conversation-id conv_xyz789 ``` Genie remembers the context, so "that" refers to "total sales last month". @@ -94,7 +94,7 @@ Genie remembers the context, so "that" refers to "total sales last month". The script returns JSON that can be parsed: ```bash -python conversation.py ask SPACE_ID "Who are our top 10 customers?" 
| jq '.status' +python scripts/conversation.py ask SPACE_ID "Who are our top 10 customers?" | jq '.status' # Output: "COMPLETED" ``` @@ -107,7 +107,7 @@ Response fields when status is `COMPLETED`: ### Failed Response ```bash -python conversation.py ask SPACE_ID "What is the meaning of life?" | jq '.status, .error' +python scripts/conversation.py ask SPACE_ID "What is the meaning of life?" | jq '.status, .error' # Output: "FAILED" # Output: "Could not generate SQL for this question" ``` @@ -117,7 +117,7 @@ Genie couldn't answer - may need to rephrase or use direct SQL. ### Timeout ```bash -python conversation.py ask SPACE_ID "Complex query" --timeout 120 | jq '.status' +python scripts/conversation.py ask SPACE_ID "Complex query" --timeout 120 | jq '.status' # If timeout occurs: "TIMEOUT" ``` @@ -132,7 +132,7 @@ User: "Ask my Sales Genie what the churn rate is" Claude: 1. Identifies user wants to use Genie (explicit request) -2. Runs: python conversation.py ask sales_genie_id "What is the churn rate?" +2. Runs: python scripts/conversation.py ask sales_genie_id "What is the churn rate?" 3. Returns: "Based on your Sales Genie, the churn rate is 4.2%. Genie used this SQL: SELECT ..." ``` @@ -145,8 +145,8 @@ User: "I just created a Genie Space for HR data. Can you test it?" Claude: 1. Gets the space_id from the user or recent databricks genie create-space result 2. Runs conversation.py with test questions: - - python conversation.py ask SPACE_ID "How many employees do we have?" - - python conversation.py ask SPACE_ID "What is the average salary by department?" + - python scripts/conversation.py ask SPACE_ID "How many employees do we have?" + - python scripts/conversation.py ask SPACE_ID "What is the average salary by department?" 3. Reports results: "Your HR Genie is working. It correctly answered..." ``` @@ -156,12 +156,12 @@ Claude: User: "Use my analytics Genie to explore sales trends" Claude: -1. python conversation.py ask SPACE_ID "What were total sales by month this year?" +1. python scripts/conversation.py ask SPACE_ID "What were total sales by month this year?" # Returns conversation_id: conv_xyz 2. User: "Which month had the highest growth?" -3. python conversation.py ask SPACE_ID "Which month had the highest growth?" -c conv_xyz +3. python scripts/conversation.py ask SPACE_ID "Which month had the highest growth?" -c conv_xyz 4. User: "What products drove that growth?" -5. python conversation.py ask SPACE_ID "What products drove that growth?" -c conv_xyz +5. python scripts/conversation.py ask SPACE_ID "What products drove that growth?" -c conv_xyz ``` ## Best Practices @@ -172,13 +172,13 @@ Don't reuse conversations across unrelated questions: ```bash # Good: New conversation for new topic -python conversation.py ask SPACE_ID "What were sales last month?" # New conversation -python conversation.py ask SPACE_ID "How many employees do we have?" # New conversation +python scripts/conversation.py ask SPACE_ID "What were sales last month?" # New conversation +python scripts/conversation.py ask SPACE_ID "How many employees do we have?" # New conversation # Good: Follow-up for related question -python conversation.py ask SPACE_ID "What were sales last month?" +python scripts/conversation.py ask SPACE_ID "What were sales last month?" 
# Get conversation_id from output, then: -python conversation.py ask SPACE_ID "Break that down by product" -c CONV_ID # Related follow-up +python scripts/conversation.py ask SPACE_ID "Break that down by product" -c CONV_ID # Related follow-up ``` ### Handle Clarification Requests @@ -186,7 +186,7 @@ python conversation.py ask SPACE_ID "Break that down by product" -c CONV_ID # R Genie may ask for clarification instead of returning results: ```bash -python conversation.py ask SPACE_ID "Show me the data" | jq '.text_response' +python scripts/conversation.py ask SPACE_ID "Show me the data" | jq '.text_response' # If Genie needs clarification, text_response will contain the question # Rephrase with more specifics ``` @@ -199,10 +199,10 @@ python conversation.py ask SPACE_ID "Show me the data" | jq '.text_response' ```bash # Quick question (default 60s) -python conversation.py ask SPACE_ID "How many orders today?" +python scripts/conversation.py ask SPACE_ID "How many orders today?" # Complex analysis with longer timeout -python conversation.py ask SPACE_ID "Calculate customer lifetime value for all customers" --timeout 180 +python scripts/conversation.py ask SPACE_ID "Calculate customer lifetime value for all customers" --timeout 180 ``` ## Troubleshooting diff --git a/databricks-skills/databricks-genie/conversation.py b/databricks-skills/databricks-genie/scripts/conversation.py similarity index 100% rename from databricks-skills/databricks-genie/conversation.py rename to databricks-skills/databricks-genie/scripts/conversation.py diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 1be5a38b..4f1af2ee 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -9,7 +9,7 @@ Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. ## Overview -Generate PDFs from HTML using the self-contained `pdf_generator.py` script. You (the LLM) generate the HTML content, and the Python script handles conversion and upload using the Databricks CLI. +Generate PDFs from HTML using the self-contained `scripts/pdf_generator.py` script. You (the LLM) generate the HTML content, and the Python script handles conversion and upload using the Databricks CLI. ## Installation @@ -55,15 +55,15 @@ The script can also be run directly from command line: ```bash # Generate from inline HTML -python pdf_generator.py generate --html '

<html><body><h1>Hello</h1></body></html>
' \ +python scripts/pdf_generator.py generate --html '

<html><body><h1>Hello</h1></body></html>
' \ --filename hello.pdf --catalog my_catalog --schema my_schema # Generate from HTML file -python pdf_generator.py generate --html-file input.html \ +python scripts/pdf_generator.py generate --html-file input.html \ --filename report.pdf --catalog my_catalog --schema my_schema --folder reports # Get JSON output -python pdf_generator.py generate --html '...' --filename test.pdf \ +python scripts/pdf_generator.py generate --html '...' --filename test.pdf \ --catalog my_catalog --schema my_schema --json ``` diff --git a/databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py b/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py similarity index 100% rename from databricks-skills/databricks-unstructured-pdf-generation/pdf_generator.py rename to databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py From 69c9ce73fe7520aba471d00805fb7101c29bc10f Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 15:28:04 +0200 Subject: [PATCH 18/66] Add CLI syntax patterns to prevent common errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use --json syntax for creating UC objects (catalogs, schemas, volumes) - Document correct JSON format for each create operation - Add SQL execution alternative for creating objects - Fix incorrect positional args syntax in multiple skill files The --json syntax is the most reliable across CLI versions. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- DECOMMISSION_PLAN.md | 270 ++++++++++++++++++ databricks-skills/databricks-config/SKILL.md | 48 ++++ .../references/1-project-initialization.md | 2 +- .../databricks-unity-catalog/SKILL.md | 23 ++ 4 files changed, 342 insertions(+), 1 deletion(-) create mode 100644 DECOMMISSION_PLAN.md diff --git a/DECOMMISSION_PLAN.md b/DECOMMISSION_PLAN.md new file mode 100644 index 00000000..93bfa905 --- /dev/null +++ b/DECOMMISSION_PLAN.md @@ -0,0 +1,270 @@ +# MCP Server Decommissioning Plan + +## Executive Summary + +This plan outlines removing `databricks-tools-core` and `databricks-mcp-server` from the main AI Dev Kit project, simplifying the installation to focus on **standalone skills only**. + +## Current State Analysis + +### What Exists Today + +| Component | Purpose | Dependencies | +|-----------|---------|--------------| +| `databricks-tools-core/` | Python library with high-level Databricks functions | None (standalone) | +| `databricks-mcp-server/` | MCP server exposing 50+ tools | Depends on databricks-tools-core | +| `databricks-skills/` | Markdown skills + self-contained Python scripts | **None** (already standalone) | +| `databricks-builder-app/` | Full-stack web application | **Depends on BOTH** tools-core and mcp-server | + +### Files Referencing MCP/Core + +**Shell scripts:** +- `install.sh` (main installer) - lines 1071, 251, 657, etc. +- `databricks-mcp-server/setup.sh` +- `.claude-plugin/setup.sh` +- `databricks-builder-app/scripts/deploy.sh` (lines 193-195) +- `databricks-builder-app/scripts/start_local.sh` (lines 205-206) + +**Documentation:** +- `README.md` - references both packages in "What's Included" and "Core Library" sections +- `SECURITY.md` - mentions packages in installation flow +- `CONTRIBUTING.md` - setup instructions reference mcp-server +- `databricks-builder-app/README.md` - architecture diagram includes mcp-server + +## builder-app Refactoring (Much Simpler Than Expected!) 
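+In short, the refactor swaps `databricks_tools_core` helpers for direct `databricks-sdk` calls. As a rough sketch (illustrative only, not the final implementation), the auth helper becomes a thin wrapper around the SDK:
+
+```python
+# Illustrative sketch - assumes databricks-sdk is installed and unified auth is configured
+from databricks.sdk import WorkspaceClient
+
+def get_workspace_client() -> WorkspaceClient:
+    # product/product_version replace the inlined PRODUCT_NAME / PRODUCT_VERSION constants
+    return WorkspaceClient(product="databricks-builder-app", product_version="0.1.0")
+```
+
+The SDK's unified auth resolves PAT, OAuth, or profile-based credentials automatically, which is why no custom auth layer is needed here.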
+ +### Reference Implementation + +A cleaner solution exists in `industry-demo-prompts/app/src/demo_prompt_generator/backend/services/agent.py`. + +**Key insight:** MCP tools are NOT needed. Skills + standard SDK tools provide everything: + +```python +# Note: MCP tools removed - ai-dev-kit now uses CLI tools via skills +allowed_tools = ["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"] +``` + +### Current builder-app Dependencies + +| File | Import | Can Be Removed? | +|------|--------|----------------| +| `server/services/agent.py` | `databricks_tools_core.auth` | Yes - use `databricks.sdk.WorkspaceClient()` directly | +| `server/services/databricks_tools.py` | `databricks_mcp_server.*` | **DELETE ENTIRE FILE** | +| `server/services/clusters.py` | `databricks_tools_core.auth` | Yes - use SDK directly | +| `server/services/warehouses.py` | `databricks_tools_core.auth` | Yes - use SDK directly | +| `server/services/user.py` | `databricks_tools_core.identity` | Yes - inline constants | +| `server/db/database.py` | `databricks_tools_core.identity` | Yes - inline constants | +| `alembic/env.py` | `databricks_tools_core.identity` | Yes - inline constants | + +### Refactoring Steps + +1. **Delete `databricks_tools.py`** (433 lines) - No longer needed +2. **Simplify `agent.py`**: + - Remove MCP server loading + - Use standard SDK tools: `["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"]` + - Add `setting_sources=["project"]` to enable skill discovery + - Copy client pooling pattern from reference implementation +3. **Replace auth imports** - Use `databricks.sdk.WorkspaceClient()` directly +4. **Inline identity constants**: + ```python + # Instead of: from databricks_tools_core.identity import PRODUCT_NAME, PRODUCT_VERSION + PRODUCT_NAME = "databricks-builder-app" + PRODUCT_VERSION = "0.1.0" + ``` +5. **Update deploy.sh** - Remove package copying steps +6. **Update pyproject.toml** - Remove `databricks_tools_core*` and `databricks_mcp_server*` from includes + +### Code Reduction + +| File | Before | After | +|------|--------|-------| +| `databricks_tools.py` | 433 lines | **DELETED** | +| `agent.py` | ~400 lines | ~300 lines | +| `deploy.sh` | Complex pkg copy | Simple | + +**Total: ~500+ lines removed, simpler architecture** + +### Phase 2: Simplify Main Project + +Once builder-app is self-contained: + +#### 2.1 Delete Folders +```bash +rm -rf databricks-tools-core/ +rm -rf databricks-mcp-server/ +``` + +#### 2.2 Simplify install.sh + +**Option A: Remove MCP entirely (Recommended)** + +Replace the 1790-line `install.sh` with a simplified version that: +- Only installs skills (like `install_skills.sh` does) +- Removes all MCP configuration code +- Removes the Python venv creation for MCP + +**Option B: Keep MCP as optional** + +Keep `--skills-only` as default, make MCP opt-in via `--with-mcp`: +- Default behavior = skills only +- `--with-mcp` = old behavior + +#### 2.3 Update Documentation + +**README.md changes:** +- Remove "Core Library" section +- Remove "MCP Tools Only" from table +- Remove databricks-tools-core from "What's Included" +- Update architecture diagram (remove MCP layer) + +**Files to update:** +- `README.md` +- `SECURITY.md` +- `CONTRIBUTING.md` +- `databricks-builder-app/README.md` + +#### 2.4 Update Other Files + +- `.mcp.json` - Delete or update +- `.claude-plugin/setup.sh` - Remove core/mcp references +- `pyproject.toml` (if any) - Update dependencies + +## Installation Flow Comparison + +### Current Flow (install.sh) +``` +1. Clone repo to ~/.ai-dev-kit +2. 
Create Python venv +3. pip install databricks-tools-core + databricks-mcp-server +4. Install skills to .claude/skills/ +5. Write MCP config to claude_desktop_config.json, etc. +``` + +### Simplified Flow (after decommissioning) +``` +1. Install skills to .claude/skills/ (directly from GitHub) +2. Done! +``` + +## Migration Guide for Users + +Users who want MCP tools after decommissioning: + +1. **Use databricks CLI directly** - Skills now guide users to use CLI commands +2. **Use databricks SDK** - Skills include Python SDK examples +3. **Fork the MCP server** - If they really need it, they can fork the repo at the commit before removal + +## Risks and Mitigations + +| Risk | Mitigation | +|------|------------| +| builder-app breaks | Phase 1 must complete before Phase 2 | +| Users depend on MCP | Document migration path; skills cover same functionality | +| Lost test coverage | Move relevant tests to databricks-skills/.tests/ | + +## File Deletion Summary + +**Folders to delete:** +- `databricks-tools-core/` (~20 Python files, ~15K lines) +- `databricks-mcp-server/` (~15 Python files, ~10K lines) + +**Files to heavily modify:** +- `install.sh` (reduce from 1790 lines to ~500) +- `README.md` (remove 4+ sections) +- `CONTRIBUTING.md` (remove MCP setup) +- `SECURITY.md` (update installation flow) + +**Files to delete:** +- `.mcp.json` (MCP config example) + +## Pre-requisite: Fix Skills Integration Tests + +Before proceeding with decommissioning, fix the broken integration tests in `databricks-skills/.tests/`: + +### Current Test Status + +| Test File | Unit Tests | Integration Tests | Status | +|-----------|------------|-------------------|--------| +| `test_agent_bricks_manager.py` | 5 pass | 3 skip (no workspace) | OK | +| `test_pdf_generator.py` | 13 pass | 3 fail | **NEEDS FIX** | + +### Failing Tests (test_pdf_generator.py) + +``` +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_and_upload_pdf +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_and_upload_pdf_with_folder +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_complex_pdf +``` + +**Root cause:** Test volume `ai_dev_kit.test_pdf_generation.raw_data` doesn't exist. + +### Fix Required + +Update `test_pdf_generator.py` to skip gracefully when test volume is unavailable: + +```python +@pytest.fixture(autouse=True) +def skip_if_volume_missing(self, test_config): + """Skip tests if the required volume doesn't exist.""" + error = _validate_volume_exists( + test_config["catalog"], + test_config["schema"], + test_config["volume"] + ) + if error: + pytest.skip(f"Test volume not available: {error}") +``` + +### Additional Integration Tests Needed + +For complete coverage, add integration tests for remaining skills with Python files: + +| Skill | Python File | Test Status | +|-------|-------------|-------------| +| `databricks-agent-bricks` | `mas_manager.py` | Has tests | +| `databricks-unstructured-pdf-generation` | `pdf_generator.py` | Has tests (needs fix) | +| Other skills with .py files | Various | Need tests | + +## Recommended Execution Order + +### Phase 0: Fix Skills Tests +1. [ ] **Fix broken integration tests** (test_pdf_generator.py skip when volume missing) +2. [ ] Add integration tests for remaining skills with Python files + +### Phase 1: Refactor builder-app (Much Simpler Now!) + +**Reference implementation:** `../industry-demo-prompts/app/src/demo_prompt_generator/backend/services/agent.py` + +3. 
[ ] Update `pyproject.toml`: + - Bump `claude-agent-sdk>=0.1.50` (from 0.1.19) + - Remove `databricks_tools_core*` and `databricks_mcp_server*` from includes +4. [ ] Delete `server/services/databricks_tools.py` entirely +5. [ ] Simplify `server/services/agent.py`: + - Remove MCP imports and loading + - Use standard tools: `["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"]` + - Add `setting_sources=["project"]` for skill discovery + - Adopt client pooling pattern from reference implementation +6. [ ] Replace `databricks_tools_core.auth` β†’ `databricks.sdk.WorkspaceClient()` +7. [ ] Inline `PRODUCT_NAME`, `PRODUCT_VERSION` constants +8. [ ] Update `deploy.sh` - remove package copying +9. [ ] Test builder-app locally and deployed + +### Phase 2: Simplify Main Project +10. [ ] Simplify `install.sh` to skills-only (remove MCP setup) +11. [ ] Update `install.ps1` (Windows) similarly +12. [ ] Update `README.md` +13. [ ] Update `CONTRIBUTING.md` +14. [ ] Update `SECURITY.md` + +### Phase 3: Delete and Verify +15. [ ] Delete `databricks-tools-core/` +16. [ ] Delete `databricks-mcp-server/` +17. [ ] Delete `.mcp.json` +18. [ ] Delete `.claude-plugin/` (or update if needed) +19. [ ] Test full installation flow (skills-only) +20. [ ] Test builder-app deployment + +## Questions to Resolve + +1. **Should we archive MCP in a separate branch?** - For users who want to fork it +2. **What about install.ps1 (Windows)?** - Same changes needed +3. **Keep .claude-plugin/ ?** - This also references MCP diff --git a/databricks-skills/databricks-config/SKILL.md b/databricks-skills/databricks-config/SKILL.md index 58441f4a..6ee921ce 100644 --- a/databricks-skills/databricks-config/SKILL.md +++ b/databricks-skills/databricks-config/SKILL.md @@ -96,3 +96,51 @@ print(w.current_user.me().user_name) ``` > **Note:** Profile changes via environment variables or CLI flags are session-scoped. For permanent profile setup, use `databricks auth login -p ` and update `~/.databrickscfg` with `cluster_id` or `serverless_compute_id = auto`. + +## CLI Syntax Patterns + +**IMPORTANT**: Use `--json` for creating Unity Catalog objects. This is the most reliable syntax. 
+ +```bash +# βœ… CORRECT - use --json for create operations +databricks catalogs create --json '{"name": "my_catalog"}' +databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' +``` + +### Common CLI Patterns + +```bash +# Get help for any command +databricks --help +databricks schemas create --help + +# List operations +databricks catalogs list +databricks schemas list CATALOG_NAME +databricks volumes list CATALOG_NAME.SCHEMA_NAME +databricks clusters list +databricks warehouses list + +# Create operations (use --json) +databricks catalogs create --json '{"name": "my_catalog"}' +databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' + +# Delete operations (use full name) +databricks catalogs delete CATALOG_NAME +databricks schemas delete CATALOG_NAME.SCHEMA_NAME +databricks volumes delete CATALOG_NAME.SCHEMA_NAME.VOLUME_NAME +``` + +### SQL Execution via CLI + +```bash +# Run SQL query +databricks sql execute --warehouse-id WAREHOUSE_ID --query "SELECT * FROM catalog.schema.table LIMIT 10" + +# Create objects via SQL (alternative approach) +databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE CATALOG my_catalog" +databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE SCHEMA my_catalog.my_schema" +databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE VOLUME my_catalog.my_schema.my_volume" +``` diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md index fcea0c45..c5fdc00a 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md @@ -429,7 +429,7 @@ pip install --upgrade databricks-cli databricks catalogs list # Create catalog if needed -databricks catalogs create --name my_catalog +databricks catalogs create --json '{"name": "my_catalog"}' ``` ### "Language option not recognized" diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 5b9da187..335a923a 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -29,6 +29,29 @@ Use this skill when: ## Quick Start +### Create Unity Catalog Objects (CLI) + +**IMPORTANT**: Use `--json` for creating UC objects. Positional args vary by command and version. 
+ +```bash +# Create a catalog +databricks catalogs create --json '{"name": "my_catalog"}' + +# Create a schema +databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' + +# Create a managed volume +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' + +# Create an external volume +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "EXTERNAL", "storage_location": "s3://bucket/path"}' + +# List catalogs, schemas, volumes +databricks catalogs list +databricks schemas list my_catalog +databricks volumes list my_catalog.my_schema +``` + ### Volume File Operations (CLI) ```bash From a908657482b0a071c2e8b91ae6a4436e7d8a43ad Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 15:30:19 +0200 Subject: [PATCH 19/66] Fix CLI syntax: use --json for UC object creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Use --json syntax for catalogs, schemas, volumes create commands - Remove incorrect positional argument examples - Simplify volume example (remove external variant) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-unity-catalog/SKILL.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 335a923a..46dd2906 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -40,12 +40,9 @@ databricks catalogs create --json '{"name": "my_catalog"}' # Create a schema databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' -# Create a managed volume +# Create a volume databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' -# Create an external volume -databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "EXTERNAL", "storage_location": "s3://bucket/path"}' - # List catalogs, schemas, volumes databricks catalogs list databricks schemas list my_catalog From 5c4189128035cc832997cdd799ccf2dff7af3c04 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 15:44:23 +0200 Subject: [PATCH 20/66] Remove DatabricksEnv references - use standard DatabricksSession MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DatabricksEnv does not exist in current databricks-connect versions. 
Updated all skills to use: - DatabricksSession.builder.serverless(True).getOrCreate() - Local dependency installation via uv/pip πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../references/1-databricks-connect.md | 13 +-- .../databricks-synthetic-data-gen/SKILL.md | 15 +-- .../references/2-troubleshooting.md | 27 +---- .../scripts/generate_synthetic_data.py | 110 +++--------------- 4 files changed, 30 insertions(+), 135 deletions(-) diff --git a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md index 8b0d74ad..39be79a4 100644 --- a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md +++ b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md @@ -30,16 +30,12 @@ auth_type = databricks-cli ## Usage Pattern ```python -from databricks.connect import DatabricksSession, DatabricksEnv - -# Declare dependencies installed on serverless compute -# CRITICAL: Include ALL packages used inside UDFs (pandas/numpy are there by default) -env = DatabricksEnv().withDependencies("faker", "holidays") +from databricks.connect import DatabricksSession +# Install dependencies locally first: uv pip install faker holidays spark = ( DatabricksSession.builder - .profile("my-workspace") # optional: run on a specific profile from ~/.databrickscfg instead of default - .withEnvironment(env) + .profile("my-workspace") # optional: use a specific profile from ~/.databrickscfg .serverless(True) .getOrCreate() ) @@ -54,9 +50,8 @@ df.write.mode('overwrite').saveAsTable("catalog.schema.table") | Issue | Solution | |-------|----------| | `Python 3.12 required` | create venv with correct python version | -| `DatabricksEnv not found` | Upgrade to databricks-connect >= 16.4 | | `serverless_compute_id` error | Add `serverless_compute_id = auto` to ~/.databrickscfg | -| `ModuleNotFoundError` inside UDF | Add the package to `withDependencies()` | +| `ModuleNotFoundError` inside UDF | Install the package locally: `uv pip install ` | | `PERSIST TABLE not supported` | Don't use `.cache()` or `.persist()` with serverless | | `broadcast` is used | Don't broadcast small DF using spark connect, have a small python list instead or join small DF | diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md index c046e488..d2e61596 100644 --- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -133,19 +133,18 @@ After generating data, use `get_volume_folder_details` to validate the output ma - Schema matches expected columns and types - Data distributions look reasonable (check column stats) -## Use Databricks Connect Spark + Faker Pattern +## Use Databricks Connect Spark + Faker Pattern ```python -from databricks.connect import DatabricksSession, DatabricksEnv +from databricks.connect import DatabricksSession from pyspark.sql import functions as F from pyspark.sql.types import StringType import pandas as pd -# Setup serverless with dependencies (MUST list all libs used in UDFs) -env = DatabricksEnv().withDependencies("faker", "holidays") -spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +# Setup serverless Spark session +spark = DatabricksSession.builder.serverless(True).getOrCreate() -# Pandas UDF pattern - import lib INSIDE the function +# Pandas 
UDF pattern - import lib INSIDE the function (libs must be installed locally) @F.pandas_udf(StringType()) def fake_name(ids: pd.Series) -> pd.Series: from faker import Faker # Import inside UDF @@ -248,9 +247,7 @@ uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays | Issue | Solution | |-------|----------| -| `ImportError: cannot import name 'DatabricksEnv'` | Upgrade: `uv pip install "databricks-connect>=16.4"` | -| Python 3.11 instead of 3.12 | Python 3.12 required. Use `uv` to create env with correct version | -| `ModuleNotFoundError: faker` | Add to `withDependencies()`, import inside UDF | +| `ModuleNotFoundError: faker` | Install locally: `uv pip install faker`, import inside UDF | | Faker UDF is slow | Use `pandas_udf` for batch processing | | Out of memory | Increase `numPartitions` in `spark.range()` | | Referential integrity errors | Write master table to Delta first, read back for FK joins | diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md index 420b3500..7379fb2f 100644 --- a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md @@ -12,31 +12,16 @@ Common issues and solutions for synthetic data generation. | Mode | Solution | |------|----------| -| **DB Connect 16.4+** | Use `DatabricksEnv().withDependencies("faker", "pandas", ...)` | -| **Older DB Connect with Serverless** | Create job with `environments` parameter | -| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | +| **DB Connect with Serverless** | Install libs locally (`uv pip install faker`), use `DatabricksSession.builder.serverless(True)` | +| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | | **Classic cluster** | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | ```python -# For DB Connect 16.4+ -from databricks.connect import DatabricksSession, DatabricksEnv +# For DB Connect with serverless +from databricks.connect import DatabricksSession -env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") -spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() -``` - -### DatabricksEnv not found - -**Problem:** Using older databricks-connect version. 
- -**Solution:** Upgrade to 16.4+ or use job-based approach: - -```bash -# Upgrade (prefer uv, fall back to pip) -uv pip install "databricks-connect>=16.4,<17.4" -# or: pip install "databricks-connect>=16.4,<17.4" - -# Or use job with environments parameter instead +# Install dependencies locally first: uv pip install faker pandas numpy holidays +spark = DatabricksSession.builder.serverless(True).getOrCreate() ``` ### serverless_compute_id error diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py index b9f953fa..7841edf4 100644 --- a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py +++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py @@ -6,9 +6,9 @@ - Direct write to Unity Catalog - Works with serverless and classic compute -Auto-detects environment and uses: -- DatabricksEnv with managed dependencies if databricks-connect >= 16.4 (local) -- Standard session if running on Databricks Runtime or older databricks-connect +Prerequisites: +- Install dependencies locally: uv pip install faker pandas numpy holidays databricks-connect +- Configure ~/.databrickscfg with serverless_compute_id = auto """ import sys import os @@ -61,105 +61,23 @@ REGION_PROBS = [0.4, 0.25, 0.2, 0.15] # ============================================================================= -# ENVIRONMENT DETECTION AND SESSION CREATION +# SESSION CREATION # ============================================================================= -def is_databricks_runtime(): - """Check if running on Databricks Runtime vs locally.""" - return "DATABRICKS_RUNTIME_VERSION" in os.environ - -def get_databricks_connect_version(): - """Get databricks-connect version as (major, minor) tuple or None.""" - try: - import importlib.metadata - version_str = importlib.metadata.version('databricks-connect') - parts = version_str.split('.') - return (int(parts[0]), int(parts[1])) - except Exception: - return None - -# Detect environment -on_runtime = is_databricks_runtime() -db_version = get_databricks_connect_version() +from databricks.connect import DatabricksSession print("=" * 80) -print("ENVIRONMENT DETECTION") +print("CREATING SPARK SESSION") print("=" * 80) -print(f"Running on Databricks Runtime: {on_runtime}") -if db_version: - print(f"databricks-connect version: {db_version[0]}.{db_version[1]}") -else: - print("databricks-connect: not available") - -# Use DatabricksEnv with managed dependencies if: -# - Running locally (not on Databricks Runtime) -# - databricks-connect >= 16.4 -use_managed_deps = (not on_runtime) and db_version and db_version >= (16, 4) - -if use_managed_deps: - print("Using DatabricksEnv with managed dependencies") - print("=" * 80) - from databricks.connect import DatabricksSession, DatabricksEnv - - env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") - - if USE_SERVERLESS: - spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() - print("Connected to serverless compute with managed dependencies!") - else: - if not CLUSTER_ID: - raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") - spark = DatabricksSession.builder.withEnvironment(env).clusterId(CLUSTER_ID).getOrCreate() - print(f"Connected to cluster with managed dependencies!") + +if USE_SERVERLESS: + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Connected to serverless compute") 
else: - print("Using standard session (dependencies must be pre-installed)") - print("=" * 80) - - # Check that UDF dependencies are available - print("\nChecking UDF dependencies...") - missing_deps = [] - - try: - from faker import Faker - print(" faker: OK") - except ImportError: - missing_deps.append("faker") - print(" faker: MISSING") - - try: - import pandas as pd - print(" pandas: OK") - except ImportError: - missing_deps.append("pandas") - print(" pandas: MISSING") - - if missing_deps: - print("\n" + "=" * 80) - print("ERROR: Missing dependencies for UDFs") - print("=" * 80) - print(f"Missing: {', '.join(missing_deps)}") - if on_runtime: - print('\nSolution: Install libraries via Databricks CLI:') - print(' databricks libraries install --json \'{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}\'') - else: - print("\nSolution: Upgrade to databricks-connect >= 16.4 for managed deps") - print(" Or create a job with environment settings") - print("=" * 80) - sys.exit(1) - - print("\nAll dependencies available") - print("=" * 80) - - from databricks.connect import DatabricksSession - - if USE_SERVERLESS: - spark = DatabricksSession.builder.serverless(True).getOrCreate() - print("Connected to serverless compute") - else: - if not CLUSTER_ID: - raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") - spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() - print(f"Connected to cluster ") + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() + print(f"Connected to cluster {CLUSTER_ID}") # Import Faker for UDF definitions from faker import Faker From 6f45a1725d99c988b57a85427772725ef96fecd3 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 15:56:57 +0200 Subject: [PATCH 21/66] Update synthetic data skill to use CLI SQL for validation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Post-Generation Validation section with CLI SQL examples - Update troubleshooting.md with CLI-based validation queries - Remove in-script .show() calls from generate_synthetic_data.py - Validate data using `databricks sql execute` instead of DataFrame API πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-synthetic-data-gen/SKILL.md | 15 ++-- .../references/2-troubleshooting.md | 70 ++++++++++++++----- .../scripts/generate_synthetic_data.py | 8 --- 3 files changed, 61 insertions(+), 32 deletions(-) diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md index d2e61596..3ba55e87 100644 --- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -126,12 +126,17 @@ Show a clear specification with **the business story and your assumptions surfac **Do NOT proceed to code generation until user approves the plan, including the catalog.** -### Post-Generation Checklist +### Post-Generation Validation -After generating data, use `get_volume_folder_details` to validate the output matches requirements: -- Row counts match the plan -- Schema matches expected columns and types -- Data distributions look reasonable (check column stats) +Use `databricks sql execute` to validate generated data (row counts, distributions, referential integrity). 
Query parquet files directly: + +```bash +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT COUNT(*) FROM parquet.\`/Volumes/CATALOG/SCHEMA/raw_data/customers\` +" +``` + +See [references/2-troubleshooting.md](references/2-troubleshooting.md) for full validation examples. ## Use Databricks Connect Spark + Faker Pattern diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md index 7379fb2f..0bbdfe9b 100644 --- a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md @@ -285,25 +285,57 @@ resolution_hours = np.random.exponential(scale=resolution_scale[priority]) ## Validation Steps -After generation, verify your data: +After generation, validate using SQL queries via Databricks CLI: -```python -# 1. Check row counts -print(f"Customers: {customers_df.count():,}") -print(f"Orders: {orders_df.count():,}") - -# 2. Verify distributions -customers_df.groupBy("tier").count().show() -orders_df.describe("amount").show() - -# 3. Check referential integrity -orphans = orders_df.join( - customers_df, - orders_df.customer_id == customers_df.customer_id, - "left_anti" -) -print(f"Orphan orders: {orphans.count()}") +```bash +# Set your warehouse ID +WAREHOUSE_ID="your-warehouse-id" +VOLUME_PATH="/Volumes/CATALOG/SCHEMA/raw_data" -# 4. Verify date range -orders_df.select(F.min("order_date"), F.max("order_date")).show() +# 1. Check row counts +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT 'customers' as table_name, COUNT(*) as row_count FROM parquet.\`${VOLUME_PATH}/customers\` +UNION ALL +SELECT 'orders', COUNT(*) FROM parquet.\`${VOLUME_PATH}/orders\` +" + +# 2. Preview schema and sample data +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +DESCRIBE SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` +" + +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` LIMIT 5 +" + +# 3. Verify distributions +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT tier, COUNT(*) as count, ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) as pct +FROM parquet.\`${VOLUME_PATH}/customers\` +GROUP BY tier ORDER BY tier +" + +# 4. Check amount statistics +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT + MIN(amount) as min_amount, + MAX(amount) as max_amount, + ROUND(AVG(amount), 2) as avg_amount, + ROUND(STDDEV(amount), 2) as stddev_amount +FROM parquet.\`${VOLUME_PATH}/orders\` +" + +# 5. Check referential integrity +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT COUNT(*) as orphan_orders +FROM parquet.\`${VOLUME_PATH}/orders\` o +LEFT JOIN parquet.\`${VOLUME_PATH}/customers\` c ON o.customer_id = c.customer_id +WHERE c.customer_id IS NULL +" + +# 6. 
Verify date range +databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +SELECT MIN(order_date) as min_date, MAX(order_date) as max_date +FROM parquet.\`${VOLUME_PATH}/orders\` +" ``` diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py index 7841edf4..b36edb8e 100644 --- a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py +++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py @@ -178,10 +178,6 @@ def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: customers_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/customers") print(f" Saved customers to {VOLUME_PATH}/customers") -# Show tier distribution -print("\n Tier distribution:") -customers_df.groupBy("tier").count().orderBy("tier").show() - # ============================================================================= # GENERATE ORDERS (Child Table with Referential Integrity) # ============================================================================= @@ -284,10 +280,6 @@ def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: orders_final.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") print(f" Saved orders to {VOLUME_PATH}/orders") -# Show status distribution -print("\n Status distribution:") -orders_final.groupBy("status").count().orderBy("status").show() - # ============================================================================= # CLEANUP AND SUMMARY # ============================================================================= From 933271d09d3d2dd6f203825e007d6e953e4b6060 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 16:45:05 +0200 Subject: [PATCH 22/66] Fix pdf_generator import path to use scripts/ folder MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../SKILL.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 4f1af2ee..f146c02e 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -26,8 +26,8 @@ pip install plutoprint ## Python Script Pattern ```python -# Import from the skill's pdf_generator.py -from pdf_generator import generate_and_upload_pdf +# Import from scripts/pdf_generator.py +from scripts.pdf_generator import generate_and_upload_pdf result = generate_and_upload_pdf( html_content=html_content, # Complete HTML document @@ -72,7 +72,7 @@ python scripts/pdf_generator.py generate --html '...' 
--filename test.pdf \ Generate a simple PDF: ```python -from pdf_generator import generate_and_upload_pdf +from scripts.pdf_generator import generate_and_upload_pdf generate_and_upload_pdf( html_content=''' @@ -106,7 +106,7 @@ generate_and_upload_pdf( ```python import concurrent.futures -from pdf_generator import generate_and_upload_pdf +from scripts.pdf_generator import generate_and_upload_pdf pdfs_to_generate = [ {"html_content": "...Employee Handbook content...", "filename": "employee_handbook.pdf"}, @@ -250,7 +250,7 @@ PlutoPrint supports modern CSS3: Generate API documentation, user guides, or technical specs: ```python -from pdf_generator import generate_and_upload_pdf +from scripts.pdf_generator import generate_and_upload_pdf generate_and_upload_pdf( html_content=''' @@ -281,7 +281,7 @@ Content-Type: application/json ### Pattern 2: Business Reports ```python -from pdf_generator import generate_and_upload_pdf +from scripts.pdf_generator import generate_and_upload_pdf generate_and_upload_pdf( html_content=''' @@ -314,7 +314,7 @@ generate_and_upload_pdf( ### Pattern 3: HR Policies ```python -from pdf_generator import generate_and_upload_pdf +from scripts.pdf_generator import generate_and_upload_pdf generate_and_upload_pdf( html_content=''' From ad17c2e8ad06d4b865527aa4c6a8dcb4fc92391b Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 13 Apr 2026 16:47:25 +0200 Subject: [PATCH 23/66] Simplify PDF skill to use CLI pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove Python import patterns (not usable by agent) - Focus on CLI: write HTML to temp file, run script - Remove redundant sections and patterns πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../SKILL.md | 236 +++--------------- 1 file changed, 30 insertions(+), 206 deletions(-) diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index f146c02e..1788d73e 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -9,132 +9,66 @@ Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. ## Overview -Generate PDFs from HTML using the self-contained `scripts/pdf_generator.py` script. You (the LLM) generate the HTML content, and the Python script handles conversion and upload using the Databricks CLI. +Generate PDFs from HTML using `scripts/pdf_generator.py`. You generate the HTML content, write it to a temp file, then run the script to convert and upload. 
## Installation -Install plutoprint for HTML to PDF conversion: - ```bash -# Preferred: use uv for faster installation uv pip install plutoprint - -# Fallback: use pip if uv is not available -pip install plutoprint ``` -## Python Script Pattern - -```python -# Import from scripts/pdf_generator.py -from scripts.pdf_generator import generate_and_upload_pdf - -result = generate_and_upload_pdf( - html_content=html_content, # Complete HTML document - filename="report.pdf", # PDF filename - catalog="my_catalog", # Unity Catalog name - schema="my_schema", # Schema name - volume="raw_data", # Volume name (default: "raw_data") - folder=None, # Optional subfolder -) -print(f"Uploaded to: {result.volume_path}") -``` +## Usage -**Returns:** -```json -{ - "success": true, - "volume_path": "/Volumes/catalog/schema/volume/filename.pdf", - "error": null -} -``` - -## CLI Usage - -The script can also be run directly from command line: +Run the script via CLI: ```bash -# Generate from inline HTML +# Generate from HTML file (recommended - avoids shell escaping issues) +python scripts/pdf_generator.py generate --html-file ./raw_pdf/content.html \ + --filename report.pdf --catalog my_catalog --schema my_schema + +# Generate from inline HTML (for simple content) python scripts/pdf_generator.py generate --html '

<html><body><h1>Hello</h1></body></html>
' \ --filename hello.pdf --catalog my_catalog --schema my_schema -# Generate from HTML file -python scripts/pdf_generator.py generate --html-file input.html \ - --filename report.pdf --catalog my_catalog --schema my_schema --folder reports - -# Get JSON output -python scripts/pdf_generator.py generate --html '...' --filename test.pdf \ - --catalog my_catalog --schema my_schema --json +# With subfolder and JSON output +python scripts/pdf_generator.py generate --html-file ./raw_pdf/content.html \ + --filename report.pdf --catalog my_catalog --schema my_schema --folder reports --json ``` -## Quick Start +**Workflow:** +1. Write HTML content to a local file (e.g., `./raw_pdf/content.html`) +2. Run the script with `--html-file` +3. Script converts to PDF and uploads to Unity Catalog volume -Generate a simple PDF: +## Quick Start -```python -from scripts.pdf_generator import generate_and_upload_pdf +1. Write HTML to a local file +2. Run the script -generate_and_upload_pdf( - html_content=''' +```bash +# Create folder and write HTML content +mkdir -p ./raw_pdf +cat > ./raw_pdf/report.html << 'EOF' +

 <html>
 <body>
 <h1>Quarterly Report Q1 2024</h1>
-<h2>Executive Summary</h2>
-<p>Revenue increased 15% year-over-year...</p>
+<p>Revenue increased 15% year-over-year...</p>
 </body>
 </html>

-''', - filename="q1_report.pdf", - catalog="my_catalog", - schema="my_schema" -) -``` - -## Performance: Generate Multiple PDFs in Parallel - -**IMPORTANT**: PDF generation and upload can take 2-5 seconds per document. When generating multiple PDFs, use concurrent execution to maximize throughput. - -### Example: Generate 5 PDFs in Parallel - -```python -import concurrent.futures -from scripts.pdf_generator import generate_and_upload_pdf - -pdfs_to_generate = [ - {"html_content": "...Employee Handbook content...", "filename": "employee_handbook.pdf"}, - {"html_content": "...Leave Policy content...", "filename": "leave_policy.pdf"}, - {"html_content": "...Code of Conduct content...", "filename": "code_of_conduct.pdf"}, - {"html_content": "...Benefits Guide content...", "filename": "benefits_guide.pdf"}, - {"html_content": "...Remote Work Policy content...", "filename": "remote_work_policy.pdf"}, -] - -def generate_pdf(pdf_config): - return generate_and_upload_pdf( - html_content=pdf_config["html_content"], - filename=pdf_config["filename"], - catalog="hr_catalog", - schema="policies", - folder="2024" - ) - -# Generate in parallel -with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: - results = list(executor.map(generate_pdf, pdfs_to_generate)) + +EOF -for result in results: - print(f"Uploaded: {result.volume_path}") +# Generate and upload PDF +python scripts/pdf_generator.py generate --html-file ./raw_pdf/report.html \ + --filename q1_report.pdf --catalog my_catalog --schema my_schema ``` -By calling these in parallel (not sequentially), 5 PDFs that would take 15-25 seconds sequentially complete in 3-5 seconds total. - ## HTML Best Practices ### Use Complete HTML5 Structure @@ -243,116 +177,6 @@ PlutoPrint supports modern CSS3: ``` -## Common Patterns - -### Pattern 1: Technical Documentation - -Generate API documentation, user guides, or technical specs: - -```python -from scripts.pdf_generator import generate_and_upload_pdf - -generate_and_upload_pdf( - html_content=''' - - - -

-    <html>
-    <body>
-    <h1>API Reference</h1>
-    <h2>GET /api/v1/users</h2>
-    <p>Returns a list of all users.</p>
-    <h3>Request Headers</h3>
-    <pre>Authorization: Bearer {token}
-Content-Type: application/json</pre>
- -''', - filename="api_reference.pdf", - catalog="docs_catalog", - schema="api_docs" -) -``` - -### Pattern 2: Business Reports - -```python -from scripts.pdf_generator import generate_and_upload_pdf - -generate_and_upload_pdf( - html_content=''' - - - -

-    <html>
-    <body>
-    <h1>Q1 2024 Performance Report</h1>
-    <div class="metric">
-        <div class="value">$2.4M</div>
-        <div class="label">Revenue</div>
-    </div>
-    <div class="metric">
-        <div class="value">+15%</div>
-        <div class="label">Growth</div>
-    </div>
- -''', - filename="q1_2024_report.pdf", - catalog="finance", - schema="reports", - folder="quarterly" -) -``` - -### Pattern 3: HR Policies - -```python -from scripts.pdf_generator import generate_and_upload_pdf - -generate_and_upload_pdf( - html_content=''' - - - -

-    <html>
-    <body>
-    <h1>Employee Leave Policy</h1>
-    <p>Effective: January 1, 2024</p>
-    <h2>1. Annual Leave</h2>
-    <p>All full-time employees are entitled to 20 days of paid annual leave per calendar year.</p>
-    <div class="note">Note: Leave requests must be submitted at least 2 weeks in advance.</div>
- -''', - filename="leave_policy.pdf", - catalog="hr_catalog", - schema="policies" -) -``` - -## Workflow for Multiple Documents - -When asked to generate multiple PDFs: - -1. **Plan the documents**: Determine titles, content structure for each -2. **Generate HTML for each**: Create complete HTML documents -3. **Call tool in parallel**: Make multiple simultaneous `generate_and_upload_pdf` calls -4. **Report results**: Summarize successful uploads and any errors - ## Prerequisites - Unity Catalog schema must exist From ebef603d5dafee2e5f39b524290efc98c0dfb2c0 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 11:28:30 +0200 Subject: [PATCH 24/66] Document Genie serialized_space format requirements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add field format requirements: all items need unique 32-char hex UUID id - Document that question/sql/content fields must be arrays of strings - Add example showing correct format - Add trash-space command for deleting spaces πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-genie/SKILL.md | 21 ++-- databricks-skills/databricks-genie/spaces.md | 112 +++++++++++++------ 2 files changed, 89 insertions(+), 44 deletions(-) diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index 8f3e9975..f255d437 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -33,19 +33,20 @@ Use this skill when: # List all Genie Spaces databricks genie list-spaces -# Create a Genie Space +# Create a Genie Space (requires warehouse_id and serialized_space) databricks genie create-space --json '{ - "display_name": "Sales Analytics", + "warehouse_id": "WAREHOUSE_ID", + "title": "Sales Analytics", "description": "Explore sales data with natural language", - "table_identifiers": ["catalog.schema.customers", "catalog.schema.orders"] + "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.customers\"}, {\"identifier\": \"catalog.schema.orders\"}]}}" }' -# Get space details -databricks genie get-space SPACE_ID +# Get space details (with serialized config) +databricks genie get-space SPACE_ID --include-serialized-space # Update a Genie Space databricks genie update-space SPACE_ID --json '{ - "display_name": "Updated Name", + "title": "Updated Name", "description": "Updated description" }' @@ -112,12 +113,10 @@ databricks experimental aitools tools discover-schema my_catalog.sales.customers ```bash databricks genie create-space --json '{ - "display_name": "Sales Analytics", + "warehouse_id": "YOUR_WAREHOUSE_ID", + "title": "Sales Analytics", "description": "Explore sales data with natural language", - "table_identifiers": [ - "my_catalog.sales.customers", - "my_catalog.sales.orders" - ] + "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"my_catalog.sales.customers\"}, {\"identifier\": \"my_catalog.sales.orders\"}]}}" }' ``` diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md index af6ea640..4e08902e 100644 --- a/databricks-skills/databricks-genie/spaces.md +++ b/databricks-skills/databricks-genie/spaces.md @@ -38,38 +38,34 @@ Based on the schema information: ### Step 3: Create the Genie Space -Create the space with content tailored to the actual data: +Create the space using `warehouse_id` and 
`serialized_space`: ```bash databricks genie create-space --json '{ - "display_name": "Sales Analytics", - "description": "Explore retail sales data with three related tables:\n- customers: Customer demographics including region, segment, and signup date\n- orders: Transaction history with order_date, total_amount, and status\n- products: Product catalog with category, price, and inventory\n\nTables join on customer_id and product_id.", - "table_identifiers": [ - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products" - ] + "warehouse_id": "YOUR_WAREHOUSE_ID", + "title": "Sales Analytics", + "description": "Explore retail sales data", + "parent_path": "/Workspace/Users/your.email@company.com/genie_spaces", + "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"my_catalog.sales.customers\"}, {\"identifier\": \"my_catalog.sales.orders\"}, {\"identifier\": \"my_catalog.sales.products\"}]}}" }' ``` -Sample questions can be added via the Databricks UI after creation: -- "What were total sales last month?" -- "Who are our top 10 customers by total_amount?" -- "How many orders were placed in Q4 by region?" -- "What's the average order value by customer segment?" -- "Which product categories have the highest revenue?" -- "Show me customers who haven't ordered in 90 days" +**Required parameters:** +- `warehouse_id` β€” SQL warehouse to use +- `serialized_space` β€” JSON string with version 2 format (see Field Format Requirements below) + +**Optional parameters:** +- `title` β€” Display name +- `description` β€” Space description +- `parent_path` β€” Workspace folder path + +Sample questions can be added via the Databricks UI after creation, or included in `serialized_space`. Questions should reference actual column names and reflect realistic business queries for the data (e.g., "What were total sales last month?" for a sales table with `order_date` and `amount` columns). ## Why This Workflow Matters -**Sample questions that reference actual column names** help Genie: -- Learn the vocabulary of your data -- Generate more accurate SQL queries -- Provide better autocomplete suggestions +**Sample questions that reference actual column names** help Genie learn the vocabulary of your data and generate more accurate SQL. -**A description that explains table relationships** helps Genie: -- Understand how to join tables correctly -- Know which table contains which information +**A description that explains table relationships** helps Genie understand how to join tables correctly. - Provide more relevant answers ## Auto-Detection of Warehouse @@ -142,6 +138,16 @@ Write sample questions that: - Demonstrate the data's capabilities - Use natural language (not SQL terms) +## Deleting a Genie Space + +Use `databricks genie trash-space` to delete a space: + +```bash +databricks genie trash-space SPACE_ID +``` + +> **Note:** The command is `trash-space`, not `delete-space`. + ## Updating a Genie Space Use `databricks genie update-space` to update an existing space by ID. @@ -149,15 +155,10 @@ Use `databricks genie update-space` to update an existing space by ID. 
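A recurring detail in the create-space and update-space calls above is that `serialized_space` must be passed as a JSON-encoded string inside the `--json` payload. A minimal sketch of one way to build that payload without hand-escaping quotes, assuming `python3` is available; the warehouse ID, title, and table identifier are placeholders:

```bash
# Build the payload with Python's json module so the inner config is escaped correctly,
# then pass it to the CLI with the @file syntax shown elsewhere in this patch.
python3 - <<'EOF' > payload.json
import json

space = {
    "version": 2,
    "data_sources": {"tables": [{"identifier": "my_catalog.sales.customers"}]},
}
payload = {
    "warehouse_id": "YOUR_WAREHOUSE_ID",
    "title": "Sales Analytics",
    "serialized_space": json.dumps(space),  # nested dump -> properly escaped string value
}
print(json.dumps(payload))
EOF

databricks genie create-space --json @payload.json
```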
### Simple field updates ```bash -# Update display name and description +# Update title and description databricks genie update-space SPACE_ID --json '{ - "display_name": "Sales Analytics", - "description": "Updated description.", - "table_identifiers": [ - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products" - ] + "title": "Sales Analytics", + "description": "Updated description." }' ``` @@ -173,7 +174,7 @@ databricks genie export-space SOURCE_SPACE_ID > config.json databricks genie update-space TARGET_SPACE_ID --json @updated_config.json ``` -> **Note:** When using serialized_space, the full config comes from the serialized payload. Top-level overrides (display_name, warehouse_id, description) can still be applied. +> **Note:** When using serialized_space, the full config comes from the serialized payload. Top-level overrides (title, warehouse_id, description) can still be applied. ## Export, Import & Migration @@ -208,6 +209,50 @@ Minimum structure: {"version": 2, "data_sources": {"tables": [{"identifier": "catalog.schema.table"}]}} ``` +### Field Format Requirements + +**IMPORTANT:** All items in `sample_questions`, `example_question_sqls`, and `text_instructions` require a unique `id` field. + +| Field | ID Required | Format | +|-------|-------------|--------| +| `config.sample_questions[]` | **YES** | `{"id": "32hexchars", "question": ["..."]}` | +| `instructions.example_question_sqls[]` | **YES** | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | +| `instructions.text_instructions[]` | **YES** | `{"id": "32hexchars", "content": ["..."]}` | + +**ID format:** 32-character lowercase hex UUID without hyphens. Generate with `uuid.uuid4().hex` in Python. + +**Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. Multi-line content is split into array elements. + +Example: +```json +{ + "version": 2, + "config": { + "sample_questions": [ + {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What were total sales last month?"]} + ] + }, + "data_sources": { + "tables": [{"identifier": "catalog.schema.customers"}] + }, + "instructions": { + "example_question_sqls": [ + { + "id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5", + "question": ["Show top customers"], + "sql": ["SELECT customer_name, SUM(amount) AS total\n", "FROM catalog.schema.orders\n", "GROUP BY 1 ORDER BY 2 DESC"] + } + ], + "text_instructions": [ + { + "id": "c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6", + "content": ["Use gold tables for KPI queries.\n", "Use silver tables for detailed drill-downs."] + } + ] + } +} +``` + ### Exporting a Space Use `databricks genie export-space` to export the full configuration (requires CAN EDIT permission): @@ -307,8 +352,9 @@ To push a serialized config to an already-existing space (rather than creating a 4. 
**Create the Genie Space**: ```bash databricks genie create-space --json '{ - "display_name": "My Data Explorer", - "table_identifiers": ["catalog.schema.silver_customers", "catalog.schema.silver_orders"] + "warehouse_id": "YOUR_WAREHOUSE_ID", + "title": "My Data Explorer", + "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.silver_customers\"}, {\"identifier\": \"catalog.schema.silver_orders\"}]}}" }' ``` From f7043cd54f954fc81b9872e0b29bde22344b7938 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 11:34:09 +0200 Subject: [PATCH 25/66] Consolidate Genie skill: merge spaces.md into SKILL.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Document correct serialized_space format with ID requirements - All items require 32-char hex UUID id field (uuid.uuid4().hex) - Text fields (question, sql, content) must be arrays, not strings - Fix CLI syntax: use title (not display_name), serialized_space (not table_identifiers) - Add trash-space command documentation - Remove redundant spaces.md file πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/databricks-genie/SKILL.md | 205 +++++----- databricks-skills/databricks-genie/spaces.md | 406 ------------------- 2 files changed, 101 insertions(+), 510 deletions(-) delete mode 100644 databricks-skills/databricks-genie/spaces.md diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index f255d437..a56cfd02 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -5,26 +5,12 @@ description: "Create and query Databricks Genie Spaces for natural language SQL # Databricks Genie -Create, manage, and query Databricks Genie Spaces - natural language interfaces for SQL-based data exploration. +Create, manage, and query Genie Spaces - natural language interfaces for SQL-based data exploration. ## Overview Genie Spaces allow users to ask natural language questions about structured data in Unity Catalog. The system translates questions into SQL queries, executes them on a SQL warehouse, and presents results conversationally. 
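Both this commit and the previous one stress that every `sample_questions`, `example_question_sqls`, and `text_instructions` entry needs a 32-character lowercase hex `id` and array-valued text fields. A quick way to mint a conforming entry from the shell, assuming `python3` is on the PATH (the question text is just an example):

```bash
# Generate a valid 32-char hex id (uuid4 without hyphens) and wrap the question text in an array,
# matching the serialized_space v2 requirements documented in these patches.
ID=$(python3 -c "import uuid; print(uuid.uuid4().hex)")
printf '{"id": "%s", "question": ["What were total sales last month?"]}\n' "$ID"
```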
-## When to Use This Skill - -Use this skill when: -- Creating a new Genie Space for data exploration -- Adding sample questions to guide users -- Connecting Unity Catalog tables to a conversational interface -- Asking questions to a Genie Space programmatically (Conversation API) -- Exporting a Genie Space configuration (serialized_space) for backup or migration -- Importing / cloning a Genie Space from a serialized payload -- Migrating a Genie Space between workspaces or environments (dev β†’ staging β†’ prod) - - Only supports catalog remapping where catalog names differ across environments - - Not supported for schema and/or table names that differ across environments - - Not including migration of tables between environments (only migration of Genie Spaces) - ## CLI Commands ### Space Management @@ -33,15 +19,16 @@ Use this skill when: # List all Genie Spaces databricks genie list-spaces -# Create a Genie Space (requires warehouse_id and serialized_space) +# Create a Genie Space databricks genie create-space --json '{ "warehouse_id": "WAREHOUSE_ID", "title": "Sales Analytics", - "description": "Explore sales data with natural language", - "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.customers\"}, {\"identifier\": \"catalog.schema.orders\"}]}}" + "description": "Explore sales data", + "parent_path": "/Workspace/Users/you@company.com/genie_spaces", + "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.table\"}]}}" }' -# Get space details (with serialized config) +# Get space details (with full config) databricks genie get-space SPACE_ID --include-serialized-space # Update a Genie Space @@ -50,129 +37,139 @@ databricks genie update-space SPACE_ID --json '{ "description": "Updated description" }' -# Delete (trash) a Genie Space +# Delete a Genie Space databricks genie trash-space SPACE_ID ``` -### Export & Import (Migration) +### Export & Import ```bash -# Export space configuration (returns JSON with serialized_space) -databricks genie export-space SPACE_ID +# Export space configuration +databricks genie export-space SPACE_ID > exported.json # Import space from exported config -databricks genie import-space --json '{ - "warehouse_id": "WAREHOUSE_ID", - "serialized_space": "...", - "title": "Sales Analytics (Prod)" -}' -``` - -### Conversation API (Query) - -Use the `scripts/conversation.py` script in this skill folder to ask questions: - -```bash -# Ask a question to a Genie Space -python scripts/conversation.py ask SPACE_ID "What were total sales last month?" -# Returns: {question, conversation_id, message_id, status, sql, columns, data, row_count} - -# Follow-up question in same conversation -python scripts/conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID - -# With custom timeout (default: 60 seconds) -python scripts/conversation.py ask SPACE_ID "Complex analysis query" --timeout 120 +databricks genie import-space --json @exported.json ``` ### Table Inspection ```bash # Inspect table schemas before creating a space -databricks unity-catalog tables get CATALOG.SCHEMA.TABLE - -# Or use the discover-schema tool for multiple tables databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 ``` -## Quick Start - -### 1. 
Inspect Your Tables - -Before creating a Genie Space, understand your data: - -```bash -# Get table details -databricks unity-catalog tables get my_catalog.sales.customers -databricks unity-catalog tables get my_catalog.sales.orders - -# Or use discover-schema for multiple tables -databricks experimental aitools tools discover-schema my_catalog.sales.customers my_catalog.sales.orders +## serialized_space Format + +The `serialized_space` field is a JSON string containing the full space configuration. + +### Structure + +```json +{ + "version": 2, + "config": { + "sample_questions": [...] + }, + "data_sources": { + "tables": [{"identifier": "catalog.schema.table"}] + }, + "instructions": { + "example_question_sqls": [...], + "text_instructions": [...] + } +} ``` -### 2. Create the Genie Space - -```bash -databricks genie create-space --json '{ - "warehouse_id": "YOUR_WAREHOUSE_ID", - "title": "Sales Analytics", - "description": "Explore sales data with natural language", - "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"my_catalog.sales.customers\"}, {\"identifier\": \"my_catalog.sales.orders\"}]}}" -}' +### Field Format Requirements + +**IMPORTANT:** All items in `sample_questions`, `example_question_sqls`, and `text_instructions` require a unique `id` field. + +| Field | Format | +|-------|--------| +| `config.sample_questions[]` | `{"id": "32hexchars", "question": ["..."]}` | +| `instructions.example_question_sqls[]` | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | +| `instructions.text_instructions[]` | `{"id": "32hexchars", "content": ["..."]}` | + +- **ID format:** 32-character lowercase hex UUID without hyphens. Generate with `uuid.uuid4().hex` in Python. +- **Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. + +### Example + +```json +{ + "version": 2, + "config": { + "sample_questions": [ + {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What were total sales last month?"]} + ] + }, + "data_sources": { + "tables": [{"identifier": "catalog.schema.orders"}] + }, + "instructions": { + "example_question_sqls": [ + { + "id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5", + "question": ["Show top customers"], + "sql": ["SELECT customer_name, SUM(amount) AS total ", "FROM catalog.schema.orders ", "GROUP BY 1 ORDER BY 2 DESC"] + } + ] + } +} ``` -### 3. Ask Questions (Conversation API) +## Migration Workflow + +### Clone (Same Workspace) ```bash -python scripts/conversation.py ask YOUR_SPACE_ID "What were total sales last month?" -# Returns: SQL, columns, data, row_count +databricks genie export-space SOURCE_ID > space.json +databricks genie import-space --json @space.json ``` -### 4. Export & Import (Clone / Migrate) +### Cross-Workspace with Catalog Remapping -Export a space (preserves all tables, instructions, SQL examples, and layout): +When migrating between environments (dev β†’ prod), catalog names often differ. Remap them: ```bash -databricks genie export-space YOUR_SPACE_ID > exported_space.json -# exported_space.json contains serialized_space with full config -``` +# 1. Export from source workspace +DATABRICKS_CONFIG_PROFILE=source databricks genie export-space SPACE_ID > exported.json -Clone to a new space (same catalog): +# 2. 
Remap catalog name +sed -i '' 's/source_catalog/target_catalog/g' exported.json -```bash -# Extract and import -databricks genie import-space --json '{ - "warehouse_id": "WAREHOUSE_ID", - "serialized_space": "...", - "title": "Sales Analytics (Clone)" -}' +# 3. Import to target workspace +DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @exported.json ``` -> **Cross-workspace migration:** Use different Databricks CLI profiles for source and target workspaces. Export from source profile, remap catalog names in `serialized_space`, then import via target profile. See [spaces.md Β§Migration](spaces.md#migrating-across-workspaces-with-catalog-remapping) for the full workflow. +## Conversation API -## Reference Files +Use `scripts/conversation.py` to ask questions programmatically: -- [spaces.md](spaces.md) - Creating and managing Genie Spaces -- [conversation.md](conversation.md) - Asking questions via the Conversation API - -## Prerequisites +```bash +# Ask a question +python scripts/conversation.py ask SPACE_ID "What were total sales last month?" -Before creating a Genie Space: +# Follow-up in same conversation +python scripts/conversation.py ask SPACE_ID "Break down by region" --conversation-id CONV_ID -1. **Tables in Unity Catalog** - Bronze/silver/gold tables with the data -2. **SQL Warehouse** - A warehouse to execute queries (auto-detected if not specified) +# With timeout +python scripts/conversation.py ask SPACE_ID "Complex query" --timeout 120 +``` -### Creating Tables +See [conversation.md](conversation.md) for full details. -Use these skills in sequence: -1. `databricks-synthetic-data-gen` - Generate raw parquet files -2. `databricks-spark-declarative-pipelines` - Create bronze/silver/gold tables +## Troubleshooting -## Common Issues +| Issue | Solution | +|-------|----------| +| `sample_question.id must be provided` | Add 32-char hex UUID `id` to each sample question | +| `Expected an array for question` | Use `"question": ["text"]` not `"question": "text"` | +| No warehouse available | Create a SQL warehouse or provide `warehouse_id` | +| Empty `serialized_space` on export | Requires CAN EDIT permission on the space | +| Tables not found after migration | Remap catalog name in `serialized_space` before import | -See [spaces.md Β§Troubleshooting](spaces.md#troubleshooting) for a full list of issues and solutions. ## Related Skills -- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Use Genie Spaces as agents inside Supervisor Agents -- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate raw parquet data to populate tables for Genie -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces -- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Manage the catalogs, schemas, and tables Genie queries +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate data for Genie tables +- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md deleted file mode 100644 index 4e08902e..00000000 --- a/databricks-skills/databricks-genie/spaces.md +++ /dev/null @@ -1,406 +0,0 @@ -# Creating Genie Spaces - -This guide covers creating and managing Genie Spaces for SQL-based data exploration. 
- -## What is a Genie Space? - -A Genie Space connects to Unity Catalog tables and translates natural language questions into SQL β€” understanding schemas, generating queries, executing them on a SQL warehouse, and presenting results conversationally. - -## Creation Workflow - -### Step 1: Inspect Table Schemas (Required) - -**Before creating a Genie Space, you MUST inspect the table schemas** to understand what data is available: - -```bash -# Get table details -databricks unity-catalog tables get my_catalog.sales.customers -databricks unity-catalog tables get my_catalog.sales.orders - -# Or use discover-schema for multiple tables with statistics -databricks experimental aitools tools discover-schema my_catalog.sales.customers my_catalog.sales.orders -``` - -This returns: -- Table names and row counts -- Column names and data types -- Sample values and cardinality -- Null counts and statistics - -### Step 2: Analyze and Plan - -Based on the schema information: - -1. **Select relevant tables** - Choose tables that support the user's use case -2. **Identify key columns** - Note date columns, metrics, dimensions, and foreign keys -3. **Understand relationships** - How do tables join together? -4. **Plan sample questions** - What questions can this data answer? - -### Step 3: Create the Genie Space - -Create the space using `warehouse_id` and `serialized_space`: - -```bash -databricks genie create-space --json '{ - "warehouse_id": "YOUR_WAREHOUSE_ID", - "title": "Sales Analytics", - "description": "Explore retail sales data", - "parent_path": "/Workspace/Users/your.email@company.com/genie_spaces", - "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"my_catalog.sales.customers\"}, {\"identifier\": \"my_catalog.sales.orders\"}, {\"identifier\": \"my_catalog.sales.products\"}]}}" -}' -``` - -**Required parameters:** -- `warehouse_id` β€” SQL warehouse to use -- `serialized_space` β€” JSON string with version 2 format (see Field Format Requirements below) - -**Optional parameters:** -- `title` β€” Display name -- `description` β€” Space description -- `parent_path` β€” Workspace folder path - -Sample questions can be added via the Databricks UI after creation, or included in `serialized_space`. Questions should reference actual column names and reflect realistic business queries for the data (e.g., "What were total sales last month?" for a sales table with `order_date` and `amount` columns). - -## Why This Workflow Matters - -**Sample questions that reference actual column names** help Genie learn the vocabulary of your data and generate more accurate SQL. - -**A description that explains table relationships** helps Genie understand how to join tables correctly. -- Provide more relevant answers - -## Auto-Detection of Warehouse - -When `warehouse_id` is not specified, the tool: - -1. Lists all SQL warehouses in the workspace -2. Prioritizes by: - - **Running** warehouses first (already available) - - **Starting** warehouses second - - **Smaller sizes** preferred (cost-efficient) -3. Returns an error if no warehouses exist - -To use a specific warehouse, provide the `warehouse_id` explicitly. 
- -## Table Selection - -Choose tables carefully for best results: - -| Layer | Recommended | Why | -|-------|-------------|-----| -| Bronze | No | Raw data, may have quality issues | -| Silver | Yes | Cleaned and validated | -| Gold | Yes | Aggregated, optimized for analytics | - -### Tips for Table Selection - -- **Include related tables**: If users ask about customers and orders, include both -- **Use descriptive column names**: `customer_name` is better than `cust_nm` -- **Add table comments**: Genie uses metadata to understand the data - -## Sample Questions - -Sample questions help users understand what they can ask: - -**Good sample questions:** -- "What were total sales last month?" -- "Who are our top 10 customers by revenue?" -- "How many orders were placed in Q4?" -- "What's the average order value by region?" - -These appear in the Genie UI to guide users. - -## Best Practices - -### Table Design for Genie - -1. **Descriptive names**: Use `customer_lifetime_value` not `clv` -2. **Add comments**: `COMMENT ON TABLE sales.customers IS 'Customer master data'` -3. **Primary keys**: Define relationships clearly -4. **Date columns**: Include proper date/timestamp columns for time-based queries - -### Description and Context - -Provide context in the description: - -``` -Explore retail sales data from our e-commerce platform. Includes: -- Customers: demographics, segments, and account status -- Orders: transaction history with amounts and dates -- Products: catalog with categories and pricing - -Time range: Last 6 months of data -``` - -### Sample Questions - -Write sample questions that: -- Cover common use cases -- Demonstrate the data's capabilities -- Use natural language (not SQL terms) - -## Deleting a Genie Space - -Use `databricks genie trash-space` to delete a space: - -```bash -databricks genie trash-space SPACE_ID -``` - -> **Note:** The command is `trash-space`, not `delete-space`. - -## Updating a Genie Space - -Use `databricks genie update-space` to update an existing space by ID. - -### Simple field updates - -```bash -# Update title and description -databricks genie update-space SPACE_ID --json '{ - "title": "Sales Analytics", - "description": "Updated description." -}' -``` - -### Full config update via serialized_space - -To push a complete serialized configuration to an existing space (preserves all instructions, SQL examples, join specs, etc.): - -```bash -# First export the current config -databricks genie export-space SOURCE_SPACE_ID > config.json - -# Modify the serialized_space as needed, then update -databricks genie update-space TARGET_SPACE_ID --json @updated_config.json -``` - -> **Note:** When using serialized_space, the full config comes from the serialized payload. Top-level overrides (title, warehouse_id, description) can still be applied. - -## Export, Import & Migration - -`databricks genie export-space SPACE_ID` returns a JSON object with these top-level keys: - -| Key | Description | -|-----|-------------| -| `space_id` | ID of the exported space | -| `title` | Display name of the space | -| `description` | Description of the space | -| `warehouse_id` | SQL warehouse associated with the space (workspace-specific β€” do **not** reuse across workspaces) | -| `serialized_space` | JSON-encoded string with the full space configuration (see below) | - -This envelope enables cloning, backup, and cross-workspace migration. - -### What is `serialized_space`? - -`serialized_space` is a JSON string (version 2) embedded inside the export envelope. 
Its top-level keys are: - -| Key | Contents | -|-----|----------| -| `version` | Schema version (currently `2`) | -| `config` | Space-level config: `sample_questions` shown in the UI | -| `data_sources` | `tables` array β€” each entry has a fully-qualified `identifier` (`catalog.schema.table`) and optional `column_configs` (format assistance, entity matching per column) | -| `instructions` | `example_question_sqls` (certified Q&A pairs), `join_specs` (join relationships between tables), `sql_snippets` (`filters` and `measures` with display names and usage instructions) | -| `benchmarks` | Evaluation Q&A pairs used to measure space quality | - -Catalog names appear **everywhere** inside `serialized_space` β€” in `data_sources.tables[].identifier`, SQL strings in `example_question_sqls`, `join_specs`, and `sql_snippets`. A single `.replace(src_catalog, tgt_catalog)` on the whole string is sufficient for catalog remapping. - -Minimum structure: -```json -{"version": 2, "data_sources": {"tables": [{"identifier": "catalog.schema.table"}]}} -``` - -### Field Format Requirements - -**IMPORTANT:** All items in `sample_questions`, `example_question_sqls`, and `text_instructions` require a unique `id` field. - -| Field | ID Required | Format | -|-------|-------------|--------| -| `config.sample_questions[]` | **YES** | `{"id": "32hexchars", "question": ["..."]}` | -| `instructions.example_question_sqls[]` | **YES** | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | -| `instructions.text_instructions[]` | **YES** | `{"id": "32hexchars", "content": ["..."]}` | - -**ID format:** 32-character lowercase hex UUID without hyphens. Generate with `uuid.uuid4().hex` in Python. - -**Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. Multi-line content is split into array elements. - -Example: -```json -{ - "version": 2, - "config": { - "sample_questions": [ - {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What were total sales last month?"]} - ] - }, - "data_sources": { - "tables": [{"identifier": "catalog.schema.customers"}] - }, - "instructions": { - "example_question_sqls": [ - { - "id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5", - "question": ["Show top customers"], - "sql": ["SELECT customer_name, SUM(amount) AS total\n", "FROM catalog.schema.orders\n", "GROUP BY 1 ORDER BY 2 DESC"] - } - ], - "text_instructions": [ - { - "id": "c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6", - "content": ["Use gold tables for KPI queries.\n", "Use silver tables for detailed drill-downs."] - } - ] - } -} -``` - -### Exporting a Space - -Use `databricks genie export-space` to export the full configuration (requires CAN EDIT permission): - -```bash -databricks genie export-space 01abc123... > exported_space.json -# Returns: -# { -# "space_id": "01abc123...", -# "title": "Sales Analytics", -# "description": "Explore sales data...", -# "warehouse_id": "abc123def456", -# "serialized_space": "{\"version\":2,\"data_sources\":{...},\"instructions\":{...}}" -# } -``` - -### Cloning a Space (Same Workspace) - -```bash -# Step 1: Export the source space -databricks genie export-space 01abc123... > source.json - -# Step 2: Import as a new space (modify title in JSON if needed) -databricks genie import-space --json @source.json -# Returns: {"space_id": "01def456...", "title": "Sales Analytics", "operation": "imported"} -``` - -### Migrating Across Workspaces with Catalog Remapping - -When migrating between environments (e.g. prod β†’ dev), Unity Catalog names are often different. 
The `serialized_space` string contains the source catalog name **everywhere** β€” in table identifiers, SQL queries, join specs, and filter snippets. You must remap it before importing. - -**Workflow (3 steps):** - -**Step 1 β€” Export from source workspace:** -```bash -# Use source workspace profile -DATABRICKS_CONFIG_PROFILE=source databricks genie export-space 01f106e1239d14b28d6ab46f9c15e540 > exported.json -``` - -**Step 2 β€” Remap catalog name in `serialized_space`:** - -Use sed or a script to replace catalog names: -```bash -# Replace source catalog with target catalog in the serialized_space -sed -i '' 's/source_catalog_name/target_catalog_name/g' exported.json -``` -This replaces all occurrences β€” table identifiers, SQL FROM clauses, join specs, and filter snippets. - -**Step 3 β€” Import to target workspace:** -```bash -# Use target workspace profile -DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @exported.json -``` - -### Batch Migration of Multiple Spaces - -To migrate several spaces at once, use a shell loop: - -```bash -for space_id in id1 id2 id3; do - # Export - DATABRICKS_CONFIG_PROFILE=source databricks genie export-space $space_id > ${space_id}.json - # Remap catalog - sed -i '' 's/src_catalog/tgt_catalog/g' ${space_id}.json - # Import - DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @${space_id}.json -done -``` - -After migration, update `databricks.yml` with the new dev `space_id` values under the `dev` target's `genie_space_ids` variable. - -### Updating an Existing Space with New Config - -To push a serialized config to an already-existing space (rather than creating a new one), use `databricks genie update-space` with the serialized config. The export β†’ remap β†’ push pattern is identical to the migration steps above; just replace `import-space` with `update-space TARGET_SPACE_ID` as the final call. - -### Permissions Required - -| Operation | Required Permission | -|-----------|-------------------| -| `databricks genie export-space` | CAN EDIT on source space | -| `databricks genie import-space` | Can create items in target workspace folder | -| `databricks genie update-space` with serialized_space | CAN EDIT on target space | - -## Example End-to-End Workflow - -1. **Generate synthetic data** using `databricks-synthetic-data-gen` skill: - - Creates parquet files in `/Volumes/catalog/schema/raw_data/` - -2. **Create tables** using `databricks-spark-declarative-pipelines` skill: - - Creates `catalog.schema.bronze_*` β†’ `catalog.schema.silver_*` β†’ `catalog.schema.gold_*` - -3. **Inspect the tables**: - ```bash - databricks experimental aitools tools discover-schema catalog.schema.silver_customers catalog.schema.silver_orders - ``` - -4. **Create the Genie Space**: - ```bash - databricks genie create-space --json '{ - "warehouse_id": "YOUR_WAREHOUSE_ID", - "title": "My Data Explorer", - "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.silver_customers\"}, {\"identifier\": \"catalog.schema.silver_orders\"}]}}" - }' - ``` - -5. **Add sample questions** via the Databricks UI based on actual column names - -6. 
**Test** using conversation.py or the Databricks UI - -## Troubleshooting - -### No warehouse available - -- Create a SQL warehouse in the Databricks workspace -- Or provide a specific `warehouse_id` - -### Queries are slow - -- Ensure the warehouse is running (not stopped) -- Consider using a larger warehouse size -- Check if tables are optimized (OPTIMIZE, Z-ORDER) - -### Poor query generation - -- Use descriptive column names -- Add table and column comments -- Include sample questions that demonstrate the vocabulary -- Add instructions via the Databricks Genie UI - -### `databricks genie export-space` returns empty `serialized_space` - -Requires at least **CAN EDIT** permission on the space. - -### `databricks genie import-space` fails with permission error - -Ensure you have CREATE privileges in the target workspace folder. - -### Tables not found after migration - -Catalog name was not remapped β€” replace the source catalog name in `serialized_space` before calling `databricks genie import-space`. The catalog appears in table identifiers, SQL FROM clauses, join specs, and filter snippets; a single `sed 's/src_catalog/tgt_catalog/g'` on the whole JSON covers all occurrences. - -### CLI targets the wrong workspace - -Use `DATABRICKS_CONFIG_PROFILE=profile_name` to specify which workspace profile to use: -```bash -DATABRICKS_CONFIG_PROFILE=dev databricks genie list-spaces -``` - -### `databricks genie import-space` fails with JSON parse error - -The `serialized_space` string may contain multi-line SQL arrays with `\n` escape sequences. Flatten SQL arrays to single-line strings before passing to avoid double-escaping issues. From 0b5ac87f101e88ba6f8c719e85250624412388da Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 12:37:49 +0200 Subject: [PATCH 26/66] Add standalone compute.py script and consolidate Genie skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Create standalone compute.py with all logic inlined (no external deps) - Filter clusters to UI/API sources only (interactive, human-created) - Add page_size=100 for faster cluster listing - Use proper SDK types (JobEnvironment, Environment, timedelta) - Add integration tests for compute.py CLI - Merge Genie conversation.md into SKILL.md - Fix CLI commands in SKILL.md (databricks warehouses) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .test/tests/integration/test_compute.py | 245 +++++++ .../databricks-execution-compute/SKILL.md | 3 +- .../references/3-interactive-cluster.md | 139 ++-- .../scripts/compute.py | 668 ++++++++++++++++++ databricks-skills/databricks-genie/SKILL.md | 126 ++-- .../databricks-genie/conversation.md | 232 ------ 6 files changed, 1075 insertions(+), 338 deletions(-) create mode 100644 .test/tests/integration/test_compute.py create mode 100644 databricks-skills/databricks-execution-compute/scripts/compute.py delete mode 100644 databricks-skills/databricks-genie/conversation.md diff --git a/.test/tests/integration/test_compute.py b/.test/tests/integration/test_compute.py new file mode 100644 index 00000000..688d1536 --- /dev/null +++ b/.test/tests/integration/test_compute.py @@ -0,0 +1,245 @@ +"""Integration tests for compute.py CLI script. + +Tests actual subprocess execution of the compute CLI script. 
+""" +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +# Get repo root for running scripts +_repo_root = Path(__file__).resolve().parents[3] +_compute_script = _repo_root / "databricks-skills" / "databricks-execution-compute" / "scripts" / "compute.py" + + +class TestComputeScriptHelp: + """Test compute.py help and basic CLI structure.""" + + def test_script_shows_help(self): + """Verify script has help output.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "execute-code" in result.stdout + assert "list-compute" in result.stdout + assert "manage-cluster" in result.stdout + + def test_execute_code_help(self): + """Verify execute-code subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "execute-code", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--code" in result.stdout + assert "--compute-type" in result.stdout + + def test_list_compute_help(self): + """Verify list-compute subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--resource" in result.stdout + + def test_manage_cluster_help(self): + """Verify manage-cluster subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "manage-cluster", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--action" in result.stdout + + +@pytest.mark.integration +class TestListCompute: + """Tests for list-compute command.""" + + def test_list_clusters(self): + """Should list all clusters.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "clusters"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "clusters" in output + assert isinstance(output["clusters"], list) + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_list_node_types(self): + """Should list available node types.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "node_types"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "node_types" in output + assert isinstance(output["node_types"], list) + assert len(output["node_types"]) > 0 + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_list_spark_versions(self): + """Should list available Spark versions.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "spark_versions"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "spark_versions" in output + assert isinstance(output["spark_versions"], list) + assert len(output["spark_versions"]) > 0 + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + +@pytest.mark.integration +class TestExecuteCode: + """Tests for execute-code 
command.""" + + def test_execute_serverless_simple(self): + """Test simple Python execution on serverless.""" + code = 'print("Hello from compute test"); dbutils.notebook.exit("success")' + + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "execute-code", + "--code", code, + "--compute-type", "serverless", + "--timeout", "180" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=300 # 5 min for cold start + ) + + try: + output = json.loads(result.stdout) + assert output.get("success", False), f"Execution failed: {output}" + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_execute_requires_code_or_file(self): + """Should return error when neither code nor file provided.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "execute-code", + "--compute-type", "serverless" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + +@pytest.mark.integration +class TestManageCluster: + """Tests for manage-cluster command (read-only operations).""" + + def test_invalid_action(self): + """Should return error for invalid action.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "invalid_action" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + # argparse will fail with invalid choice + assert result.returncode != 0 or "error" in result.stdout.lower() + + def test_get_requires_cluster_id(self): + """Should return error when cluster_id not provided for get.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "get" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_create_requires_name(self): + """Should return error when name not provided for create.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "create" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md index 770d139a..151a467d 100644 --- a/databricks-skills/databricks-execution-compute/SKILL.md +++ b/databricks-skills/databricks-execution-compute/SKILL.md @@ -27,6 +27,7 @@ Run code on Databricks. Three execution modesβ€”choose based on workload. ### Decision Flow +Prefer Databricks Connect for all spark-based workload. ``` Spark-based code? β†’ Databricks Connect (fastest) └─ Python 3.12 missing? 
β†’ Install it + databricks-connect @@ -91,7 +92,7 @@ databricks jobs create --json '{ | `databricks jobs create/run-now` | Serverless, Cluster | Run code remotely | | `databricks clusters list` | Interactive | List clusters, check status | | `databricks clusters create/start/delete` | Interactive | Manage clusters. **COSTLY:** `start` takes 3-8 min | -| `databricks sql warehouses create/list` | SQL | Manage SQL warehouses | +| `databricks warehouses create/list` | SQL | Manage SQL warehouses | ## Related Skills diff --git a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md index 1ed6fd36..fbff2469 100644 --- a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md +++ b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md @@ -20,8 +20,8 @@ **Starting a cluster takes 3-8 minutes and costs money.** Always check first: -```python -list_compute(resource="clusters") +```bash +python scripts/compute.py list-compute --resource clusters ``` If no cluster is running, ask the user: @@ -34,58 +34,80 @@ If no cluster is running, ask the user: ### First Command: Creates Context -```python -result = execute_code( - code="import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", - compute_type="cluster", - cluster_id="1234-567890-abcdef" -) -# result contains context_id for reuse +```bash +python scripts/compute.py execute-code \ + --code "import pandas as pd; df = pd.DataFrame({'a': [1, 2, 3]}); print(df)" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" +``` + +Response includes `context_id` for reuse: +```json +{ + "success": true, + "output": " a\n0 1\n1 2\n2 3", + "context_id": "ctx_abc123", + "cluster_id": "1234-567890-abcdef" +} ``` ### Follow-up Commands: Reuse Context -```python +```bash # Variables from first command still available -execute_code( - code="print(df.shape)", # df exists - context_id=result["context_id"], - cluster_id=result["cluster_id"] -) +python scripts/compute.py execute-code \ + --code "print(df.shape)" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" \ + --context-id "ctx_abc123" ``` ### Auto-Select Best Running Cluster -```python -best_cluster = list_compute(resource="clusters", auto_select=True) -execute_code( - code="spark.range(100).show()", - compute_type="cluster", - cluster_id=best_cluster["cluster_id"] -) +```bash +# Get best running cluster +python scripts/compute.py list-compute --auto-select +# Returns: {"cluster_id": "1234-567890-abcdef"} + +# Then execute on it +python scripts/compute.py execute-code \ + --code "spark.range(100).show()" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" ``` ## Language Support -```python -execute_code(code='println("Hello")', compute_type="cluster", language="scala") -execute_code(code="SELECT * FROM table LIMIT 10", compute_type="cluster", language="sql") -execute_code(code='print("Hello")', compute_type="cluster", language="r") +```bash +# Scala +python scripts/compute.py execute-code --code 'println("Hello")' --compute-type cluster --language scala --cluster-id ... + +# SQL +python scripts/compute.py execute-code --code "SELECT * FROM table LIMIT 10" --compute-type cluster --language sql --cluster-id ... + +# R +python scripts/compute.py execute-code --code 'print("Hello")' --compute-type cluster --language r --cluster-id ... 
``` ## Installing Libraries -Install pip packages directly in the execution context (pandas/numpy are there by default): - -```python -# Install library -execute_code( - code="""%pip install faker - dbutils.library.restartPython()""", # Restart Python to pick up new packages (if needed) - compute_type="cluster", - cluster_id="...", - context_id="..." -) +Install pip packages directly in the execution context: + +```bash +python scripts/compute.py execute-code \ + --code "%pip install faker" \ + --compute-type cluster \ + --cluster-id "..." \ + --context-id "..." +``` + +If needed, restart Python to pick up new packages: +```bash +python scripts/compute.py execute-code \ + --code "dbutils.library.restartPython()" \ + --compute-type cluster \ + --cluster-id "..." \ + --context-id "..." ``` ## Context Lifecycle @@ -93,32 +115,31 @@ execute_code( **Keep alive (default):** Context persists until cluster terminates. **Destroy when done:** -```python -execute_code( - code="print('Done!')", - compute_type="cluster", - destroy_context_on_completion=True -) +```bash +python scripts/compute.py execute-code \ + --code "print('Done!')" \ + --compute-type cluster \ + --cluster-id "..." \ + --destroy-context ``` -## Handling No Running Cluster +## Managing Clusters -When no cluster is running, `execute_code` returns: -```json -{ - "success": false, - "error": "No running cluster available", - "startable_clusters": [{"cluster_id": "...", "cluster_name": "...", "state": "TERMINATED"}], - "suggestions": ["Start a terminated cluster", "Use serverless instead"] -} -``` +```bash +# List all clusters +python scripts/compute.py list-compute --resource clusters + +# Get specific cluster status +python scripts/compute.py list-compute --cluster-id "1234-567890-abcdef" + +# Start a cluster (WITH USER APPROVAL ONLY - costs money, 3-8min startup) +python scripts/compute.py manage-cluster --action start --cluster-id "1234-567890-abcdef" -### Starting a Cluster (With User Approval Only) +# Terminate a cluster (reversible) +python scripts/compute.py manage-cluster --action terminate --cluster-id "1234-567890-abcdef" -```python -manage_cluster(action="start", cluster_id="1234-567890-abcdef") -# Poll until running (wait 20sec) -list_compute(resource="clusters", cluster_id="1234-567890-abcdef") +# Create a new cluster +python scripts/compute.py manage-cluster --action create --name "my-cluster" --num-workers 2 ``` ## Common Issues @@ -127,7 +148,7 @@ list_compute(resource="clusters", cluster_id="1234-567890-abcdef") |-------|----------| | "No running cluster" | Ask user to start or use serverless | | Context not found | Context expired; create new one | -| Library not found | `%pip install ` then if needed `dbutils.library.restartPython()` | +| Library not found | `%pip install ` then restart Python if needed | ## When NOT to Use diff --git a/databricks-skills/databricks-execution-compute/scripts/compute.py b/databricks-skills/databricks-execution-compute/scripts/compute.py new file mode 100644 index 00000000..0e584f3c --- /dev/null +++ b/databricks-skills/databricks-execution-compute/scripts/compute.py @@ -0,0 +1,668 @@ +#!/usr/bin/env python3 +"""Compute CLI - Execute code and manage compute resources on Databricks. + +Standalone script with no external dependencies beyond databricks-sdk. 
+ +Commands: +- execute-code: Run code on serverless or cluster compute +- list-compute: List clusters, node types, or spark versions +- manage-cluster: Create, start, terminate, or delete clusters + +Requires: pip install databricks-sdk +""" + +import argparse +import base64 +import json +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Any, Dict, List, Optional + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.compute import ( + ClusterSource, + CommandStatus, + ContextStatus, + Environment, + Language, + ListClustersFilterBy, + ResultType, + State, +) +from databricks.sdk.service.jobs import ( + JobEnvironment, + NotebookTask, + RunResultState, + Source, + SubmitTask, +) +from databricks.sdk.service.workspace import ImportFormat, Language as WsLang + + +# --------------------------------------------------------------------------- +# Authentication +# --------------------------------------------------------------------------- + +def get_workspace_client() -> WorkspaceClient: + """Get authenticated WorkspaceClient using standard auth chain.""" + return WorkspaceClient() + + +def get_current_username() -> str: + """Get the current user's username.""" + w = get_workspace_client() + return w.current_user.me().user_name + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + +class NoRunningClusterError(Exception): + """Raised when no running cluster is available.""" + + def __init__(self, message: str, suggestions: List[str] = None, startable_clusters: List[Dict] = None): + super().__init__(message) + self.suggestions = suggestions or [] + self.startable_clusters = startable_clusters or [] + + +# --------------------------------------------------------------------------- +# Result Classes +# --------------------------------------------------------------------------- + +@dataclass +class ExecutionResult: + """Result from cluster command execution.""" + success: bool + output: str = "" + error: str = "" + cluster_id: str = "" + context_id: str = "" + status: str = "" + result_type: str = "" + + def to_dict(self) -> Dict[str, Any]: + return { + "success": self.success, + "output": self.output, + "error": self.error, + "cluster_id": self.cluster_id, + "context_id": self.context_id, + "status": self.status, + "result_type": self.result_type, + } + + +@dataclass +class ServerlessRunResult: + """Result from serverless code execution.""" + success: bool + output: str = "" + error: str = "" + run_id: int = 0 + run_page_url: str = "" + state: str = "" + execution_duration_ms: int = 0 + + def to_dict(self) -> Dict[str, Any]: + return { + "success": self.success, + "output": self.output, + "error": self.error, + "run_id": self.run_id, + "run_page_url": self.run_page_url, + "state": self.state, + "execution_duration_ms": self.execution_duration_ms, + } + + +# --------------------------------------------------------------------------- +# Cluster Execution +# --------------------------------------------------------------------------- + +def list_clusters() -> List[Dict[str, Any]]: + """List interactive clusters created by humans (UI/API, not jobs).""" + w = get_workspace_client() + clusters = [] + # Filter to only UI and API created clusters (interactive, human-created) + # Excludes JOB clusters (created by jobs) and other system clusters + filter_by = ListClustersFilterBy( + cluster_sources=[ClusterSource.UI, 
ClusterSource.API] + ) + for c in w.clusters.list(filter_by=filter_by, page_size=100): + clusters.append({ + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + "creator_user_name": c.creator_user_name, + "spark_version": c.spark_version, + "node_type_id": c.node_type_id, + "num_workers": c.num_workers, + }) + return clusters + + +def get_best_cluster() -> str: + """Get the best running interactive cluster ID, or raise NoRunningClusterError.""" + w = get_workspace_client() + running = [] + startable = [] + + # Filter to only interactive clusters (UI/API created) + filter_by = ListClustersFilterBy( + cluster_sources=[ClusterSource.UI, ClusterSource.API] + ) + for c in w.clusters.list(filter_by=filter_by, page_size=100): + info = { + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + } + if c.state == State.RUNNING: + running.append(info) + elif c.state in (State.TERMINATED, State.PENDING): + startable.append(info) + + if running: + return running[0]["cluster_id"] + + raise NoRunningClusterError( + "No running cluster available.", + suggestions=[ + "Start an existing cluster with: python compute.py manage-cluster --action start --cluster-id ", + "Use serverless compute: python compute.py execute-code --compute-type serverless --code '...'", + ], + startable_clusters=startable, + ) + + +def start_cluster(cluster_id: str) -> Dict[str, Any]: + """Start a cluster and wait for it to be running.""" + w = get_workspace_client() + w.clusters.start(cluster_id=cluster_id) + # Don't wait - just return immediately + return {"success": True, "cluster_id": cluster_id, "message": "Cluster start initiated"} + + +def get_cluster_status(cluster_id: str) -> Dict[str, Any]: + """Get the status of a specific cluster.""" + w = get_workspace_client() + c = w.clusters.get(cluster_id=cluster_id) + return { + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + "state_message": c.state_message, + "creator_user_name": c.creator_user_name, + "spark_version": c.spark_version, + "node_type_id": c.node_type_id, + "num_workers": c.num_workers, + } + + +def _get_or_create_context(w: WorkspaceClient, cluster_id: str, context_id: Optional[str], language: str) -> str: + """Get existing context or create a new one.""" + lang_map = {"python": Language.PYTHON, "scala": Language.SCALA, "sql": Language.SQL, "r": Language.R} + lang = lang_map.get(language.lower(), Language.PYTHON) + + if context_id: + # Verify context exists + try: + status = w.command_execution.context_status(cluster_id=cluster_id, context_id=context_id) + if status.status == ContextStatus.RUNNING: + return context_id + except Exception: + pass # Context doesn't exist, create new one + + # Create new context + ctx = w.command_execution.create(cluster_id=cluster_id, language=lang).result() + return ctx.id + + +def execute_databricks_command( + code: str, + cluster_id: Optional[str] = None, + context_id: Optional[str] = None, + language: str = "python", + timeout: int = 120, + destroy_context_on_completion: bool = False, +) -> ExecutionResult: + """Execute code on a Databricks cluster using Command Execution API.""" + w = get_workspace_client() + + # Get cluster ID if not provided + if not cluster_id: + cluster_id = get_best_cluster() + + # Get or create context + ctx_id = _get_or_create_context(w, cluster_id, context_id, language) + + # Execute command + lang_map = {"python": 
Language.PYTHON, "scala": Language.SCALA, "sql": Language.SQL, "r": Language.R} + lang = lang_map.get(language.lower(), Language.PYTHON) + + try: + cmd = w.command_execution.execute( + cluster_id=cluster_id, + context_id=ctx_id, + language=lang, + command=code, + ).result(timeout=timedelta(seconds=timeout)) + + # Parse results + output = "" + error = "" + result_type = cmd.results.result_type.value if cmd.results and cmd.results.result_type else "" + + if cmd.results: + if cmd.results.result_type == ResultType.TEXT: + output = cmd.results.data or "" + elif cmd.results.result_type == ResultType.TABLE: + output = json.dumps(cmd.results.data) if cmd.results.data else "" + elif cmd.results.result_type == ResultType.ERROR: + error = cmd.results.cause or str(cmd.results.data) or "Unknown error" + + success = cmd.status == CommandStatus.FINISHED and cmd.results.result_type != ResultType.ERROR + + return ExecutionResult( + success=success, + output=output, + error=error, + cluster_id=cluster_id, + context_id=ctx_id, + status=cmd.status.value if cmd.status else "", + result_type=result_type, + ) + + finally: + if destroy_context_on_completion and ctx_id: + try: + w.command_execution.destroy(cluster_id=cluster_id, context_id=ctx_id) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Serverless Execution +# --------------------------------------------------------------------------- + +def run_code_on_serverless( + code: str, + language: str = "python", + timeout: int = 1800, +) -> ServerlessRunResult: + """Run code on serverless compute using Jobs API runs/submit.""" + w = get_workspace_client() + + # Create temp notebook + username = get_current_username() + notebook_name = f"_tmp_serverless_{uuid.uuid4().hex[:8]}" + notebook_path = f"/Workspace/Users/{username}/.tmp/{notebook_name}" + + # Ensure directory exists + try: + w.workspace.mkdirs(f"/Workspace/Users/{username}/.tmp") + except Exception: + pass + + # Upload notebook content + if language.lower() == "sql": + notebook_content = f"-- Databricks notebook source\n{code}" + else: + notebook_content = f"# Databricks notebook source\n{code}" + + content_b64 = base64.b64encode(notebook_content.encode()).decode() + + ws_lang_map = {"python": WsLang.PYTHON, "sql": WsLang.SQL} + ws_lang = ws_lang_map.get(language.lower(), WsLang.PYTHON) + + w.workspace.import_( + path=notebook_path, + content=content_b64, + format=ImportFormat.SOURCE, + language=ws_lang, + overwrite=True, + ) + + try: + # Submit run + run = w.jobs.submit( + run_name=f"serverless-run-{uuid.uuid4().hex[:8]}", + tasks=[ + SubmitTask( + task_key="main", + notebook_task=NotebookTask( + notebook_path=notebook_path, + source=Source.WORKSPACE, + ), + environment_key="default", + ) + ], + environments=[ + JobEnvironment( + environment_key="default", + spec=Environment(client="1"), + ) + ], + ).result(timeout=timedelta(seconds=timeout)) + + # Get run output + run_output = w.jobs.get_run_output(run_id=run.tasks[0].run_id) + + output = "" + error = "" + success = run.state.result_state == RunResultState.SUCCESS + + if run_output.notebook_output and run_output.notebook_output.result: + output = run_output.notebook_output.result + if run_output.error: + error = run_output.error + + return ServerlessRunResult( + success=success, + output=output, + error=error, + run_id=run.run_id, + run_page_url=run.run_page_url or "", + state=run.state.result_state.value if run.state and run.state.result_state else "", + 
execution_duration_ms=run.execution_duration or 0, + ) + + finally: + # Cleanup temp notebook + try: + w.workspace.delete(notebook_path) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Cluster Management +# --------------------------------------------------------------------------- + +def create_cluster( + name: str, + num_workers: int = 1, + autotermination_minutes: int = 120, + spark_version: Optional[str] = None, + node_type_id: Optional[str] = None, +) -> Dict[str, Any]: + """Create a new cluster.""" + w = get_workspace_client() + + # Get defaults if not provided + if not spark_version: + versions = list(w.clusters.spark_versions()) + # Pick latest LTS + for v in versions: + if "LTS" in v.name and "ML" not in v.name: + spark_version = v.key + break + if not spark_version and versions: + spark_version = versions[0].key + + if not node_type_id: + node_types = list(w.clusters.list_node_types().node_types) + # Pick smallest available + for nt in sorted(node_types, key=lambda x: x.memory_mb or 0): + if nt.is_deprecated is not True: + node_type_id = nt.node_type_id + break + + cluster = w.clusters.create( + cluster_name=name, + spark_version=spark_version, + node_type_id=node_type_id, + num_workers=num_workers, + autotermination_minutes=autotermination_minutes, + ).result() + + return { + "success": True, + "cluster_id": cluster.cluster_id, + "cluster_name": name, + "message": "Cluster created", + } + + +def terminate_cluster(cluster_id: str) -> Dict[str, Any]: + """Terminate a cluster (can be restarted).""" + w = get_workspace_client() + w.clusters.delete(cluster_id=cluster_id) + return {"success": True, "cluster_id": cluster_id, "message": "Cluster terminated"} + + +def delete_cluster(cluster_id: str) -> Dict[str, Any]: + """Permanently delete a cluster.""" + w = get_workspace_client() + w.clusters.permanent_delete(cluster_id=cluster_id) + return {"success": True, "cluster_id": cluster_id, "message": "Cluster permanently deleted"} + + +def list_node_types() -> List[Dict[str, Any]]: + """List available node types.""" + w = get_workspace_client() + result = [] + for nt in w.clusters.list_node_types().node_types: + result.append({ + "node_type_id": nt.node_type_id, + "memory_mb": nt.memory_mb, + "num_cores": nt.num_cores, + "description": nt.description, + "is_deprecated": nt.is_deprecated, + }) + return result + + +def list_spark_versions() -> List[Dict[str, Any]]: + """List available Spark versions.""" + w = get_workspace_client() + result = [] + response = w.clusters.spark_versions() + for v in response.versions or []: + result.append({ + "key": v.key, + "name": v.name, + }) + return result + + +# --------------------------------------------------------------------------- +# CLI Commands +# --------------------------------------------------------------------------- + +def _none_if_empty(value): + """Convert empty strings to None.""" + return None if value == "" else value + + +def _no_cluster_error_response(e: NoRunningClusterError) -> Dict[str, Any]: + """Build a structured error response when no running cluster is available.""" + return { + "success": False, + "error": str(e), + "suggestions": e.suggestions, + "startable_clusters": e.startable_clusters, + } + + +def cmd_execute_code(args): + """Execute code on Databricks via serverless or cluster compute.""" + code = _none_if_empty(args.code) + file_path = _none_if_empty(args.file) + cluster_id = _none_if_empty(args.cluster_id) + context_id = _none_if_empty(args.context_id) 
+ language = _none_if_empty(args.language) or "python" + compute_type = args.compute_type + timeout = args.timeout + destroy_context = args.destroy_context + + if not code and not file_path: + return {"success": False, "error": "Either --code or --file must be provided."} + + # Read code from file if provided + if file_path and not code: + try: + with open(file_path, "r", encoding="utf-8") as f: + code = f.read() + except FileNotFoundError: + return {"success": False, "error": f"File not found: {file_path}"} + + # Resolve "auto" compute type + if compute_type == "auto": + if cluster_id or context_id: + compute_type = "cluster" + elif language.lower() in ("scala", "r"): + compute_type = "cluster" + else: + compute_type = "serverless" + + # Serverless execution + if compute_type == "serverless": + default_timeout = timeout if timeout else 1800 + result = run_code_on_serverless( + code=code, + language=language, + timeout=default_timeout, + ) + return result.to_dict() + + # Cluster execution + default_timeout = timeout if timeout else 120 + try: + result = execute_databricks_command( + code=code, + cluster_id=cluster_id, + context_id=context_id, + language=language, + timeout=default_timeout, + destroy_context_on_completion=destroy_context, + ) + return result.to_dict() + except NoRunningClusterError as e: + return _no_cluster_error_response(e) + + +def cmd_list_compute(args): + """List compute resources: clusters, node types, or spark versions.""" + resource = args.resource.lower() + cluster_id = _none_if_empty(args.cluster_id) + auto_select = args.auto_select + + if resource == "clusters": + if cluster_id: + return get_cluster_status(cluster_id) + if auto_select: + try: + best = get_best_cluster() + return {"cluster_id": best} + except NoRunningClusterError as e: + return _no_cluster_error_response(e) + return {"clusters": list_clusters()} + + elif resource == "node_types": + return {"node_types": list_node_types()} + + elif resource == "spark_versions": + return {"spark_versions": list_spark_versions()} + + else: + return {"success": False, "error": f"Unknown resource: {resource}. 
Use: clusters, node_types, spark_versions"} + + +def cmd_manage_cluster(args): + """Create, start, terminate, or delete a cluster.""" + action = args.action.lower() + cluster_id = _none_if_empty(args.cluster_id) + name = _none_if_empty(args.name) + + if action == "create": + if not name: + return {"success": False, "error": "name is required for create action."} + return create_cluster( + name=name, + num_workers=args.num_workers or 1, + autotermination_minutes=args.autotermination_minutes or 120, + ) + + elif action == "start": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for start action."} + return start_cluster(cluster_id) + + elif action == "terminate": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for terminate action."} + return terminate_cluster(cluster_id) + + elif action == "delete": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for delete action."} + return delete_cluster(cluster_id) + + elif action == "get": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for get action."} + try: + return get_cluster_status(cluster_id) + except Exception as e: + if "does not exist" in str(e).lower(): + return {"success": True, "cluster_id": cluster_id, "state": "DELETED", "exists": False} + return {"success": False, "error": str(e)} + + else: + return {"success": False, "error": f"Unknown action: {action}. Use: create, start, terminate, delete, get"} + + +# --------------------------------------------------------------------------- +# CLI Setup +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Execute code and manage compute on Databricks", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + # execute-code + exec_parser = subparsers.add_parser("execute-code", help="Run code on Databricks") + exec_parser.add_argument("--code", help="Code to execute") + exec_parser.add_argument("--file", help="File to execute") + exec_parser.add_argument("--compute-type", default="auto", choices=["auto", "serverless", "cluster"], + help="Compute type (default: auto)") + exec_parser.add_argument("--cluster-id", help="Cluster ID (for cluster compute)") + exec_parser.add_argument("--context-id", help="Context ID (reuse existing context)") + exec_parser.add_argument("--language", default="python", choices=["python", "scala", "sql", "r"], + help="Language (default: python)") + exec_parser.add_argument("--timeout", type=int, help="Timeout in seconds") + exec_parser.add_argument("--destroy-context", action="store_true", help="Destroy context after execution") + exec_parser.set_defaults(func=cmd_execute_code) + + # list-compute + list_parser = subparsers.add_parser("list-compute", help="List compute resources") + list_parser.add_argument("--resource", default="clusters", choices=["clusters", "node_types", "spark_versions"], + help="Resource to list (default: clusters)") + list_parser.add_argument("--cluster-id", help="Get specific cluster status") + list_parser.add_argument("--auto-select", action="store_true", help="Return best running cluster") + list_parser.set_defaults(func=cmd_list_compute) + + # manage-cluster + manage_parser = subparsers.add_parser("manage-cluster", help="Manage clusters") + manage_parser.add_argument("--action", required=True, choices=["create", "start", "terminate", "delete", "get"], 
+ help="Action to perform") + manage_parser.add_argument("--cluster-id", help="Cluster ID") + manage_parser.add_argument("--name", help="Cluster name (for create)") + manage_parser.add_argument("--num-workers", type=int, help="Number of workers (for create)") + manage_parser.add_argument("--autotermination-minutes", type=int, help="Auto-termination minutes (for create)") + manage_parser.set_defaults(func=cmd_manage_cluster) + + args = parser.parse_args() + result = args.func(args) + print(json.dumps(result, indent=2, default=str)) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index a56cfd02..a8652ebc 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -11,36 +11,55 @@ Create, manage, and query Genie Spaces - natural language interfaces for SQL-bas Genie Spaces allow users to ask natural language questions about structured data in Unity Catalog. The system translates questions into SQL queries, executes them on a SQL warehouse, and presents results conversationally. -## CLI Commands +## Creating a Genie Space -### Space Management +### Step 1: Understand the Data + +Before creating a Genie Space, explore the available tables to: +- **Select relevant tables** β€” typically gold layer (aggregated KPIs) and sometimes silver layer (cleaned facts) or metric views +- **Understand the story** β€” what business questions can this data answer? What insights can users discover? +- **Design meaningful sample questions** β€” questions should reflect real use cases and lead to actionable insights in the data + +```bash +# Discover table schemas, columns, and sample values +databricks experimental aitools tools discover-schema catalog.schema.gold_sales catalog.schema.gold_customers + +# Run SQL queries to explore the data and understand relationships +databricks sql exec "SELECT * FROM catalog.schema.gold_sales LIMIT 10" +databricks sql exec "DESCRIBE TABLE catalog.schema.gold_sales" +``` + +### Step 2: Create the Space + +Define your space in a local JSON file (e.g., `genie_space.json`) for version control and easy iteration. See "serialized_space Format" below for the full structure. ```bash # List all Genie Spaces databricks genie list-spaces -# Create a Genie Space -databricks genie create-space --json '{ - "warehouse_id": "WAREHOUSE_ID", - "title": "Sales Analytics", - "description": "Explore sales data", - "parent_path": "/Workspace/Users/you@company.com/genie_spaces", - "serialized_space": "{\"version\": 2, \"data_sources\": {\"tables\": [{\"identifier\": \"catalog.schema.table\"}]}}" -}' +# Create a Genie Space from a local file +# IMPORTANT: sample_questions require a 32-char hex "id" and "question" must be an array +databricks genie create-space --json "{ + \"warehouse_id\": \"WAREHOUSE_ID\", + \"title\": \"Sales Analytics\", + \"description\": \"Explore sales data\", + \"parent_path\": \"/Workspace/Users/you@company.com/genie_spaces\", + \"serialized_space\": $(cat genie_space.json | jq -c '.' 
| jq -Rs '.') +}" # Get space details (with full config) databricks genie get-space SPACE_ID --include-serialized-space -# Update a Genie Space -databricks genie update-space SPACE_ID --json '{ - "title": "Updated Name", - "description": "Updated description" -}' - # Delete a Genie Space databricks genie trash-space SPACE_ID ``` +### Step 3: Test and Iterate + +Use `scripts/conversation.py` (see Conversation API section below) to test questions and verify answers are accurate. + +If answers are inaccurate or incomplete, improve the space β€” see "Improving a Genie Space" below. + ### Export & Import ```bash @@ -51,11 +70,15 @@ databricks genie export-space SPACE_ID > exported.json databricks genie import-space --json @exported.json ``` -### Table Inspection +### Improving a Genie Space + +When Genie answers are inaccurate or incomplete, improve the space by updating questions, SQL examples, or instructions: ```bash -# Inspect table schemas before creating a space -databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 +# 1. Edit your local genie_space.json (add questions, fix SQL examples, improve instructions) + +# 2. Push updates back to the space +databricks genie update-space SPACE_ID --json "{\"serialized_space\": $(cat genie_space.json | jq -c '.' | jq -Rs '.')}" ``` ## serialized_space Format @@ -90,58 +113,69 @@ The `serialized_space` field is a JSON string containing the full space configur | `instructions.example_question_sqls[]` | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | | `instructions.text_instructions[]` | `{"id": "32hexchars", "content": ["..."]}` | -- **ID format:** 32-character lowercase hex UUID without hyphens. Generate with `uuid.uuid4().hex` in Python. +- **ID format:** 32-character lowercase hex UUID without hyphens. - **Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. -### Example +### Text Instructions + +`text_instructions` make the Genie Space more reliable by explaining: +- **Where to find information** β€” which tables contain which metrics +- **How to answer specific questions** β€” when a user asks X, use table Y with filter Z +- **Business context** β€” definitions, thresholds, and domain knowledge + +Well-crafted instructions significantly improve answer accuracy. + +### Complete Example + +This example shows a properly formatted `serialized_space` with sample questions, SQL examples, and text instructions. Note that every item has a unique 32-char hex `id` and all text fields are arrays: ```json { "version": 2, "config": { "sample_questions": [ - {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What were total sales last month?"]} + {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What is our current on-time performance?"]},... ] }, "data_sources": { - "tables": [{"identifier": "catalog.schema.orders"}] + "tables": [ + {"identifier": "catalog.ops.gold_otp_summary"},... 
+ ] }, "instructions": { "example_question_sqls": [ { "id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5", - "question": ["Show top customers"], - "sql": ["SELECT customer_name, SUM(amount) AS total ", "FROM catalog.schema.orders ", "GROUP BY 1 ORDER BY 2 DESC"] + "question": ["What is our on-time performance?"], + "sql": ["SELECT flight_date, ROUND(SUM(on_time_count) * 100.0 / SUM(total_flights), 1) AS otp_pct\n", "FROM catalog.ops.gold_otp_summary\n", "WHERE flight_date >= date_sub(current_date(), 7)\n", "GROUP BY flight_date ORDER BY flight_date"] + } + ], + "text_instructions": [ + { + "id": "c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6", + "content": [ + "On-time performance (OTP) questions: Use gold_otp_summary table. OTP target is 85%.\n", + "Delay analysis questions: Use gold_delay_analysis table. Filter by delay_code for specific delay types.\n", + "When asked about 'this week' or 'recent': Use flight_date >= date_sub(current_date(), 7).\n", + "When comparing aircraft: Join with gold_aircraft_reliability on tail_number." + ] } ] } } ``` -## Migration Workflow -### Clone (Same Workspace) +## Cross-Workspace Migration -```bash -databricks genie export-space SOURCE_ID > space.json -databricks genie import-space --json @space.json -``` - -### Cross-Workspace with Catalog Remapping - -When migrating between environments (dev β†’ prod), catalog names often differ. Remap them: +When migrating between workspaces, catalog names often differ. Export the space, remap with `sed`, then import: ```bash -# 1. Export from source workspace -DATABRICKS_CONFIG_PROFILE=source databricks genie export-space SPACE_ID > exported.json - -# 2. Remap catalog name -sed -i '' 's/source_catalog/target_catalog/g' exported.json - -# 3. Import to target workspace -DATABRICKS_CONFIG_PROFILE=target databricks genie import-space --json @exported.json +sed -i '' 's/source_catalog/target_catalog/g' genie_space.json ``` +Use `DATABRICKS_CONFIG_PROFILE=profile_name` to target different workspaces. + ## Conversation API Use `scripts/conversation.py` to ask questions programmatically: @@ -150,14 +184,14 @@ Use `scripts/conversation.py` to ask questions programmatically: # Ask a question python scripts/conversation.py ask SPACE_ID "What were total sales last month?" -# Follow-up in same conversation +# Follow-up in same conversation (Genie remembers context) python scripts/conversation.py ask SPACE_ID "Break down by region" --conversation-id CONV_ID -# With timeout +# With timeout for complex queries python scripts/conversation.py ask SPACE_ID "Complex query" --timeout 120 ``` -See [conversation.md](conversation.md) for full details. +Start a new conversation for unrelated topics. Use `--conversation-id` only for follow-ups on the same topic. ## Troubleshooting diff --git a/databricks-skills/databricks-genie/conversation.md b/databricks-skills/databricks-genie/conversation.md deleted file mode 100644 index 760c00d7..00000000 --- a/databricks-skills/databricks-genie/conversation.md +++ /dev/null @@ -1,232 +0,0 @@ -# Genie Conversations - -Use the Genie Conversation API to ask natural language questions to a curated Genie Space. - -## Overview - -The `scripts/conversation.py` script in this skill folder allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. 
- -## When to Use the Conversation API - -### Use Conversation API When: - -| Scenario | Why | -|----------|-----| -| Genie Space has curated business logic | Genie knows rules like "active customer = ordered in 90 days" | -| User explicitly says "ask Genie" or "use my Genie Space" | User intent to use their curated space | -| Complex business metrics with specific definitions | Genie has certified queries for official metrics | -| Testing a Genie Space after creating it | Validate the space works correctly | -| User wants conversational data exploration | Genie handles context for follow-up questions | - -### Use Direct SQL Instead When: - -| Scenario | Why | -|----------|-----| -| Simple ad-hoc query | Direct SQL is faster, no curation needed | -| You already have the exact SQL | No need for Genie to regenerate | -| Genie Space doesn't exist for this data | Can't use Genie without a space | -| Need precise control over the query | Direct SQL gives exact control | - -## CLI Usage - -Use the `scripts/conversation.py` script to ask questions: - -```bash -python scripts/conversation.py ask SPACE_ID "Your question here" -``` - -## Basic Usage - -### Ask a Question - -```bash -python scripts/conversation.py ask 01abc123... "What were total sales last month?" -``` - -**Response:** -```json -{ - "question": "What were total sales last month?", - "conversation_id": "conv_xyz789", - "message_id": "msg_123", - "status": "COMPLETED", - "sql": "SELECT SUM(total_amount) AS total_sales FROM orders WHERE order_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL 1 MONTH) AND order_date < DATE_TRUNC('month', CURRENT_DATE)", - "columns": ["total_sales"], - "data": [[125430.50]], - "row_count": 1 -} -``` - -### Ask Follow-up Questions - -Use the `conversation_id` from the first response to ask follow-up questions with context: - -```bash -# First question - capture the conversation_id from output -python scripts/conversation.py ask 01abc123... "What were total sales last month?" -# Output includes: "conversation_id": "conv_xyz789" - -# Follow-up (uses context from first question) -python scripts/conversation.py ask 01abc123... "Break that down by region" --conversation-id conv_xyz789 -``` - -Genie remembers the context, so "that" refers to "total sales last month". - -## Response Fields - -| Field | Description | -|-------|-------------| -| `question` | The original question asked | -| `conversation_id` | ID for follow-up questions | -| `message_id` | Unique message identifier | -| `status` | `COMPLETED`, `FAILED`, `CANCELLED`, `TIMEOUT` | -| `sql` | The SQL query Genie generated | -| `columns` | List of column names in result | -| `data` | Query results as list of rows | -| `row_count` | Number of rows returned | -| `text_response` | Text explanation (if Genie asks for clarification) | -| `error` | Error message (if status is not COMPLETED) | - -## Handling Responses - -### Successful Response - -The script returns JSON that can be parsed: - -```bash -python scripts/conversation.py ask SPACE_ID "Who are our top 10 customers?" | jq '.status' -# Output: "COMPLETED" -``` - -Response fields when status is `COMPLETED`: -- `sql`: The SQL query Genie generated -- `columns`: List of column names -- `data`: Query results as list of rows -- `row_count`: Number of rows returned - -### Failed Response - -```bash -python scripts/conversation.py ask SPACE_ID "What is the meaning of life?" 
| jq '.status, .error' -# Output: "FAILED" -# Output: "Could not generate SQL for this question" -``` - -Genie couldn't answer - may need to rephrase or use direct SQL. - -### Timeout - -```bash -python scripts/conversation.py ask SPACE_ID "Complex query" --timeout 120 | jq '.status' -# If timeout occurs: "TIMEOUT" -``` - -Query took too long - try a simpler question or increase timeout. - -## Example Workflows - -### Workflow 1: User Asks to Use Genie - -``` -User: "Ask my Sales Genie what the churn rate is" - -Claude: -1. Identifies user wants to use Genie (explicit request) -2. Runs: python scripts/conversation.py ask sales_genie_id "What is the churn rate?" -3. Returns: "Based on your Sales Genie, the churn rate is 4.2%. - Genie used this SQL: SELECT ..." -``` - -### Workflow 2: Testing a New Genie Space - -``` -User: "I just created a Genie Space for HR data. Can you test it?" - -Claude: -1. Gets the space_id from the user or recent databricks genie create-space result -2. Runs conversation.py with test questions: - - python scripts/conversation.py ask SPACE_ID "How many employees do we have?" - - python scripts/conversation.py ask SPACE_ID "What is the average salary by department?" -3. Reports results: "Your HR Genie is working. It correctly answered..." -``` - -### Workflow 3: Data Exploration with Follow-ups - -``` -User: "Use my analytics Genie to explore sales trends" - -Claude: -1. python scripts/conversation.py ask SPACE_ID "What were total sales by month this year?" - # Returns conversation_id: conv_xyz -2. User: "Which month had the highest growth?" -3. python scripts/conversation.py ask SPACE_ID "Which month had the highest growth?" -c conv_xyz -4. User: "What products drove that growth?" -5. python scripts/conversation.py ask SPACE_ID "What products drove that growth?" -c conv_xyz -``` - -## Best Practices - -### Start New Conversations for New Topics - -Don't reuse conversations across unrelated questions: - -```bash -# Good: New conversation for new topic -python scripts/conversation.py ask SPACE_ID "What were sales last month?" # New conversation -python scripts/conversation.py ask SPACE_ID "How many employees do we have?" # New conversation - -# Good: Follow-up for related question -python scripts/conversation.py ask SPACE_ID "What were sales last month?" -# Get conversation_id from output, then: -python scripts/conversation.py ask SPACE_ID "Break that down by product" -c CONV_ID # Related follow-up -``` - -### Handle Clarification Requests - -Genie may ask for clarification instead of returning results: - -```bash -python scripts/conversation.py ask SPACE_ID "Show me the data" | jq '.text_response' -# If Genie needs clarification, text_response will contain the question -# Rephrase with more specifics -``` - -### Set Appropriate Timeouts - -- Simple aggregations: 30-60 seconds -- Complex joins: 60-120 seconds -- Large data scans: 120+ seconds - -```bash -# Quick question (default 60s) -python scripts/conversation.py ask SPACE_ID "How many orders today?" 
- -# Complex analysis with longer timeout -python scripts/conversation.py ask SPACE_ID "Calculate customer lifetime value for all customers" --timeout 180 -``` - -## Troubleshooting - -### "Genie Space not found" - -- Verify the `space_id` is correct -- Check you have access to the space -- Use `databricks genie get-space SPACE_ID` to verify it exists - -### "Query timed out" - -- Increase timeout: `--timeout 120` -- Simplify the question -- Check if the SQL warehouse is running: `databricks warehouses list` - -### "Failed to generate SQL" - -- Rephrase the question more clearly -- Check if the question is answerable with the available tables -- Add more instructions/curation to the Genie Space via the Databricks UI - -### Unexpected Results - -- Review the generated SQL in the response (`jq '.sql'`) -- Add SQL instructions to the Genie Space via the Databricks UI -- Add sample questions that demonstrate correct patterns From 1a1d7d52746923f9a42890e9ced2e6c4119f4a14 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 13:18:20 +0200 Subject: [PATCH 27/66] Improve AI/BI dashboard skill with CLI-based workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add CRITICAL widget version requirements table - Document mandatory validation workflow (test queries before deploy) - Fix CLI commands: discover-schema requires CATALOG.SCHEMA.TABLE format - Fix lakeview create: use --display-name, --warehouse-id, --serialized-dashboard - Add Genie space linking via uiSettings.genieSpace - Add design best practices section - Remove duplicate 3-examples.md (content in 4-examples.md) - Update file references to match correct numbering πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-aibi-dashboards/3-examples.md | 286 ------------------ .../databricks-aibi-dashboards/SKILL.md | 168 ++++++++-- 2 files changed, 150 insertions(+), 304 deletions(-) delete mode 100644 databricks-skills/databricks-aibi-dashboards/3-examples.md diff --git a/databricks-skills/databricks-aibi-dashboards/3-examples.md b/databricks-skills/databricks-aibi-dashboards/3-examples.md deleted file mode 100644 index 078d3792..00000000 --- a/databricks-skills/databricks-aibi-dashboards/3-examples.md +++ /dev/null @@ -1,286 +0,0 @@ -# Complete Dashboard Examples - -Production-ready templates you can adapt for your use case. 
- -## Basic Dashboard (NYC Taxi) - -```bash -# Step 1: Check table schema -databricks experimental aitools tools discover-schema samples.nyctaxi.trips - -# Step 2: Test queries -databricks experimental aitools tools query --warehouse YOUR_WAREHOUSE_ID "SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, AVG(trip_distance) as avg_distance FROM samples.nyctaxi.trips" - -databricks experimental aitools tools query --warehouse YOUR_WAREHOUSE_ID "SELECT pickup_zip, COUNT(*) as trip_count FROM samples.nyctaxi.trips GROUP BY pickup_zip ORDER BY trip_count DESC LIMIT 10" - -# Step 3: Build dashboard JSON -dashboard = { - "datasets": [ - { - "name": "summary", - "displayName": "Summary Stats", - "queryLines": [ - "SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, ", - "AVG(trip_distance) as avg_distance ", - "FROM samples.nyctaxi.trips " - ] - }, - { - "name": "by_zip", - "displayName": "Trips by ZIP", - "queryLines": [ - "SELECT pickup_zip, COUNT(*) as trip_count ", - "FROM samples.nyctaxi.trips ", - "GROUP BY pickup_zip ", - "ORDER BY trip_count DESC ", - "LIMIT 10 " - ] - } - ], - "pages": [{ - "name": "overview", - "displayName": "NYC Taxi Overview", - "pageType": "PAGE_TYPE_CANVAS", - "layout": [ - # Text header - NO spec block! Use SEPARATE widgets for title and subtitle! - { - "widget": { - "name": "title", - "multilineTextboxSpec": { - "lines": ["## NYC Taxi Dashboard"] - } - }, - "position": {"x": 0, "y": 0, "width": 6, "height": 1} - }, - { - "widget": { - "name": "subtitle", - "multilineTextboxSpec": { - "lines": ["Trip statistics and analysis"] - } - }, - "position": {"x": 0, "y": 1, "width": 6, "height": 1} - }, - # Counter - version 2, width 2! - { - "widget": { - "name": "total-trips", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "trips", "expression": "`trips`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "trips", "displayName": "Total Trips"} - }, - "frame": {"title": "Total Trips", "showTitle": True} - } - }, - "position": {"x": 0, "y": 2, "width": 2, "height": 3} - }, - { - "widget": { - "name": "avg-fare", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "avg_fare", "expression": "`avg_fare`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "avg_fare", "displayName": "Avg Fare"} - }, - "frame": {"title": "Average Fare", "showTitle": True} - } - }, - "position": {"x": 2, "y": 2, "width": 2, "height": 3} - }, - { - "widget": { - "name": "total-distance", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "avg_distance", "expression": "`avg_distance`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "avg_distance", "displayName": "Avg Distance"} - }, - "frame": {"title": "Average Distance", "showTitle": True} - } - }, - "position": {"x": 4, "y": 2, "width": 2, "height": 3} - }, - # Bar chart - version 3 - { - "widget": { - "name": "trips-by-zip", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "by_zip", - "fields": [ - {"name": "pickup_zip", "expression": "`pickup_zip`"}, - {"name": "trip_count", "expression": "`trip_count`"} - ], - "disaggregated": True - } - }], - "spec": { - "version": 3, - "widgetType": "bar", - "encodings": { - 
"x": {"fieldName": "pickup_zip", "scale": {"type": "categorical"}, "displayName": "ZIP"}, - "y": {"fieldName": "trip_count", "scale": {"type": "quantitative"}, "displayName": "Trips"} - }, - "frame": {"title": "Trips by Pickup ZIP", "showTitle": True} - } - }, - "position": {"x": 0, "y": 5, "width": 6, "height": 5} - }, - # Table - version 2, minimal column props! - { - "widget": { - "name": "zip-table", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "by_zip", - "fields": [ - {"name": "pickup_zip", "expression": "`pickup_zip`"}, - {"name": "trip_count", "expression": "`trip_count`"} - ], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "table", - "encodings": { - "columns": [ - {"fieldName": "pickup_zip", "displayName": "ZIP Code"}, - {"fieldName": "trip_count", "displayName": "Trip Count"} - ] - }, - "frame": {"title": "Top ZIP Codes", "showTitle": True} - } - }, - "position": {"x": 0, "y": 10, "width": 6, "height": 5} - } - ] - }] -} - -# Step 4: Save dashboard JSON to file, then deploy via CLI -# Save the above dashboard dict to dashboard.json, then: -# databricks lakeview create --json @dashboard.json -``` - -## Dashboard with Global Filters - -```python -import json - -# Dashboard with a global filter for region -dashboard_with_filters = { - "datasets": [ - { - "name": "sales", - "displayName": "Sales Data", - "queryLines": [ - "SELECT region, SUM(revenue) as total_revenue ", - "FROM catalog.schema.sales ", - "GROUP BY region" - ] - } - ], - "pages": [ - { - "name": "overview", - "displayName": "Sales Overview", - "pageType": "PAGE_TYPE_CANVAS", - "layout": [ - { - "widget": { - "name": "total-revenue", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "sales", - "fields": [{"name": "total_revenue", "expression": "`total_revenue`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, # Version 2 for counters! - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "total_revenue", "displayName": "Total Revenue"} - }, - "frame": {"title": "Total Revenue", "showTitle": True} - } - }, - "position": {"x": 0, "y": 0, "width": 6, "height": 3} - } - ] - }, - { - "name": "filters", - "displayName": "Filters", - "pageType": "PAGE_TYPE_GLOBAL_FILTERS", # Required for global filter page! - "layout": [ - { - "widget": { - "name": "filter_region", - "queries": [{ - "name": "ds_sales_region", - "query": { - "datasetName": "sales", - "fields": [ - {"name": "region", "expression": "`region`"} - # DO NOT use associative_filter_predicate_group - causes SQL errors! - ], - "disaggregated": False # False for filters! - } - }], - "spec": { - "version": 2, # Version 2 for filters! - "widgetType": "filter-multi-select", # NOT "filter"! - "encodings": { - "fields": [{ - "fieldName": "region", - "displayName": "Region", - "queryName": "ds_sales_region" # Must match query name! - }] - }, - "frame": {"showTitle": True, "title": "Region"} # Always show title! 
- } - }, - "position": {"x": 0, "y": 0, "width": 2, "height": 2} - } - ] - } - ] -} - -# Deploy with filters -# Save dashboard_with_filters to dashboard_filters.json, then: -# databricks lakeview create --json @dashboard_filters.json -``` diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index ba325d2c..7572ece7 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -1,11 +1,26 @@ --- name: databricks-aibi-dashboards -description: "Create Databricks AI/BI dashboards. Use when creating, updating, or deploying Lakeview dashboards. CRITICAL: You MUST test ALL SQL queries via CLI BEFORE deploying. Follow guidelines strictly." +description: "Create Databricks AI/BI dashboards. Must use when creating, updating, or deploying Lakeview dashboards as Databricks Dashboard have a unique json structure. CRITICAL: You MUST test ALL SQL queries via CLI BEFORE deploying. Follow guidelines strictly." --- # AI/BI Dashboard Skill -Create Databricks AI/BI dashboards (formerly Lakeview dashboards). **Follow these guidelines strictly.** +Create Databricks AI/BI dashboards (formerly Lakeview dashboards). +A dashboard should be showing something relevant for a human, typically some KPI on the top, and based on the story, some graph (often temporal), and we see "something happens". +**Follow these guidelines strictly.** + +## CRITICAL: Widget Version Requirements + +> **Wrong version = broken widget!** This is the #1 cause of dashboard errors. + +| Widget Type | Version | Notes | +|-------------|---------|-------| +| `counter` | **2** | KPI cards | +| `table` | **2** | Data tables | +| `bar`, `line`, `area`, `pie` | **3** | Charts | +| `filter-*` | **2** | All filter types | + +--- ## CRITICAL: MANDATORY VALIDATION WORKFLOW @@ -22,7 +37,7 @@ Create Databricks AI/BI dashboards (formerly Lakeview dashboards). **Follow thes β”‚ - Verify column names match what widgets will reference β”‚ β”‚ - Verify data types are correct (dates, numbers, strings) β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 4: Build dashboard JSON using ONLY verified queries β”‚ +β”‚ STEP 4: Build dashboard JSON (serialized_dashboard content) β”‚ β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ β”‚ STEP 5: Deploy via databricks lakeview create β”‚ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ @@ -36,7 +51,21 @@ Create Databricks AI/BI dashboards (formerly Lakeview dashboards). 
**Follow thes ```bash # Get table schemas for designing queries +# IMPORTANT: Use CATALOG.SCHEMA.TABLE format (full 3-part name required) databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 + +# Example: +databricks experimental aitools tools discover-schema samples.nyctaxi.trips main.default.customers + +# Explore data patterns if needed (to understand what to visualize): +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT DISTINCT status FROM catalog.schema.orders" +``` + +### Step 2: Get Warehouse ID + +```bash +# List warehouses to find one for SQL execution +databricks warehouses list ``` ### Step 3: Test SQL Queries @@ -44,20 +73,53 @@ databricks experimental aitools tools discover-schema catalog.schema.table1 cata ```bash # Test SQL queries - MANDATORY before deployment! databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT COUNT(*) FROM catalog.schema.table" + +# Test aggregations that will be used in widgets: +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT region, SUM(revenue) FROM catalog.schema.sales GROUP BY region" ``` +### Step 4: Verify Data Matches Story + +Before finalizing, run validation queries to confirm the data tells the intended story: +```bash +# Example: Verify a spike/trend is visible in the data +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " +SELECT + CASE WHEN date < '2025-02-17' THEN 'Before' ELSE 'After' END as period, + AVG(metric) as avg_value +FROM catalog.schema.table +GROUP BY 1" +# Should show significant difference between periods if that's the story +``` + +If values don't match expectations, fix the data or adjust the story before creating the dashboard. + ### Step 5: Dashboard Lifecycle ```bash +# Create a dashboard +# IMPORTANT: Use --display-name, --warehouse-id, and --serialized-dashboard (NOT --json @file.json with displayName in it) +databricks lakeview create \ + --display-name "My Dashboard" \ + --warehouse-id "abc123def456" \ + --serialized-dashboard "$(cat dashboard.json)" + +# Alternative: Use --json with the correct structure +databricks lakeview create --json '{ + "display_name": "My Dashboard", + "warehouse_id": "abc123def456", + "serialized_dashboard": "{\"datasets\":[...],\"pages\":[...]}" +}' + # List all dashboards databricks lakeview list -# Create a dashboard from JSON file -databricks lakeview create --json @dashboard.json - # Get dashboard details databricks lakeview get DASHBOARD_ID +# Update a dashboard +databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat dashboard.json)" + # Publish a dashboard databricks lakeview publish DASHBOARD_ID --warehouse-id WAREHOUSE_ID @@ -68,21 +130,80 @@ databricks lakeview unpublish DASHBOARD_ID databricks lakeview trash DASHBOARD_ID ``` -### Get Available Warehouse +--- -```bash -# List warehouses to find one for SQL execution -databricks warehouses list +## JSON Structure (Required Skeleton) + +Every dashboard's `serialized_dashboard` content must follow this exact structure: + +```json +{ + "datasets": [ + { + "name": "ds_x", + "displayName": "Dataset X", + "queryLines": ["SELECT col1, col2 ", "FROM catalog.schema.table"] + } + ], + "pages": [ + { + "name": "main", + "displayName": "Main", + "pageType": "PAGE_TYPE_CANVAS", + "layout": [ + {"widget": {/* INLINE widget definition */}, "position": {"x":0,"y":0,"width":2,"height":3}} + ] + } + ] +} +``` + +**Structural rules (violations cause "failed to parse serialized 
dashboard"):** +- `queryLines`: Array of strings, NOT `"query": "string"` +- Widgets: INLINE in `layout[].widget`, NOT a separate `"widgets"` array +- `pageType`: Required on every page (`PAGE_TYPE_CANVAS` or `PAGE_TYPE_GLOBAL_FILTERS`) +- Query binding: `query.fields[].name` must exactly match `encodings.*.fieldName` + +### Linking a Genie Space (Optional) + +To add an "Ask Genie" button to the dashboard, add `uiSettings.genieSpace` to the JSON: + +```json +{ + "datasets": [...], + "pages": [...], + "uiSettings": { + "genieSpace": { + "isEnabled": true, + "overrideId": "your-genie-space-id-here", + "enablementMode": "ENABLED" + } + } +} ``` +> **Genie is NOT a widget.** Link via `uiSettings.genieSpace` only. There is no `"widgetType": "assistant"`. + +--- + +## Design Best Practices + +Apply unless user specifies otherwise: + +- **Global date filter**: When data has temporal columns, add a date range filter. Most dashboards need time-based filtering. +- **KPI time bounds**: Use time-bounded metrics that enable period comparison (MoM, YoY). Unbounded "all-time" totals are less actionable. +- **Value formatting**: Format values based on their meaning β€” currency with symbol, percentages with %, large numbers compacted (K/M/B). +- **Chart selection**: Match cardinality to chart type. Few distinct values β†’ pie/bar with color grouping; many values β†’ table. + ## Reference Files | What are you building? | Reference | |------------------------|-----------| | Any widget (text, counter, table, chart) | [1-widget-specifications.md](1-widget-specifications.md) | -| Dashboard with filters (global or page-level) | [2-filters.md](2-filters.md) | -| Need a complete working template to adapt | [3-examples.md](3-examples.md) | -| Debugging a broken dashboard | [4-troubleshooting.md](4-troubleshooting.md) | +| Advanced charts (area, scatter, combo, map) | [2-advanced-widget-specifications.md](2-advanced-widget-specifications.md) | +| Dashboard with filters (global or page-level) | [3-filters.md](3-filters.md) | +| Need a complete working template to adapt | [4-examples.md](4-examples.md) | +| Debugging a broken dashboard | [5-troubleshooting.md](5-troubleshooting.md) | --- @@ -90,12 +211,16 @@ databricks warehouses list ### 1) DATASET ARCHITECTURE -- **One dataset per domain** (e.g., orders, customers, products) +- **One dataset per domain** (e.g., orders, customers, products). Datasets shared across widgets benefit from the same filters. - **Exactly ONE valid SQL query per dataset** (no multiple queries separated by `;`) - Always use **fully-qualified table names**: `catalog.schema.table_name` - SELECT must include all dimensions needed by widgets and all derived columns via `AS` aliases - Put ALL business logic (CASE/WHEN, COALESCE, ratios) into the dataset SELECT with explicit aliases - **Contract rule**: Every widget `fieldName` must exactly match a dataset column or alias +- **Add ORDER BY** when visualization depends on data order: + - Time series: `ORDER BY date` for chronological display + - Rankings/Top-N: `ORDER BY metric DESC LIMIT 10` for "Top 10" charts + - Categorical charts: `ORDER BY metric DESC` to show largest values first ### 2) WIDGET FIELD EXPRESSIONS @@ -159,13 +284,20 @@ Each widget has a position: `{"x": 0, "y": 0, "width": 2, "height": 4}` **CRITICAL**: Each row must fill width=6 exactly. No gaps allowed. +``` +CORRECT: WRONG: +y=0: [w=6] y=0: [w=4]____ ← gap! +y=1: [w=2][w=2][w=2] ← fills 6 y=1: [w=1][w=1][w=1][w=1]__ ← gap! 
+y=4: [w=3][w=3] ← fills 6 +``` + **Recommended widget sizes:** | Widget Type | Width | Height | Notes | |-------------|-------|--------|-------| | Text header | 6 | 1 | Full width; use SEPARATE widgets for title and subtitle | | Counter/KPI | 2 | **3-4** | **NEVER height=2** - too cramped! | -| Line/Bar chart | 3 | **5-6** | Pair side-by-side to fill row | +| Line/Bar/Area chart | 3 | **5-6** | Pair side-by-side to fill row | | Pie chart | 3 | **5-6** | Needs space for legend | | Full-width chart | 6 | 5-7 | For detailed time series | | Table | 6 | 5-8 | Full width for readability | @@ -188,11 +320,11 @@ y=12: Table (w=6, h=6) - Detailed data | Dimension Type | Max Values | Examples | |----------------|------------|----------| | Chart color/groups | **3-8** | 4 regions, 5 product lines, 3 tiers | -| Filters | 4-10 | 8 countries, 5 channels | +| Filters | 4-15 | 8 countries, 5 channels | | High cardinality | **Table only** | customer_id, order_id, SKU | **Before creating any chart with color/grouping:** -1. Check column cardinality (use `databricks experimental aitools tools discover-schema` to see distinct values) +1. Check column cardinality via discover-schema or a COUNT DISTINCT query 2. If >10 distinct values, aggregate to higher level OR use TOP-N + "Other" bucket 3. For high-cardinality dimensions, use a table widget instead of a chart @@ -202,7 +334,7 @@ Before deploying, verify: 1. All widget names use only alphanumeric + hyphens + underscores 2. All rows sum to width=6 with no gaps 3. KPIs use height 3-4, charts use height 5-6 -4. Chart dimensions have ≀8 distinct values +4. Chart dimensions have reasonable cardinality (≀8 for colors/groups) 5. All widget fieldNames match dataset columns exactly 6. **Field `name` in query.fields matches `fieldName` in encodings exactly** (e.g., both `"sum(spend)"`) 7. Counter datasets: use `disaggregated: true` for 1-row datasets, `disaggregated: false` with aggregation for multi-row From 4a800d0c553dab959ecebea55c6fb67b7071d1ea Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 14:41:59 +0200 Subject: [PATCH 28/66] Streamline AI/BI dashboard skill based on real usage testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Quick Reference table at top for common CLI commands - Add Step 4 design phase with filter-to-dataset mapping - Add filter scope rule to checklist (filters only affect datasets with field) - Clarify percentage format (0-1 vs 0-100) with fix options - Add data variance guidance for trend charts - Condense expression examples using [option|option] notation - Remove redundant ASCII workflow diagram (steps below are clearer) - Link dataset parameters to filter widget documentation πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../1-widget-specifications.md | 18 +-- .../databricks-aibi-dashboards/SKILL.md | 146 +++++++++--------- 2 files changed, 76 insertions(+), 88 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md index d8e03c13..9f861c1b 100644 --- a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md +++ b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md @@ -297,7 +297,7 @@ Add `format` to any encoding to display values appropriately: ## Dataset Parameters -Use `:param` syntax in SQL for dynamic filtering: +Use `:param` syntax in SQL for dynamic filtering. 
Parameters can be bound to filter widgets (see [3-filters.md](3-filters.md)): ```json { @@ -323,19 +323,9 @@ Use `:param` syntax in SQL for dynamic filtering: Allowed in `query.fields` (no CAST or complex SQL): ```json -// Aggregations -{"name": "sum(revenue)", "expression": "SUM(`revenue`)"} -{"name": "avg(price)", "expression": "AVG(`price`)"} -{"name": "count(id)", "expression": "COUNT(`id`)"} -{"name": "countdistinct(id)", "expression": "COUNT(DISTINCT `id`)"} - -// Date truncation -{"name": "daily(date)", "expression": "DATE_TRUNC(\"DAY\", `date`)"} -{"name": "weekly(date)", "expression": "DATE_TRUNC(\"WEEK\", `date`)"} -{"name": "monthly(date)", "expression": "DATE_TRUNC(\"MONTH\", `date`)"} - -// Simple reference -{"name": "category", "expression": "`category`"} +{"name": "[sum|avg|count|countdistinct|min|max](col)", "expression": "[SUM|AVG|COUNT|COUNT(DISTINCT)|MIN|MAX](`col`)"} +{"name": "[daily|weekly|monthly](date)", "expression": "DATE_TRUNC(\"[DAY|WEEK|MONTH]\", `date`)"} +{"name": "field", "expression": "`field`"} ``` For conditional logic, compute in dataset SQL instead. diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 7572ece7..0e51d509 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -9,6 +9,21 @@ Create Databricks AI/BI dashboards (formerly Lakeview dashboards). A dashboard should be showing something relevant for a human, typically some KPI on the top, and based on the story, some graph (often temporal), and we see "something happens". **Follow these guidelines strictly.** +## Quick Reference + +| Task | Command | +|------|---------| +| List warehouses | `databricks warehouses list` | +| List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | +| Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | +| Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --serialized-dashboard "$(cat file.json)"` | +| Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | +| Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | +| Delete | `databricks lakeview trash DASHBOARD_ID` | + +--- + ## CRITICAL: Widget Version Requirements > **Wrong version = broken widget!** This is the #1 cause of dashboard errors. @@ -17,85 +32,69 @@ A dashboard should be showing something relevant for a human, typically some KPI |-------------|---------|-------| | `counter` | **2** | KPI cards | | `table` | **2** | Data tables | -| `bar`, `line`, `area`, `pie` | **3** | Charts | +| `bar`, `line`, `area`, `pie`, `scatter` | **3** | Charts | +| `combo`, `choropleth-map` | **1** | Advanced charts | | `filter-*` | **2** | All filter types | --- -## CRITICAL: MANDATORY VALIDATION WORKFLOW +## NEW DASHBOARD CREATION WORKFLOW -**You MUST follow this workflow exactly. Skipping validation causes broken dashboards.** +**You MUST test ALL SQL queries via CLI BEFORE deploying. 
Follow the overall logic in these steps for new dashboard - Skipping validation causes broken dashboards.** -``` -β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” -β”‚ STEP 1: Get table schemas via discover-schema β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 2: Write SQL queries for each dataset β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 3: TEST EVERY QUERY via CLI ← DO NOT SKIP! β”‚ -β”‚ - If query fails, FIX IT before proceeding β”‚ -β”‚ - Verify column names match what widgets will reference β”‚ -β”‚ - Verify data types are correct (dates, numbers, strings) β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 4: Build dashboard JSON (serialized_dashboard content) β”‚ -β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ -β”‚ STEP 5: Deploy via databricks lakeview create β”‚ -β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ -``` +### Step 1: Get Warehouse ID if not already known -**WARNING: If you deploy without testing queries, widgets WILL show "Invalid widget definition" errors!** - -## CLI Commands +```bash +# List warehouses to find one for SQL execution +databricks warehouses list +``` -### Step 1: Discover Table Schemas +### Step 2: Discover Table Schemas and existing data pattern ```bash # Get table schemas for designing queries +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SHOW TABLES IN catalog.schema" 2>&1 # IMPORTANT: Use CATALOG.SCHEMA.TABLE format (full 3-part name required) databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 # Example: databricks experimental aitools tools discover-schema samples.nyctaxi.trips main.default.customers -# Explore data patterns if needed (to understand what to visualize): -databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT DISTINCT status FROM catalog.schema.orders" +# Explore data patterns if needed to confirm the data tells the intended story (to understand what/how to visualize): +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "" ``` -### Step 2: Get Warehouse ID -```bash -# List warehouses to find one for SQL execution -databricks warehouses list -``` +### Step 3: Verify Data Matches Story +The datasets.querylines in the dashboard json (see example below) must be tested to ensure -### Step 3: Test SQL Queries +Before finalizing, run the SQL Queries you intend to add in each dataset to confirm that they run properly and that the result are valid. 
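For example, a dataset meant to back a revenue-by-region trend chart might be checked with a query like the one below (an illustrative sketch; catalog, table, and column names are placeholders), run through the query command shown in Step 2:

```sql
-- Illustrative validation run: confirms the query executes and returns the
-- columns the widgets and filters will reference (order_date, region, revenue)
SELECT order_date, region, SUM(revenue) AS revenue
FROM catalog.schema.sales
GROUP BY order_date, region
ORDER BY order_date
```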
+This is crucial, as the widget defined in the json will use the query field output to render the visualization. The value should also make sense at a business level. +Remember that for the filter to work, the query should have the field available (so typically group by the filter field) -```bash -# Test SQL queries - MANDATORY before deployment! -databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT COUNT(*) FROM catalog.schema.table" +If values don't match expectations, ensure the query is correct, fix the data if you can, or adjust the story before creating the dashboard. -# Test aggregations that will be used in widgets: -databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT region, SUM(revenue) FROM catalog.schema.sales GROUP BY region" -``` +### Step 4: Plan Dashboard Structure -### Step 4: Verify Data Matches Story +Before writing JSON, plan your dashboard: -Before finalizing, run validation queries to confirm the data tells the intended story: -```bash -# Example: Verify a spike/trend is visible in the data -databricks experimental aitools tools query --warehouse WAREHOUSE_ID " -SELECT - CASE WHEN date < '2025-02-17' THEN 'Before' ELSE 'After' END as period, - AVG(metric) as avg_value -FROM catalog.schema.table -GROUP BY 1" -# Should show significant difference between periods if that's the story -``` +1. You must know the expected specific JSON structure. For this, **Read reference files**: [1-widget-specifications.md](1-widget-specifications.md), [3-filters.md](3-filters.md), [4-examples.md](4-examples.md) -If values don't match expectations, fix the data or adjust the story before creating the dashboard. +2. Think: **What widgets?** Map each visualization to a dataset: + | Widget | Type | Dataset | Has filter field? | + |--------|------|---------|-------------------| + | Revenue KPI | counter | ds_sales | βœ“ date, region | + | Trend Chart | line | ds_sales | βœ“ date, region | + | Top Products | table | ds_products | βœ— no date | + ... -### Step 5: Dashboard Lifecycle +3. **What filters?** For each filter, verify ALL datasets you want filtered contain the filter field. + > **Filters only affect datasets that have the filter field.** A pre-aggregated table without dates WON'T be date-filtered. + +4. **Write JSON locally** as a file. +### Step 5: Dashboard Lifecycle +Once created, you can edit the file as following: ```bash # Create a dashboard # IMPORTANT: Use --display-name, --warehouse-id, and --serialized-dashboard (NOT --json @file.json with displayName in it) @@ -166,7 +165,7 @@ Every dashboard's `serialized_dashboard` content must follow this exact structur ### Linking a Genie Space (Optional) -To add an "Ask Genie" button to the dashboard, add `uiSettings.genieSpace` to the JSON: +To add an "Ask Genie" button to the dashboard, or to link a genie space/room with an ID, add `uiSettings.genieSpace` to the JSON: ```json { @@ -189,7 +188,6 @@ To add an "Ask Genie" button to the dashboard, add `uiSettings.genieSpace` to th ## Design Best Practices Apply unless user specifies otherwise: - - **Global date filter**: When data has temporal columns, add a date range filter. Most dashboards need time-based filtering. - **KPI time bounds**: Use time-bounded metrics that enable period comparison (MoM, YoY). Unbounded "all-time" totals are less actionable. - **Value formatting**: Format values based on their meaning β€” currency with symbol, percentages with %, large numbers compacted (K/M/B). 
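As a sketch of the KPI time bounds point above (illustrative only; the table and columns are placeholders), a month-over-month KPI dataset might compare the current and prior calendar months:

```sql
-- Illustrative MoM KPI dataset: current vs. prior calendar month in one row
SELECT
  SUM(CASE WHEN order_date >= DATE_TRUNC('MONTH', CURRENT_DATE) THEN revenue ELSE 0 END) AS current_month_revenue,
  SUM(CASE WHEN order_date >= ADD_MONTHS(DATE_TRUNC('MONTH', CURRENT_DATE), -1)
            AND order_date <  DATE_TRUNC('MONTH', CURRENT_DATE) THEN revenue ELSE 0 END) AS prior_month_revenue
FROM catalog.schema.sales
WHERE order_date >= ADD_MONTHS(DATE_TRUNC('MONTH', CURRENT_DATE), -1)
```

Because the query returns a single row, it suits a counter (KPI) widget.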
@@ -200,7 +198,7 @@ Apply unless user specifies otherwise: | What are you building? | Reference | |------------------------|-----------| | Any widget (text, counter, table, chart) | [1-widget-specifications.md](1-widget-specifications.md) | -| Advanced charts (area, scatter, combo, map) | [2-advanced-widget-specifications.md](2-advanced-widget-specifications.md) | +| Advanced charts (area, scatter/Bubble, combo (Line+Bar), Choropleth map) | [2-advanced-widget-specifications.md](2-advanced-widget-specifications.md) | | Dashboard with filters (global or page-level) | [3-filters.md](3-filters.md) | | Need a complete working template to adapt | [4-examples.md](4-examples.md) | | Debugging a broken dashboard | [5-troubleshooting.md](5-troubleshooting.md) | @@ -248,26 +246,10 @@ Apply unless user specifies otherwise: Allowed expressions in widget queries (you CANNOT use CAST or other SQL in expressions): -**For numbers:** ```json -{"name": "sum(revenue)", "expression": "SUM(`revenue`)"} -{"name": "avg(price)", "expression": "AVG(`price`)"} -{"name": "count(orders)", "expression": "COUNT(`order_id`)"} -{"name": "countdistinct(customers)", "expression": "COUNT(DISTINCT `customer_id`)"} -{"name": "min(date)", "expression": "MIN(`order_date`)"} -{"name": "max(date)", "expression": "MAX(`order_date`)"} -``` - -**For dates** (use daily for timeseries, weekly/monthly for grouped comparisons): -```json -{"name": "daily(date)", "expression": "DATE_TRUNC(\"DAY\", `date`)"} -{"name": "weekly(date)", "expression": "DATE_TRUNC(\"WEEK\", `date`)"} -{"name": "monthly(date)", "expression": "DATE_TRUNC(\"MONTH\", `date`)"} -``` - -**Simple field reference** (for pre-aggregated data): -```json -{"name": "category", "expression": "`category`"} +{"name": "[sum|avg|count|countdistinct|min|max](col)", "expression": "[SUM|AVG|COUNT|COUNT(DISTINCT)|MIN|MAX](`col`)"} +{"name": "[daily|weekly|monthly](date)", "expression": "DATE_TRUNC(\"[DAY|WEEK|MONTH]\", `date`)"} +{"name": "field", "expression": "`field`"} ``` If you need conditional logic or multi-field formulas, compute a derived column in the dataset SQL first. @@ -338,9 +320,25 @@ Before deploying, verify: 5. All widget fieldNames match dataset columns exactly 6. **Field `name` in query.fields matches `fieldName` in encodings exactly** (e.g., both `"sum(spend)"`) 7. Counter datasets: use `disaggregated: true` for 1-row datasets, `disaggregated: false` with aggregation for multi-row -8. Percent values are 0-1 (not 0-100) +8. **Percent values must be 0-1 for `number-percent` format** (0.865 displays as "86.5%", don't forget to set the format). If data is 0-100, either divide by 100 in SQL or use `number` format instead. 9. SQL uses Spark syntax (date_sub, not INTERVAL) 10. **All SQL queries tested via CLI and return expected data** +11. **Every dataset you want filtered MUST contain the filter field** β€” filters only affect datasets with that column in their query + +--- + +## Data Variance Considerations + +Before creating trend charts, check if the metric has enough variance to visualize meaningfully: + +```sql +SELECT MIN(metric), MAX(metric), MAX(metric) - MIN(metric) as range FROM dataset +``` + +If the range is very small relative to the scale (e.g., 83-89% on a 0-100 scale), the chart will appear nearly flat. 
Consider: +- Showing as KPI with delta/comparison instead of chart +- Using a table to display exact values +- Adjusting the visualization to focus on the variance --- From f1a745cb9d59c9b8898fa3f37bc24162f8893f33 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 14:50:49 +0200 Subject: [PATCH 29/66] Add testing section to skills README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Documents how to run unit and integration tests for skill scripts. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/databricks-skills/README.md b/databricks-skills/README.md index 08a77d32..ea3f3102 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -148,6 +148,26 @@ description: "What this teaches" ... ``` +## Testing + +Run tests for skill scripts (requires `pytest`): + +```bash +cd databricks-skills/.tests + +# Run all tests (unit tests are mocked, no Databricks connection needed) +python run_tests.py + +# Run only unit tests +python run_tests.py --unit + +# Run integration tests (requires Databricks connection) +python run_tests.py --integration + +# Verbose output +python run_tests.py -v +``` + ## Troubleshooting **Skills not loading?** Check `.claude/skills/` exists and each skill has `SKILL.md` From fcb36debd6e7bc3c1d15598db3fc817554540148 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 15:32:40 +0200 Subject: [PATCH 30/66] Fix Knowledge Assistant skill CLI examples and add Quick Reference MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix create-knowledge-source: use --json with source_type "files" and files.path (old --volume-config flag doesn't exist) - Add Quick Reference section with correct commands - Add volume discovery step: databricks volumes list CATALOG SCHEMA - Fix state name: CREATING (not PROVISIONING) - Streamline content, remove duplicates πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../1-knowledge-assistants.md | 211 +++++-------- .../databricks-agent-bricks/SKILL.md | 298 ++++++++---------- 2 files changed, 208 insertions(+), 301 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index 90015dbf..86c94b20 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -1,200 +1,149 @@ # Knowledge Assistants (KA) -Knowledge Assistants are document-based Q&A systems that use RAG (Retrieval-Augmented Generation) to answer questions from indexed documents. +Knowledge Assistants are document-based Q&A systems using RAG (Retrieval-Augmented Generation). -## What is a Knowledge Assistant? +## How It Works -A KA connects to documents stored in a Unity Catalog Volume and allows users to ask natural language questions. The system: - -1. **Indexes** all documents in the volume (PDFs, text files, etc.) +1. **Indexes** documents from a Volume (PDFs, text files) 2. **Retrieves** relevant chunks when a question is asked 3. 
**Generates** an answer using the retrieved context ## When to Use -Use a Knowledge Assistant when: -- You have a collection of documents (policies, manuals, guides, reports) +- Collection of documents (policies, manuals, guides, reports) - Users need to find specific information without reading entire documents -- You want to provide a conversational interface to documentation - -## Prerequisites +- Conversational interface to documentation -Before creating a KA, you need documents in a Unity Catalog Volume: +## Creating a Knowledge Assistant -**Option 1: Use existing documents** -- Upload PDFs/text files to a Volume manually or via SDK +### Step 1: Find Your Volume -**Option 2: Generate synthetic documents** -- Use the `databricks-unstructured-pdf-generation` skill to create realistic PDF documents -- Each PDF gets a companion JSON file with question/guideline pairs for evaluation +```bash +# List volumes in the schema +databricks volumes list catalog schema --output json -## Creating a Knowledge Assistant +# Browse volume contents +databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" +``` -Use the CLI to create a Knowledge Assistant: +### Step 2: Create the KA ```bash -# Step 1: Create the Knowledge Assistant databricks knowledge-assistants create-knowledge-assistant \ "HR Policy Assistant" \ "Answers questions about HR policies and procedures" - -# Step 2: Add a knowledge source (volume with documents) -databricks knowledge-assistants create-knowledge-source \ - "knowledge-assistants/{ka_id}" \ - "HR Documents" \ - "HR policy documents" \ - "VOLUME" \ - --volume-config '{"volume_id": "/Volumes/my_catalog/my_schema/raw_data/hr_docs"}' - -# Step 3: Sync (index) the knowledge sources -databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" ``` -The CLI will: -1. Create the KA with the specified configuration -2. Create a knowledge source pointing to your volume -3. 
Trigger indexing of the documents - -## Provisioning Timeline - -After creation, the KA endpoint needs to provision: - -| Status | Meaning | Duration | -|--------|---------|----------| -| `PROVISIONING` | Creating the endpoint | 2-5 minutes | -| `ONLINE` | Ready to use | - | -| `OFFLINE` | Not currently running | - | +Response: +```json +{ + "id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", + "name": "knowledge-assistants/dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", + "endpoint_name": "ka-dab408a2-endpoint", + "state": "CREATING" +} +``` -Use the CLI to check the status: +### Step 3: Add Knowledge Source ```bash -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +databricks knowledge-assistants create-knowledge-source \ + "knowledge-assistants/{ka_id}" \ + --json '{ + "display_name": "HR Documents", + "description": "HR policy PDFs", + "source_type": "files", + "files": {"path": "/Volumes/my_catalog/my_schema/hr_docs/"} + }' ``` -## Adding Example Questions +**Source types:** -Example questions help with: -- **Evaluation**: Test if the KA answers correctly -- **User onboarding**: Show users what to ask +| Type | Config | Use Case | +|------|--------|----------| +| `files` | `files.path` | PDFs/text in a Volume | +| `index` | `index.index_name`, `index.text_col`, `index.doc_uri_col` | Existing Vector Search index | -### Automatic (from PDF generation) +### Step 4: Sync and Wait -If you used `generate_pdf_documents`, each PDF has a companion JSON with: -```json -{ - "question": "What is the company's remote work policy?", - "guideline": "Should mention the 3-day minimum in-office requirement" -} -``` - -These are automatically added when `add_examples_from_volume=true` (default). +```bash +# Trigger indexing +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" -### Manual +# Check status +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +``` -Examples can also be added manually via the Databricks UI or SDK. +| State | Meaning | Duration | +|-------|---------|----------| +| `CREATING` | Provisioning endpoint | 2-5 minutes | +| `ONLINE` | Ready to use | - | +| `OFFLINE` | Not running | - | -## Best Practices +## Managing Knowledge Assistants -### Document Organization +```bash +# List all KAs +databricks knowledge-assistants list-knowledge-assistants -- **One volume per topic**: e.g., `/Volumes/catalog/schema/raw_data/hr_docs`, `/Volumes/catalog/schema/raw_data/tech_docs` -- **Clear naming**: Name files descriptively so chunks are identifiable +# Get details +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -### Instructions +# List knowledge sources +databricks knowledge-assistants list-knowledge-sources "knowledge-assistants/{ka_id}" -Good instructions improve answer quality: +# Update KA +databricks knowledge-assistants update-knowledge-assistant "knowledge-assistants/{ka_id}" "*" "New Name" "New Description" -``` -Be helpful and professional. When answering: -1. Always cite the specific document and section -2. If multiple documents are relevant, mention all of them -3. If the information isn't in the documents, clearly say so -4. Use bullet points for multi-part answers +# Delete KA +databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" ``` -### Updating Content +## Updating Content + +To update indexed documents: -To update the indexed documents: 1. Add/remove/modify files in the volume -2. 
Trigger a sync to re-index: +2. Re-sync: ```bash databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" ``` -3. The KA will re-index the updated content - -## Example Workflow - -1. **Generate PDF documents** using `databricks-unstructured-pdf-generation` skill: - - Creates PDFs in `/Volumes/catalog/schema/raw_data/pdf_documents` - - Creates JSON files with question/guideline pairs - -2. **Create the Knowledge Assistant**: - - `name`: "My Document Assistant" - - `volume_path`: "/Volumes/catalog/schema/raw_data/pdf_documents" - -3. **Wait for ONLINE status** (2-5 minutes) - -4. **Examples are automatically added** from the JSON files - -5. **Test the KA** in the Databricks UI ## Using KA in Supervisor Agents -Knowledge Assistants can be used as agents in a Supervisor Agent (formerly Multi-Agent Supervisor, MAS). Each KA has an associated model serving endpoint. - -### Finding the Endpoint Name - -Use the CLI to retrieve the KA details: +KAs can be added to Supervisor Agents using their tile ID: ```bash -# List all KAs to find the one you want -databricks knowledge-assistants list-knowledge-assistants - -# Get details for a specific KA -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -``` - -The response includes: -- `name`: The resource name (knowledge-assistants/{ka_id}) -- `display_name`: The KA display name -- Status information - -The endpoint name follows this pattern: `ka-{tile_id}-endpoint` - -### Example: Adding KA to Supervisor Agent +# Get KA tile ID +databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' -```bash -# First, list KAs to find the tile_id -databricks knowledge-assistants list-knowledge-assistants - -# Then use the tile_id to create a Supervisor Agent with mas_manager.py -python scripts/mas_manager.py create_mas "Support_MAS" '{ +# Use in Supervisor Agent +python scripts/mas_manager.py create_mas "Support MAS" '{ "agents": [ { - "name": "hr_agent", - "ka_tile_id": "", - "description": "Answers HR policy questions from the employee handbook" + "name": "policy_agent", + "ka_tile_id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", + "description": "Answers HR policy questions from documents" } ] }' ``` -## Troubleshooting +The endpoint name follows pattern: `ka-{tile_id}-endpoint` -### Endpoint stays in PROVISIONING +## Troubleshooting +### KA stays in CREATING state +- Wait up to 10 minutes - Check workspace capacity and quotas - Verify the volume path is accessible -- Wait up to 10 minutes before investigating further ### Documents not indexed - -- Ensure files are in a supported format (PDF, TXT, MD) +- Ensure files are supported format (PDF, TXT, MD) - Check file permissions in the volume -- Verify the volume path is correct +- Verify volume path is correct (trailing slash matters) ### Poor answer quality - -- Add more specific instructions +- Add instructions to guide the AI's behavior - Ensure documents are well-structured - Consider breaking large documents into smaller files diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 94f3d68a..c2a4206f 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -1,245 +1,203 @@ --- name: databricks-agent-bricks -description: "Create and manage Databricks Agent Bricks: Knowledge Assistants (KA) for document Q&A, Genie Spaces for SQL exploration, and Supervisor Agents (MAS) for multi-agent 
orchestration. Use when building conversational AI applications on Databricks." +description: "Create and manage Databricks Agent Bricks: Knowledge Assistants (KA) for document Q&A, Genie Spaces for SQL exploration, and Supervisor Agents for multi-agent orchestration." --- # Agent Bricks -Create and manage Databricks Agent Bricks - pre-built AI components for building conversational applications. +Create and manage Databricks Agent Bricks - pre-built AI components for conversational applications. ## Overview -Agent Bricks are three types of pre-built AI tiles in Databricks: - | Brick | Purpose | Data Source | |-------|---------|-------------| | **Knowledge Assistant (KA)** | Document-based Q&A using RAG | PDF/text files in Volumes | | **Genie Space** | Natural language to SQL | Unity Catalog tables | -| **Supervisor Agent (MAS)** | Multi-agent orchestration | Model serving endpoints | - -## Prerequisites - -Before creating Agent Bricks, ensure you have the required data: - -### For Knowledge Assistants -- **Documents in a Volume**: PDF, text, or other files stored in a Unity Catalog volume -- Generate synthetic documents using the `databricks-unstructured-pdf-generation` skill if needed - -### For Genie Spaces -- **See the `databricks-genie` skill** for comprehensive Genie Space guidance -- Tables in Unity Catalog with the data to explore -- Generate raw data using the `databricks-synthetic-data-gen` skill -- Create tables using the `databricks-spark-declarative-pipelines` skill +| **Supervisor Agent** | Multi-agent orchestration | KA, Genie, endpoints, UC functions, MCP | -### For Supervisor Agents -- **Model Serving Endpoints**: Deployed agent endpoints (KA endpoints, custom agents, fine-tuned models) -- **Genie Spaces**: Existing Genie spaces can be used directly as agents for SQL-based queries -- Mix and match endpoint-based and Genie-based agents in the same Supervisor Agent +## Quick Reference -### For Unity Catalog Functions -- **Existing UC Function**: Function already registered in Unity Catalog -- Agent service principal has `EXECUTE` privilege on the function - -### For External MCP Servers -- **Existing UC HTTP Connection**: Connection configured with `is_mcp_connection: 'true'` -- Agent service principal has `USE CONNECTION` privilege on the connection - -## CLI Tools - -### Knowledge Assistant CLI +### Knowledge Assistant ```bash -# List all Knowledge Assistants -databricks knowledge-assistants list-knowledge-assistants +# List volumes in a schema +databricks volumes list CATALOG SCHEMA -# Create a Knowledge Assistant -databricks knowledge-assistants create-knowledge-assistant "My KA Name" "Description of what the KA does" +# Browse volume contents (via SQL) +databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" -# Get a Knowledge Assistant by name (resource name format: knowledge-assistants/{id}) -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +# Create KA +databricks knowledge-assistants create-knowledge-assistant "Name" "Description" -# Update a Knowledge Assistant -databricks knowledge-assistants update-knowledge-assistant "knowledge-assistants/{ka_id}" "*" "New Name" "New Description" +# Add knowledge source (files from volume) +databricks knowledge-assistants create-knowledge-source "knowledge-assistants/{ka_id}" \ + --json '{"display_name": "Source Name", "description": "...", "source_type": "files", "files": {"path": "/Volumes/catalog/schema/volume/"}}' -# Delete a Knowledge 
Assistant -databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" - -# Create a Knowledge Source (volume-based documents) -databricks knowledge-assistants create-knowledge-source "knowledge-assistants/{ka_id}" "Source Name" "Description" "VOLUME" \ - --volume-config '{"volume_id": "/Volumes/catalog/schema/volume"}' +# Sync (index) documents +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" -# List Knowledge Sources for a KA -databricks knowledge-assistants list-knowledge-sources "knowledge-assistants/{ka_id}" +# Check status (wait for ONLINE) +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -# Sync (re-index) Knowledge Sources -databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" +# List all KAs +databricks knowledge-assistants list-knowledge-assistants ``` -### Genie Space CLI +### Genie Space -**For comprehensive Genie guidance, use the `databricks-genie` skill.** +**See `databricks-genie` skill for comprehensive guidance.** ```bash -# List all Genie Spaces databricks genie list-spaces - -# Create a Genie Space -databricks genie create-space --json '{"display_name": "My Genie", "description": "...", "table_identifiers": ["catalog.schema.table"]}' - -# Get a Genie Space +databricks genie create-space --json '{"display_name": "Name", "description": "...", "table_identifiers": ["catalog.schema.table"]}' databricks genie get-space SPACE_ID - -# Update a Genie Space -databricks genie update-space SPACE_ID --json '{"display_name": "New Name"}' - -# Delete (trash) a Genie Space -databricks genie trash-space SPACE_ID ``` -See `databricks-genie` skill for: -- Table inspection workflow -- Sample question best practices -- Curation (instructions, certified queries) +### Supervisor Agent -**IMPORTANT**: There is NO system table for Genie spaces (e.g., `system.ai.genie_spaces` does not exist). Use `databricks genie list-spaces` to find spaces. +**No CLI** - use `scripts/mas_manager.py`: -### Supervisor Agent (MAS) - -**NO CLI AVAILABLE** - Supervisor Agents are managed via the self-contained `scripts/mas_manager.py` script in this skill folder. 
- -Install requirements first: ```bash -pip install databricks-sdk requests -``` - -Usage: - -```bash -# List all Supervisor Agents python scripts/mas_manager.py list_mas - -# Create a Supervisor Agent -python scripts/mas_manager.py create_mas "My Supervisor" '{"agents": [...], "description": "...", "instructions": "..."}' - -# Get a Supervisor Agent by tile ID +python scripts/mas_manager.py create_mas "Name" '{"description": "...", "agents": [...]}' python scripts/mas_manager.py get_mas TILE_ID +``` -# Find a Supervisor Agent by name -python scripts/mas_manager.py find_mas "My Supervisor" +--- -# Update a Supervisor Agent -python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], ...}' +## Knowledge Assistant Workflow -# Delete a Supervisor Agent -python scripts/mas_manager.py delete_mas TILE_ID - -# Add examples (MAS must be ONLINE) -python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' +### Step 1: Find Your Volume -# Add examples with wait (for jobs - waits up to 10 min for ONLINE) -python scripts/mas_manager.py add_examples_wait TILE_ID '[{"question": "...", "guideline": "..."}]' +```bash +# List volumes in the schema +databricks volumes list skywest_airlines ops_control --output json -# List examples -python scripts/mas_manager.py list_examples TILE_ID +# Browse volume contents +databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" ``` -Agent configuration options (provide exactly one per agent): -- `ka_tile_id`: Knowledge Assistant tile ID (for document Q&A agents) -- `genie_space_id`: Genie space ID (for SQL-based data agents) -- `endpoint_name`: Model serving endpoint name (for custom agents) -- `uc_function_name`: Unity Catalog function name in format `catalog.schema.function_name` -- `connection_name`: Unity Catalog connection name (for external MCP servers) +### Step 2: Create the KA -## Typical Workflow +```bash +databricks knowledge-assistants create-knowledge-assistant \ + "Engineering Docs Assistant" \ + "Answers questions about engineering documents and procedures" +``` -### 1. Generate Source Data +Response includes `id` (e.g., `dab408a2-f8f4-439e-b65d-cc3cc2c45bbd`). -Before creating Agent Bricks, generate the required source data: +### Step 3: Add Knowledge Source -**For KA (document Q&A)**: -``` -1. Use `databricks-unstructured-pdf-generation` skill to generate PDFs -2. PDFs are saved to a Volume with companion JSON files (question/guideline pairs) +```bash +databricks knowledge-assistants create-knowledge-source \ + "knowledge-assistants/{ka_id}" \ + --json '{ + "display_name": "Engineering Documents", + "description": "PDFs from engineering docs folder", + "source_type": "files", + "files": {"path": "/Volumes/catalog/schema/volume/docs/"} + }' ``` -**For Genie (SQL exploration)**: -``` -1. Use `databricks-synthetic-data-gen` skill to create raw parquet data -2. Use `databricks-spark-declarative-pipelines` skill to create bronze/silver/gold tables -``` +**Source types:** +- `files` - PDFs/text from a Volume path (`files.path`) +- `index` - Existing Vector Search index (`index.index_name`, `index.text_col`, `index.doc_uri_col`) -### 2. Create the Agent Brick +### Step 4: Sync and Wait -Use the CLI commands above or SDK to create your Agent Bricks with data sources. +```bash +# Trigger indexing +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" -### 3. 
Wait for Provisioning +# Check status (wait for state: ONLINE) +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +``` -Newly created KA and MAS tiles need time to provision. The endpoint status will progress: -- `PROVISIONING` - Being created (can take 2-5 minutes) -- `ONLINE` - Ready to use -- `OFFLINE` - Not running +| State | Meaning | +|-------|---------| +| `CREATING` | Provisioning endpoint (2-5 min) | +| `ONLINE` | Ready to use | +| `OFFLINE` | Not running | -### 4. Add Examples (Automatic) +--- -For KA, if `add_examples_from_volume=true`, examples are automatically extracted from JSON files in the volume and added once the endpoint is `ONLINE`. +## Supervisor Agent Workflow -## Best Practices +Supervisor Agents orchestrate multiple agents (KA, Genie, endpoints, UC functions, MCP). -1. **Use meaningful names**: Names are sanitized automatically (spaces become underscores) -2. **Provide descriptions**: Helps users understand what the brick does -3. **Add instructions**: Guide the AI's behavior and tone -4. **Include sample questions**: Shows users how to interact with the brick -5. **Use the workflow**: Generate data first, then create the brick +### Prerequisites -## Example: Multi-Modal Supervisor Agent +```bash +pip install databricks-sdk requests +``` -Use `scripts/mas_manager.py` to create a Supervisor Agent: +### Create a Supervisor Agent ```bash -python scripts/mas_manager.py create_mas "Enterprise Support Supervisor" '{ - "description": "Comprehensive enterprise support agent", - "instructions": "Route queries as follows:\n1. Policy/procedure questions β†’ knowledge_base\n2. Data analysis requests β†’ analytics_engine\n3. Ticket classification β†’ ml_classifier", +python scripts/mas_manager.py create_mas "Support Supervisor" '{ + "description": "Routes queries to specialized agents", + "instructions": "Route billing questions to billing_agent, technical questions to docs_agent", "agents": [ { - "name": "knowledge_base", - "ka_tile_id": "f32c5f73-466b-...", - "description": "Answers questions about company policies from indexed files" - }, - { - "name": "analytics_engine", + "name": "billing_agent", "genie_space_id": "01abc123...", - "description": "Runs SQL analytics on usage metrics" - }, - { - "name": "ml_classifier", - "endpoint_name": "custom-classification-endpoint", - "description": "Classifies support tickets using custom ML model" + "description": "SQL analytics on billing data" }, { - "name": "data_enrichment", - "uc_function_name": "support.utils.enrich_ticket_data", - "description": "Enriches support ticket data with customer history" - }, - { - "name": "ticket_operations", - "connection_name": "ticket_system_mcp", - "description": "Creates and updates support tickets in external system" + "name": "docs_agent", + "ka_tile_id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", + "description": "Answers questions from technical documentation" } ] }' ``` -## Related Skills +### Agent Types + +Each agent needs exactly ONE of: -- **[databricks-genie](../databricks-genie/SKILL.md)** - Comprehensive Genie Space creation, curation, and Conversation API guidance -- **[databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md)** - Generate synthetic PDFs to feed into Knowledge Assistants -- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Create raw data for Genie Space tables -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build 
bronze/silver/gold tables consumed by Genie Spaces -- **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - Deploy custom agent endpoints used as MAS agents -- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Build vector indexes for RAG applications paired with KAs +| Field | Agent Type | +|-------|------------| +| `ka_tile_id` | Knowledge Assistant (document Q&A) | +| `genie_space_id` | Genie Space (SQL analytics) | +| `endpoint_name` | Model serving endpoint (custom agent) | +| `uc_function_name` | Unity Catalog function (`catalog.schema.function`) | +| `connection_name` | UC HTTP Connection (MCP server) | -## See Also +### Find IDs + +```bash +# KA tile ID +databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' + +# Genie space ID +databricks genie list-spaces --output json | jq '.[].space_id' +``` + +### Manage Supervisor Agents + +```bash +python scripts/mas_manager.py get_mas TILE_ID +python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", ...}' +python scripts/mas_manager.py delete_mas TILE_ID + +# Add examples (requires ONLINE status) +python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' +``` + +--- + +## Reference Files + +| Topic | File | +|-------|------| +| KA details, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | +| Supervisor Agent details, MCP, UC functions | [2-supervisor-agents.md](2-supervisor-agents.md) | +| Genie Spaces | See `databricks-genie` skill | + +## Related Skills -- `1-knowledge-assistants.md` - Detailed KA patterns and examples -- `databricks-genie` skill - Detailed Genie patterns, curation, and examples -- `2-supervisor-agents.md` - Detailed MAS patterns and examples +- **databricks-genie** - Genie Space creation, curation, Conversation API +- **databricks-unstructured-pdf-generation** - Generate synthetic PDFs for KA +- **databricks-model-serving** - Deploy custom agent endpoints From 59eea3957ad107de21d07b9b60897e77b06d06c1 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 16:47:59 +0200 Subject: [PATCH 31/66] Improve Supervisor Agent skill documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Fix status table: NOT_READY instead of PROVISIONING - Expand Quick Reference with complete working example - Add note about running from skill folder - Include ID lookup commands in Quick Reference πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../2-supervisor-agents.md | 2 +- .../databricks-agent-bricks/SKILL.md | 20 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index 893c49be..75ccae4f 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -248,7 +248,7 @@ After creation, the Supervisor Agent endpoint needs to provision: | Status | Meaning | Duration | |--------|---------|----------| -| `PROVISIONING` | Creating the supervisor | 2-5 minutes | +| `NOT_READY` | Creating the supervisor | 2-5 minutes | | `ONLINE` | Ready to route queries | - | | `OFFLINE` | Not currently running | - | diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index c2a4206f..84a61325 100644 --- 
a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -55,12 +55,28 @@ databricks genie get-space SPACE_ID ### Supervisor Agent -**No CLI** - use `scripts/mas_manager.py`: +**No CLI** - use `scripts/mas_manager.py` (run from skill folder): ```bash +# List existing MAS python scripts/mas_manager.py list_mas -python scripts/mas_manager.py create_mas "Name" '{"description": "...", "agents": [...]}' + +# Create MAS with KA and Genie agents +python scripts/mas_manager.py create_mas "My Supervisor" '{ + "description": "Routes queries to specialized agents", + "instructions": "Route data questions to analyst. Route document questions to docs_agent.", + "agents": [ + {"name": "analyst", "genie_space_id": "01abc...", "description": "SQL analytics on data"}, + {"name": "docs_agent", "ka_tile_id": "dab408a2-...", "description": "Answers from documents"} + ] +}' + +# Check status (wait for ONLINE) python scripts/mas_manager.py get_mas TILE_ID + +# Find IDs +databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' +databricks genie list-spaces --output json | jq '.[].space_id' ``` --- From 1391f1f01537bf54c8b98c6f1cae8d09614bbab1 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 16:57:44 +0200 Subject: [PATCH 32/66] Restructure Agent Bricks skill - remove redundancy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKILL.md: Quick Reference only (commands for KA, MAS) - Clarify Genie is in databricks-genie skill - Add explanation of what Agent Bricks are - Remove Genie section (handled by separate skill) 1-knowledge-assistants.md: Source types + troubleshooting only 2-supervisor-agents.md: UC functions, MCP, descriptions, examples, troubleshooting Removed 500+ lines of duplicate content. πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../1-knowledge-assistants.md | 156 ++----- .../2-supervisor-agents.md | 392 ++---------------- .../databricks-agent-bricks/SKILL.md | 206 ++------- 3 files changed, 116 insertions(+), 638 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index 86c94b20..9fd1ffd5 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -1,149 +1,53 @@ -# Knowledge Assistants (KA) +# Knowledge Assistants - Details -Knowledge Assistants are document-based Q&A systems using RAG (Retrieval-Augmented Generation). +For commands, see [SKILL.md](SKILL.md). -## How It Works +## Source Types -1. **Indexes** documents from a Volume (PDFs, text files) -2. **Retrieves** relevant chunks when a question is asked -3. 
**Generates** an answer using the retrieved context +### Files (Volume) -## When to Use - -- Collection of documents (policies, manuals, guides, reports) -- Users need to find specific information without reading entire documents -- Conversational interface to documentation - -## Creating a Knowledge Assistant - -### Step 1: Find Your Volume - -```bash -# List volumes in the schema -databricks volumes list catalog schema --output json - -# Browse volume contents -databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" -``` - -### Step 2: Create the KA - -```bash -databricks knowledge-assistants create-knowledge-assistant \ - "HR Policy Assistant" \ - "Answers questions about HR policies and procedures" -``` - -Response: ```json { - "id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", - "name": "knowledge-assistants/dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", - "endpoint_name": "ka-dab408a2-endpoint", - "state": "CREATING" + "source_type": "files", + "files": {"path": "/Volumes/catalog/schema/volume/folder/"} } ``` -### Step 3: Add Knowledge Source - -```bash -databricks knowledge-assistants create-knowledge-source \ - "knowledge-assistants/{ka_id}" \ - --json '{ - "display_name": "HR Documents", - "description": "HR policy PDFs", - "source_type": "files", - "files": {"path": "/Volumes/my_catalog/my_schema/hr_docs/"} - }' -``` - -**Source types:** - -| Type | Config | Use Case | -|------|--------|----------| -| `files` | `files.path` | PDFs/text in a Volume | -| `index` | `index.index_name`, `index.text_col`, `index.doc_uri_col` | Existing Vector Search index | - -### Step 4: Sync and Wait - -```bash -# Trigger indexing -databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" - -# Check status -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -``` - -| State | Meaning | Duration | -|-------|---------|----------| -| `CREATING` | Provisioning endpoint | 2-5 minutes | -| `ONLINE` | Ready to use | - | -| `OFFLINE` | Not running | - | - -## Managing Knowledge Assistants +Supported formats: PDF, TXT, MD, DOCX -```bash -# List all KAs -databricks knowledge-assistants list-knowledge-assistants +### Vector Search Index -# Get details -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" +Use existing index instead of auto-indexing: -# List knowledge sources -databricks knowledge-assistants list-knowledge-sources "knowledge-assistants/{ka_id}" - -# Update KA -databricks knowledge-assistants update-knowledge-assistant "knowledge-assistants/{ka_id}" "*" "New Name" "New Description" - -# Delete KA -databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" +```json +{ + "source_type": "index", + "index": { + "index_name": "catalog.schema.my_index", + "text_col": "content", + "doc_uri_col": "source_url" + } +} ``` ## Updating Content -To update indexed documents: - -1. Add/remove/modify files in the volume -2. 
Re-sync: - ```bash - databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" - ``` - -## Using KA in Supervisor Agents - -KAs can be added to Supervisor Agents using their tile ID: - -```bash -# Get KA tile ID -databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' - -# Use in Supervisor Agent -python scripts/mas_manager.py create_mas "Support MAS" '{ - "agents": [ - { - "name": "policy_agent", - "ka_tile_id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", - "description": "Answers HR policy questions from documents" - } - ] -}' -``` - -The endpoint name follows pattern: `ka-{tile_id}-endpoint` +1. Add/modify/remove files in the Volume +2. Re-sync: `databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}"` ## Troubleshooting -### KA stays in CREATING state +**KA stays in CREATING:** - Wait up to 10 minutes -- Check workspace capacity and quotas -- Verify the volume path is accessible +- Check workspace quotas +- Verify volume path exists -### Documents not indexed -- Ensure files are supported format (PDF, TXT, MD) -- Check file permissions in the volume -- Verify volume path is correct (trailing slash matters) +**Documents not indexed:** +- Check file format (PDF, TXT, MD, DOCX) +- Verify volume path (trailing slash matters) +- Check file permissions -### Poor answer quality -- Add instructions to guide the AI's behavior +**Poor answer quality:** - Ensure documents are well-structured -- Consider breaking large documents into smaller files +- Break large documents into smaller files +- Add clear headings and sections diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index 75ccae4f..bbb296d0 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -1,394 +1,92 @@ -# Supervisor Agents (MAS) +# Supervisor Agents - Details -Supervisor Agents orchestrate multiple specialized agents, routing user queries to the most appropriate agent based on the query content. - -## What is a Supervisor Agent? - -A Supervisor Agent (formerly Multi-Agent Supervisor, MAS) acts as a traffic controller for multiple AI agents, routing user queries to the most appropriate agent. It supports five types of agents: - -1. **Knowledge Assistants (KA)**: Document-based Q&A from PDFs/files in Volumes -2. **Genie Spaces**: Natural language to SQL for data exploration -3. **Model Serving Endpoints**: Custom LLM agents, fine-tuned models, RAG applications -4. **Unity Catalog Functions**: Callable UC functions for data operations -5. **External MCP Servers**: JSON-RPC endpoints via UC HTTP Connections for external system integration - -When a user asks a question: -1. **Analyzes** the query to understand the intent -2. **Routes** to the most appropriate specialized agent -3. **Returns** the agent's response to the user - -This allows you to combine multiple specialized agents into a single unified interface. - -## When to Use - -Use a Supervisor Agent when: -- You have multiple specialized agents (billing, technical support, HR, etc.) 
-- Users shouldn't need to know which agent to ask -- You want to provide a unified conversational experience - -## Prerequisites - -Before creating a Supervisor Agent, you need agents of one or both types: - -**Model Serving Endpoints** (`endpoint_name`): -- Knowledge Assistant (KA) endpoints (e.g., `ka-abc123-endpoint`) -- Custom agents built with LangChain, LlamaIndex, etc. -- Fine-tuned models -- RAG applications - -**Genie Spaces** (`genie_space_id`): -- Existing Genie spaces for SQL-based data exploration -- Great for analytics, metrics, and data-driven questions -- No separate endpoint deployment required - reference the space directly -- To find a Genie space, use `databricks genie list-spaces` -- **Note**: There is NO system table for Genie spaces - do not try to query `system.ai.genie_spaces` +For commands, see [SKILL.md](SKILL.md). ## Unity Catalog Functions -Unity Catalog Functions allow Supervisor Agents to call registered UC functions for data operations. - -### Prerequisites +Call registered UC functions from the Supervisor Agent. -- UC Function already exists (use SQL `CREATE FUNCTION` or Python UDF) -- Agent service principal has `EXECUTE` privilege: - ```sql - GRANT EXECUTE ON FUNCTION catalog.schema.function_name TO ``; - ``` - -### Configuration +**Prerequisites:** +- UC Function exists (`CREATE FUNCTION` or Python UDF) +- Grant execute: `GRANT EXECUTE ON FUNCTION catalog.schema.func TO \`\`;` +**Config:** ```json -{ - "name": "data_enrichment", - "uc_function_name": "sales_analytics.utils.enrich_customer_data", - "description": "Enriches customer records with demographic and purchase history data" -} +{"name": "enricher", "uc_function_name": "catalog.schema.enrich_data", "description": "Enriches customer records"} ``` -**Field**: `uc_function_name` - Fully-qualified function name in format `catalog.schema.function_name` - ## External MCP Servers -External MCP Servers enable Supervisor Agents to interact with external systems (ERP, CRM, etc.) via UC HTTP Connections. The MCP server implements a JSON-RPC 2.0 endpoint that exposes tools for the Supervisor Agent to call. - -### Prerequisites - -**1. MCP Server Endpoint**: Your external system must provide a JSON-RPC 2.0 endpoint (e.g., `/api/mcp`) that implements the MCP protocol: - -```python -# Example MCP server tool definition -TOOLS = [ - { - "name": "approve_invoice", - "description": "Approve a specific invoice", - "inputSchema": { - "type": "object", - "properties": { - "invoice_number": {"type": "string", "description": "Invoice number to approve"}, - "approver": {"type": "string", "description": "Name/email of approver"}, - }, - "required": ["invoice_number"], - }, - }, -] - -# JSON-RPC methods: initialize, tools/list, tools/call -``` - -**2. UC HTTP Connection**: Create a Unity Catalog HTTP Connection that points to your MCP endpoint: +Connect to external systems (ERP, CRM) via UC HTTP Connection implementing MCP protocol. +**1. 
Create UC HTTP Connection:** ```sql -CREATE CONNECTION my_mcp_connection TYPE HTTP +CREATE CONNECTION my_mcp TYPE HTTP OPTIONS ( - host 'https://my-app.databricksapps.com', -- Your MCP server URL + host 'https://my-app.databricksapps.com', port '443', - base_path '/api/mcp', -- Path to JSON-RPC endpoint - client_id '', -- OAuth M2M credentials - client_secret '', + base_path '/api/mcp', + client_id '', + client_secret '', oauth_scope 'all-apis', token_endpoint 'https://.azuredatabricks.net/oidc/v1/token', - is_mcp_connection 'true' -- REQUIRED: Identifies as MCP connection + is_mcp_connection 'true' ); ``` -**3. Grant Permissions**: Agent service principal needs access to the connection: - +**2. Grant access:** ```sql -GRANT USE CONNECTION ON my_mcp_connection TO ``; +GRANT USE CONNECTION ON my_mcp TO ``; ``` -### Configuration - -Reference the UC Connection using the `connection_name` field: - -```python -{ - "name": "external_operations", - "connection_name": "my_mcp_connection", - "description": "Execute external system operations: approve invoices, create records, trigger workflows" -} -``` - -**Field**: `connection_name` - the name of the Unity Catalog HTTP Connection configured as an MCP server - -**Important**: Make the description comprehensive - it guides the Supervisor Agent's routing decisions for when to call this agent. - -### Complete Example: Multi-System Supervisor - -Example showing integration of Genie, KA, and external MCP using `scripts/mas_manager.py`: - -```bash -python scripts/mas_manager.py create_mas "AP_Invoice_Supervisor" '{ - "description": "AP automation assistant with analytics, policy guidance, and operational actions", - "instructions": "Route queries as follows:\n- Data questions (invoice counts, spend analysis, vendor metrics) β†’ billing_analyst\n- Policy questions (thresholds, SLAs, compliance rules) β†’ policy_expert\n- Actions (approve, reject, flag, search, workflows) β†’ ap_operations\n\nWhen a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations.", - "agents": [ - { - "name": "billing_analyst", - "genie_space_id": "01abc123...", - "description": "SQL analytics on AP invoice data: spending trends, vendor analysis, aging reports" - }, - { - "name": "policy_expert", - "ka_tile_id": "f32c5f73...", - "description": "Answers questions about AP policies, approval workflows, and compliance requirements from policy documents" - }, - { - "name": "ap_operations", - "connection_name": "ap_invoice_mcp", - "description": "Execute AP operations: approve/reject/flag invoices, search invoice details, get vendor summaries, trigger batch workflows. Use for ANY action or write operation." - } - ] -}' +**3. 
Config:** +```json +{"name": "operations", "connection_name": "my_mcp", "description": "Execute operations: approve invoices, trigger workflows"} ``` -### MCP Connection Testing - -Verify your connection before adding to MAS: - +**Test connection:** ```sql --- Test tools/list method -SELECT http_request( - conn => 'my_mcp_connection', - method => 'POST', - path => '', - json => '{"jsonrpc":"2.0","method":"tools/list","id":1}' -); +SELECT http_request(conn => 'my_mcp', method => 'POST', path => '', json => '{"jsonrpc":"2.0","method":"tools/list","id":1}'); ``` -### Resources - -- **MCP Protocol Spec**: [Model Context Protocol](https://modelcontextprotocol.io) - -## Creating a Supervisor Agent - -**NO CLI AVAILABLE** - Use the `scripts/mas_manager.py` script in this skill folder: - -```bash -python scripts/mas_manager.py create_mas "Customer Support MAS" '{ - "description": "Routes customer queries to specialized support agents", - "instructions": "Analyze the user'\''s question and route to the most appropriate agent. If unclear, ask for clarification.", - "agents": [ - { - "name": "policy_agent", - "ka_tile_id": "f32c5f73-466b-4798-b3a0-5396b5ece2a5", - "description": "Answers questions about company policies and procedures from indexed documents" - }, - { - "name": "usage_analytics", - "genie_space_id": "01abc123-def4-5678-90ab-cdef12345678", - "description": "Answers data questions about usage metrics, trends, and statistics" - }, - { - "name": "custom_agent", - "endpoint_name": "my-custom-endpoint", - "description": "Handles specialized queries via custom model endpoint" - } - ] -}' -``` - -This example shows mixing Knowledge Assistants (policy_agent), Genie spaces (usage_analytics), and custom endpoints (custom_agent). - -## Agent Configuration - -Each agent in the `agents` list needs: - -| Field | Required | Description | -|-------|----------|-------------| -| `name` | Yes | Internal identifier for the agent | -| `description` | Yes | What this agent handles (critical for routing) | -| `ka_tile_id` | One of these | Knowledge Assistant tile ID (for document Q&A agents) | -| `genie_space_id` | One of these | Genie space ID (for SQL-based data agents) | -| `endpoint_name` | One of these | Model serving endpoint name (for custom agents) | -| `uc_function_name` | One of these | Unity Catalog function name in format `catalog.schema.function_name` | -| `connection_name` | One of these | Unity Catalog connection name (for external MCP servers) | - -**Note**: Provide exactly one of: `ka_tile_id`, `genie_space_id`, `endpoint_name`, `uc_function_name`, or `connection_name`. - -To find a KA tile_id, use `databricks knowledge-assistants list-knowledge-assistants`. -To find a Genie space_id, use `databricks genie list-spaces`. - -### Writing Good Descriptions +## Writing Good Descriptions -The `description` field is critical for routing. Make it specific: +The `description` field drives routing. 
Be specific: -**Good descriptions:** -- "Handles billing questions including invoices, payments, refunds, and subscription changes" -- "Answers technical questions about API errors, integration issues, and product bugs" -- "Provides information about HR policies, PTO, benefits, and employee handbook" +| Good | Bad | +|------|-----| +| "Handles billing: invoices, payments, refunds, subscriptions" | "Billing agent" | +| "Answers API errors, integration issues, product bugs" | "Technical" | +| "HR policies, PTO, benefits, employee handbook" | "Handles stuff" | -**Bad descriptions:** -- "Billing agent" (too vague) -- "Handles stuff" (not helpful) -- "Technical" (not specific) +## Adding Examples -## Provisioning Timeline - -After creation, the Supervisor Agent endpoint needs to provision: - -| Status | Meaning | Duration | -|--------|---------|----------| -| `NOT_READY` | Creating the supervisor | 2-5 minutes | -| `ONLINE` | Ready to route queries | - | -| `OFFLINE` | Not currently running | - | - -Use `python scripts/mas_manager.py get_mas TILE_ID` to check the status. - -## Adding Example Questions - -Example questions help with evaluation and can guide routing optimization. - -### Using the CLI +Examples help evaluation and routing optimization. MAS must be ONLINE. ```bash -# Add examples (MAS must be ONLINE) python scripts/mas_manager.py add_examples TILE_ID '[ - {"question": "I haven'\''t received my invoice for this month", "guideline": "Should be routed to billing_agent"}, - {"question": "The API is returning a 500 error", "guideline": "Should be routed to technical_agent"}, - {"question": "How many vacation days do I have?", "guideline": "Should be routed to hr_agent"} + {"question": "I need my invoice for March", "guideline": "Route to billing_agent"}, + {"question": "API returns 500 error", "guideline": "Route to tech_agent"} ]' -# List existing examples python scripts/mas_manager.py list_examples TILE_ID ``` -### Adding Examples in Jobs - -When creating a MAS in a Databricks job and immediately adding examples, use `add_examples_wait` to wait for the endpoint to become ONLINE: - +**In automated jobs** (waits for ONLINE): ```bash -# Waits up to 10 minutes for ONLINE, then adds examples -python scripts/mas_manager.py add_examples_wait TILE_ID '[ - {"question": "...", "guideline": "..."} -]' -``` - -This is useful in automated pipelines where you create a MAS and need to add examples in the same job without manual intervention. - -## Best Practices - -### Agent Design - -1. **Specialized agents**: Each agent should have a clear, distinct purpose -2. **Non-overlapping domains**: Avoid agents with similar descriptions -3. **Clear boundaries**: Define what each agent does and doesn't handle - -### Instructions - -Provide routing instructions: - -``` -You are a customer support supervisor. Your job is to route user queries to the right specialist: - -1. For billing, payments, or subscription questions β†’ billing_agent -2. For technical issues, bugs, or API problems β†’ technical_agent -3. For HR, benefits, or policy questions β†’ hr_agent - -If the query is unclear or spans multiple domains, ask the user to clarify. 
+python scripts/mas_manager.py add_examples_wait TILE_ID '[...]' ``` -### Fallback Handling - -Consider adding a general-purpose agent for queries that don't fit elsewhere: - -```json -{ - "name": "general_agent", - "endpoint_name": "general-support-endpoint", - "description": "Handles general inquiries that don't fit other categories, provides navigation help" -} -``` - -## Example Workflow - -1. **Deploy specialized agents** as model serving endpoints: - - `billing-assistant-endpoint` - - `tech-support-endpoint` - - `hr-assistant-endpoint` - -2. **Create the MAS**: - - Configure agents with clear descriptions - - Add routing instructions - -3. **Wait for ONLINE status** (2-5 minutes) - -4. **Add example questions** for evaluation - -5. **Test routing** with various query types - -## Updating a Supervisor Agent - -To update an existing Supervisor Agent, use `scripts/mas_manager.py`: - -```bash -# Get current state -python scripts/mas_manager.py get_mas TILE_ID - -# Update with new configuration -python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", "agents": [...], "instructions": "..."}' -``` - -1. **Add/remove agents**: Include updated `agents` list -2. **Update descriptions**: Change agent descriptions to improve routing -3. **Modify instructions**: Update routing rules - ## Troubleshooting -### Queries routed to wrong agent - -- Review and improve agent descriptions -- Make descriptions more specific and distinct -- Add examples that demonstrate correct routing - -### Endpoint not responding - -- Verify each underlying model serving endpoint is running -- Check endpoint logs for errors -- Ensure endpoints accept the expected input format - -### Slow responses +**Wrong routing:** +- Improve agent descriptions (more specific, less overlap) +- Add examples demonstrating correct routing -- Check latency of underlying endpoints -- Consider endpoint scaling settings -- Monitor for cold start issues - -## Advanced: Hierarchical Routing - -For complex scenarios, you can create multiple levels of Supervisor Agents: - -``` -Top-level Supervisor -β”œβ”€β”€ Customer Support Supervisor -β”‚ β”œβ”€β”€ billing_agent -β”‚ β”œβ”€β”€ technical_agent -β”‚ └── general_agent -β”œβ”€β”€ Sales Supervisor -β”‚ β”œβ”€β”€ pricing_agent -β”‚ β”œβ”€β”€ demo_agent -β”‚ └── contract_agent -└── Internal Supervisor - β”œβ”€β”€ hr_agent - └── it_helpdesk_agent -``` +**Endpoint not responding:** +- Verify underlying endpoints are running +- Check endpoint logs -Each sub-supervisor is deployed as an endpoint and configured as an agent in the top-level supervisor. +**Slow responses:** +- Check underlying endpoint latency +- Review endpoint scaling settings diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 84a61325..ce261746 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -1,219 +1,95 @@ --- name: databricks-agent-bricks -description: "Create and manage Databricks Agent Bricks: Knowledge Assistants (KA) for document Q&A, Genie Spaces for SQL exploration, and Supervisor Agents for multi-agent orchestration." +description: "Create Knowledge Assistants (KA) for document Q&A and Supervisor Agents for multi-agent orchestration. For Genie Spaces, see databricks-genie skill." --- # Agent Bricks -Create and manage Databricks Agent Bricks - pre-built AI components for conversational applications. 
+Agent Bricks are pre-built AI tiles in Databricks that provide conversational interfaces. This skill covers **Knowledge Assistants** and **Supervisor Agents**. For Genie Spaces, use the `databricks-genie` skill. -## Overview +| Brick | Purpose | This Skill | +|-------|---------|------------| +| **Knowledge Assistant (KA)** | Document Q&A using RAG on PDFs/text in Volumes | βœ“ | +| **Supervisor Agent** | Orchestrates multiple agents (KA, Genie, endpoints, UC functions, MCP) | βœ“ | +| **Genie Space** | Natural language to SQL on Unity Catalog tables | `databricks-genie` | -| Brick | Purpose | Data Source | -|-------|---------|-------------| -| **Knowledge Assistant (KA)** | Document-based Q&A using RAG | PDF/text files in Volumes | -| **Genie Space** | Natural language to SQL | Unity Catalog tables | -| **Supervisor Agent** | Multi-agent orchestration | KA, Genie, endpoints, UC functions, MCP | - -## Quick Reference +--- -### Knowledge Assistant +## Knowledge Assistant ```bash -# List volumes in a schema +# Find volumes databricks volumes list CATALOG SCHEMA - -# Browse volume contents (via SQL) databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" # Create KA databricks knowledge-assistants create-knowledge-assistant "Name" "Description" -# Add knowledge source (files from volume) +# Add knowledge source databricks knowledge-assistants create-knowledge-source "knowledge-assistants/{ka_id}" \ - --json '{"display_name": "Source Name", "description": "...", "source_type": "files", "files": {"path": "/Volumes/catalog/schema/volume/"}}' + --json '{"display_name": "Docs", "description": "...", "source_type": "files", "files": {"path": "/Volumes/catalog/schema/volume/"}}' -# Sync (index) documents +# Sync and check status databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" - -# Check status (wait for ONLINE) databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -# List all KAs +# List/manage databricks knowledge-assistants list-knowledge-assistants +databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" ``` -### Genie Space +**Source types:** `files` (Volume path) or `index` (Vector Search: `index.index_name`, `index.text_col`, `index.doc_uri_col`) -**See `databricks-genie` skill for comprehensive guidance.** +**Status:** `CREATING` (2-5 min) β†’ `ONLINE` β†’ `OFFLINE` -```bash -databricks genie list-spaces -databricks genie create-space --json '{"display_name": "Name", "description": "...", "table_identifiers": ["catalog.schema.table"]}' -databricks genie get-space SPACE_ID -``` +--- -### Supervisor Agent +## Supervisor Agent **No CLI** - use `scripts/mas_manager.py` (run from skill folder): ```bash -# List existing MAS -python scripts/mas_manager.py list_mas - -# Create MAS with KA and Genie agents +# Create MAS python scripts/mas_manager.py create_mas "My Supervisor" '{ "description": "Routes queries to specialized agents", - "instructions": "Route data questions to analyst. 
Route document questions to docs_agent.", + "instructions": "Route data questions to analyst, document questions to docs_agent.", "agents": [ - {"name": "analyst", "genie_space_id": "01abc...", "description": "SQL analytics on data"}, + {"name": "analyst", "genie_space_id": "01abc...", "description": "SQL analytics"}, {"name": "docs_agent", "ka_tile_id": "dab408a2-...", "description": "Answers from documents"} ] }' -# Check status (wait for ONLINE) +# Check status and manage python scripts/mas_manager.py get_mas TILE_ID +python scripts/mas_manager.py list_mas +python scripts/mas_manager.py update_mas TILE_ID '{"agents": [...]}' +python scripts/mas_manager.py delete_mas TILE_ID -# Find IDs -databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' -databricks genie list-spaces --output json | jq '.[].space_id' -``` - ---- - -## Knowledge Assistant Workflow - -### Step 1: Find Your Volume - -```bash -# List volumes in the schema -databricks volumes list skywest_airlines ops_control --output json - -# Browse volume contents -databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" -``` - -### Step 2: Create the KA - -```bash -databricks knowledge-assistants create-knowledge-assistant \ - "Engineering Docs Assistant" \ - "Answers questions about engineering documents and procedures" -``` - -Response includes `id` (e.g., `dab408a2-f8f4-439e-b65d-cc3cc2c45bbd`). - -### Step 3: Add Knowledge Source - -```bash -databricks knowledge-assistants create-knowledge-source \ - "knowledge-assistants/{ka_id}" \ - --json '{ - "display_name": "Engineering Documents", - "description": "PDFs from engineering docs folder", - "source_type": "files", - "files": {"path": "/Volumes/catalog/schema/volume/docs/"} - }' -``` - -**Source types:** -- `files` - PDFs/text from a Volume path (`files.path`) -- `index` - Existing Vector Search index (`index.index_name`, `index.text_col`, `index.doc_uri_col`) - -### Step 4: Sync and Wait - -```bash -# Trigger indexing -databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" - -# Check status (wait for state: ONLINE) -databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" -``` - -| State | Meaning | -|-------|---------| -| `CREATING` | Provisioning endpoint (2-5 min) | -| `ONLINE` | Ready to use | -| `OFFLINE` | Not running | - ---- - -## Supervisor Agent Workflow - -Supervisor Agents orchestrate multiple agents (KA, Genie, endpoints, UC functions, MCP). 
- -### Prerequisites - -```bash -pip install databricks-sdk requests -``` - -### Create a Supervisor Agent - -```bash -python scripts/mas_manager.py create_mas "Support Supervisor" '{ - "description": "Routes queries to specialized agents", - "instructions": "Route billing questions to billing_agent, technical questions to docs_agent", - "agents": [ - { - "name": "billing_agent", - "genie_space_id": "01abc123...", - "description": "SQL analytics on billing data" - }, - { - "name": "docs_agent", - "ka_tile_id": "dab408a2-f8f4-439e-b65d-cc3cc2c45bbd", - "description": "Answers questions from technical documentation" - } - ] -}' -``` - -### Agent Types - -Each agent needs exactly ONE of: - -| Field | Agent Type | -|-------|------------| -| `ka_tile_id` | Knowledge Assistant (document Q&A) | -| `genie_space_id` | Genie Space (SQL analytics) | -| `endpoint_name` | Model serving endpoint (custom agent) | -| `uc_function_name` | Unity Catalog function (`catalog.schema.function`) | -| `connection_name` | UC HTTP Connection (MCP server) | - -### Find IDs +# Add examples (requires ONLINE) +python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' -```bash -# KA tile ID +# Find IDs databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' - -# Genie space ID databricks genie list-spaces --output json | jq '.[].space_id' ``` -### Manage Supervisor Agents +**Agent types** (use exactly ONE per agent): -```bash -python scripts/mas_manager.py get_mas TILE_ID -python scripts/mas_manager.py update_mas TILE_ID '{"name": "New Name", ...}' -python scripts/mas_manager.py delete_mas TILE_ID +| Field | Type | +|-------|------| +| `ka_tile_id` | Knowledge Assistant | +| `genie_space_id` | Genie Space | +| `endpoint_name` | Model serving endpoint | +| `uc_function_name` | UC function (`catalog.schema.func`) | +| `connection_name` | MCP server (UC HTTP Connection) | -# Add examples (requires ONLINE status) -python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' -``` +**Status:** `NOT_READY` (2-5 min) β†’ `ONLINE` β†’ `OFFLINE` --- -## Reference Files +## Reference | Topic | File | |-------|------| -| KA details, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | -| Supervisor Agent details, MCP, UC functions | [2-supervisor-agents.md](2-supervisor-agents.md) | -| Genie Spaces | See `databricks-genie` skill | - -## Related Skills - -- **databricks-genie** - Genie Space creation, curation, Conversation API -- **databricks-unstructured-pdf-generation** - Generate synthetic PDFs for KA -- **databricks-model-serving** - Deploy custom agent endpoints +| KA source types, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | +| UC functions, MCP servers, examples | [2-supervisor-agents.md](2-supervisor-agents.md) | From 085b419a0294d229ab1c96200c60ed162018f09f Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 18:00:35 +0200 Subject: [PATCH 33/66] Fix SDP skill: CLI syntax and library type issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tested by creating skywest_ops_analytics pipeline. Found and fixed: 1. library type: Use `file` not `notebook` for raw SQL files - {"notebook": ...} β†’ {"file": ...} - Fixes "Only SQL, Scala and Python notebooks are supported" 2. 
CLI commands: - `list` β†’ `list-pipelines` - `--pipeline-id ID` β†’ positional `PIPELINE_ID` for all commands - `workspace ls` β†’ `workspace list` 3. Validation: Use `discover-schema` instead of manual SQL - Returns schema, row counts, sample data, null counts in one call - Much better than running COUNT(*) queries 4. Added troubleshooting entry for file vs notebook error πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../SKILL.md | 47 ++- .../references/1-project-initialization.md | 4 +- .../references/2-cli-approach.md | 69 +++-- .../references/3-advanced-configuration.md | 289 ++++++++---------- 4 files changed, 187 insertions(+), 222 deletions(-) diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 3fe77e2a..0efeb017 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -39,10 +39,10 @@ description: "Creates, configures, and updates Databricks Lakeflow Spark Declara - When the user provides table schema and asks for code, respond directly with the code. Don't ask clarifying questions if the request is clear. ## Tools -- List files in volume: `databricks fs ls dbfs:/Volumes/{catalog}/{schema}/{volume}/{path} --profile {PROFILE}` -- Query data: `databricks experimental aitools tools query --profile {PROFILE} --warehouse abc123 "SELECT 1 FROM catalog.schema.table"` -- Discover schema: `databricks experimental aitools tools discover-schema --profile {PROFILE} catalog.schema.table1 catalog.schema.table2` -- Pipelines CLI: `databricks pipelines init|deploy|run|logs|stop` or use `databricks pipelines --help` for more options +- List files in volume: `databricks fs ls /Volumes/{catalog}/{schema}/{volume}/{path}` +- Query data: `databricks experimental aitools tools query --warehouse abc123 "SELECT 1 FROM catalog.schema.table"` +- Discover schema: `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` +- Pipelines CLI: `databricks pipelines create|get|delete|start-update|list-pipelines` or use `databricks pipelines --help` for more options ## Choose Your Workflow @@ -83,7 +83,7 @@ Use this when the pipeline is **part of an existing DAB project**: β†’ See [1-project-initialization.md](references/1-project-initialization.md) for adding pipelines to existing bundles -### Option C: Rapid Iteration with CLI (no bundle management) +### Option C: Rapid Iteration with CLI (no bundle management, or you'll create the DAB at the end) Use this when you need to **quickly create, test, and iterate** on a pipeline without managing bundle files: - User wants to "just run a pipeline and see if it works" @@ -282,37 +282,31 @@ After running a pipeline (via DAB or CLI), you **MUST** validate both the execut ### Step 1: Check Pipeline Execution Status ```bash -# Get pipeline status and details -databricks pipelines get --pipeline-id +# Get pipeline status and details (pipeline_id is positional) +databricks pipelines get # Get recent events/logs -databricks pipelines list-pipeline-events --pipeline-id +databricks pipelines list-pipeline-events ``` **From DAB (`databricks bundle run`):** - Check the command output for success/failure -- Use `databricks pipelines get --pipeline-id ...` to get detailed status and recent events +- Use `databricks pipelines get ` to get detailed status and recent events ### Step 2: Validate Output 
Data Even if the pipeline reports SUCCESS, you **MUST** verify the data is correct: ```bash -# Check table schema and stats -databricks sql execute --warehouse-id WAREHOUSE_ID --query " -DESCRIBE TABLE EXTENDED my_catalog.my_schema.bronze_orders; -" - -# Check row counts -databricks sql execute --warehouse-id WAREHOUSE_ID --query " -SELECT 'bronze_orders' as table_name, COUNT(*) as row_count FROM my_catalog.my_schema.bronze_orders -UNION ALL -SELECT 'silver_orders', COUNT(*) FROM my_catalog.my_schema.silver_orders -UNION ALL -SELECT 'gold_summary', COUNT(*) FROM my_catalog.my_schema.gold_summary; -" +# Check schema, row counts, sample data, and null counts for all tables +databricks experimental aitools tools discover-schema \ + my_catalog.my_schema.bronze_orders \ + my_catalog.my_schema.silver_orders \ + my_catalog.my_schema.gold_summary ``` +This returns per table: columns/types, 5 sample rows, total_rows count, and null counts per column. + **Check for:** - Empty tables (row_count = 0) - indicates ingestion or filtering issues - Unexpected row counts - joins may have exploded or filtered too much @@ -334,7 +328,7 @@ If validation reveals problems, trace upstream to find the root cause: 5. **Fix the SQL/Python code**, re-upload, and re-run the pipeline -**Do NOT use `execute_sql` with COUNT queries for validation** - `get_table_stats_and_schema` is faster and returns more information in a single call. +**Use `discover-schema` for validation** - it returns schema, row counts, sample data, and null counts in a single call. --- @@ -342,17 +336,18 @@ If validation reveals problems, trace upstream to find the root cause: | Issue | Solution | |-------|----------| -| **Empty output tables** | Use `get_table_stats_and_schema` to check upstream sources. Verify source files exist and paths are correct. | +| **"Only SQL, Scala and Python notebooks are supported"** | Use `{"file": {"path": "..."}}` instead of `{"notebook": {"path": "..."}}` for raw SQL files. `notebook` is for Databricks notebook format only. | +| **Empty output tables** | Use `discover-schema` to check upstream tables. Verify source files exist and paths are correct. | | **Pipeline stuck INITIALIZING** | Normal for serverless, wait a few minutes | | **"Column not found"** | Check `schemaHints` match actual data | | **Streaming reads fail** | For file ingestion in a streaming table, you must use the `STREAM` keyword with `read_files`: `FROM STREAM read_files(...)`. For table streams use `FROM stream(table)`. See [read_files β€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | -| **Timeout during run** | Increase `timeout`, or use `wait_for_completion=False` and check status with `manage_pipeline(action="get")` | +| **Timeout during run** | Use `databricks pipelines get ` to check status | | **MV doesn't refresh** | Enable row tracking on source tables | | **SCD2: query column not found** | Lakeflow uses `__START_AT` and `__END_AT` (double underscore), not `START_AT`/`END_AT`. Use `WHERE __END_AT IS NULL` for current rows. See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). | | **AUTO CDC parse error at APPLY/SEQUENCE** | Put `APPLY AS DELETE WHEN` **before** `SEQUENCE BY`. Only list columns in `COLUMNS * EXCEPT (...)` that exist in the source (omit `_rescued_data` unless bronze uses rescue data). Omit `TRACK HISTORY ON *` if it causes "end of input" errors; default is equivalent. 
See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). | | **"Cannot create streaming table from batch query"** | In a streaming table query, use `FROM STREAM read_files(...)` so `read_files` leverages Auto Loader; `FROM read_files(...)` alone is batch. See [sql/2-ingestion.md](references/sql/2-ingestion.md) and [read_files β€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | -**For detailed errors**, the `result["message"]` from `manage_pipeline(action="create_or_update")` includes suggested next steps. Use `manage_pipeline(action="get", pipeline_id=...)` which includes recent events and error details. +**For detailed errors**, use `databricks pipelines get ` which includes recent events, or `databricks pipelines list-pipeline-events ` for full event history. --- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md index c5fdc00a..40cc8d4a 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md @@ -232,8 +232,8 @@ databricks bundle run customer_pipeline_etl # Run specific target databricks bundle run customer_pipeline_etl --target prod -# Or use Pipeline API directly -databricks pipelines start-update --pipeline-id +# Or use Pipeline API directly (pipeline_id is positional) +databricks pipelines start-update ``` --- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md index 848cd2ea..4fdd27df 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md @@ -21,18 +21,25 @@ databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/ ```bash # Create pipeline with JSON config +# Use "file" - can point to a single .sql/.py file OR a directory (includes all files) databricks pipelines create --json '{ "name": "my_orders_pipeline", "catalog": "my_catalog", "schema": "my_schema", "serverless": true, "libraries": [ - {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql"}}, - {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/silver/clean_orders.sql"}}, - {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/gold/daily_summary.sql"}} + {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}} ], "development": true }' + +# Or specify individual files: +# "libraries": [ +# {"file": {"path": "/Workspace/.../bronze/ingest_orders.sql"}}, +# {"file": {"path": "/Workspace/.../silver/clean_orders.sql"}} +# ] +# +# Legacy (avoid): {"notebook": {"path": "..."}} - use "file" instead ``` Save the returned `pipeline_id` for subsequent operations. @@ -40,11 +47,11 @@ Save the returned `pipeline_id` for subsequent operations. 
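+For scripted runs, one way to capture that ID for the commands below (a sketch: it assumes `jq` is installed and that the create response is JSON with a `pipeline_id` field):
+
+```bash
+# Hypothetical helper: store the new pipeline ID in a shell variable, re-using the JSON config from the create step
+PIPELINE_ID=$(databricks pipelines create --json '{...}' | jq -r '.pipeline_id')
+echo "Created pipeline: $PIPELINE_ID"
+```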
### Step 4: Run Pipeline ```bash -# Start a full refresh run -databricks pipelines start-update --pipeline-id --full-refresh +# Start a full refresh run (pipeline_id is a positional argument) +databricks pipelines start-update --full-refresh # Check run status -databricks pipelines get --pipeline-id +databricks pipelines get ``` ### Step 5: Validate Results @@ -52,16 +59,15 @@ databricks pipelines get --pipeline-id **On Success** - Verify tables were created with correct data: ```bash -# Check table schemas and row counts -databricks sql execute --warehouse-id WAREHOUSE_ID --query " -DESCRIBE TABLE EXTENDED my_catalog.my_schema.bronze_orders; -" - -databricks sql execute --warehouse-id WAREHOUSE_ID --query " -SELECT COUNT(*) as row_count FROM my_catalog.my_schema.bronze_orders; -" +# Check schema, row counts, sample data, and null counts for all tables +databricks experimental aitools tools discover-schema \ + my_catalog.my_schema.bronze_orders \ + my_catalog.my_schema.silver_orders \ + my_catalog.my_schema.gold_summary ``` +This returns per table: columns/types, 5 sample rows, total_rows count, and null counts. + Or use Python for detailed stats: ```python from databricks.sdk import WorkspaceClient @@ -77,11 +83,11 @@ print(f"Created: {table.created_at}") **On Failure** - Get pipeline events and errors: ```bash -# Get pipeline details with recent events -databricks pipelines get --pipeline-id +# Get pipeline details with recent events (pipeline_id is positional) +databricks pipelines get # Get specific run events -databricks pipelines list-pipeline-events --pipeline-id +databricks pipelines list-pipeline-events ``` ### Step 6: Iterate Until Working @@ -89,7 +95,7 @@ databricks pipelines list-pipeline-events --pipeline-id 1. Review errors from pipeline status or events 2. Fix issues in local files 3. Re-upload: `databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/my_pipeline --overwrite` -4. Update and run: `databricks pipelines update --pipeline-id --json '...'` then `databricks pipelines start-update --pipeline-id ` +4. Update and run: `databricks pipelines update --json '...'` then `databricks pipelines start-update ` 5. 
Repeat until pipeline completes successfully --- @@ -101,28 +107,29 @@ databricks pipelines list-pipeline-events --pipeline-id | Command | Description | |---------|-------------| | `databricks pipelines create --json '{...}'` | Create new pipeline | -| `databricks pipelines get --pipeline-id ID` | Get pipeline details and status | -| `databricks pipelines update --pipeline-id ID --json '{...}'` | Update pipeline config | -| `databricks pipelines delete --pipeline-id ID` | Delete a pipeline | -| `databricks pipelines list` | List all pipelines | +| `databricks pipelines get PIPELINE_ID` | Get pipeline details and status | +| `databricks pipelines update PIPELINE_ID --json '{...}'` | Update pipeline config | +| `databricks pipelines delete PIPELINE_ID` | Delete a pipeline | +| `databricks pipelines list-pipelines` | List all pipelines | ### Run Management | Command | Description | |---------|-------------| -| `databricks pipelines start-update --pipeline-id ID` | Start pipeline update | -| `databricks pipelines start-update --pipeline-id ID --full-refresh` | Start with full refresh | -| `databricks pipelines stop --pipeline-id ID` | Stop running pipeline | -| `databricks pipelines list-pipeline-events --pipeline-id ID` | Get events/logs | -| `databricks pipelines list-updates --pipeline-id ID` | List recent runs | +| `databricks pipelines start-update PIPELINE_ID` | Start pipeline update | +| `databricks pipelines start-update PIPELINE_ID --full-refresh` | Start with full refresh | +| `databricks pipelines stop PIPELINE_ID` | Stop running pipeline | +| `databricks pipelines list-pipeline-events PIPELINE_ID` | Get events/logs | +| `databricks pipelines list-updates PIPELINE_ID` | List recent runs | ### Supporting Commands | Command | Description | |---------|-------------| | `databricks workspace import-dir` | Upload files/folders to workspace | -| `databricks workspace ls` | List workspace files | -| `databricks sql execute` | Run ad-hoc SQL to inspect data | +| `databricks workspace list` | List workspace files | +| `databricks experimental aitools tools discover-schema` | Get schema, row counts, sample data, null counts | +| `databricks experimental aitools tools query` | Run ad-hoc SQL queries | --- @@ -135,14 +142,14 @@ from databricks.sdk import WorkspaceClient w = WorkspaceClient() -# Create pipeline +# Create pipeline - use "file" to include all .sql/.py files in a directory pipeline = w.pipelines.create( name="my_orders_pipeline", catalog="my_catalog", schema="my_schema", serverless=True, libraries=[ - {"notebook": {"path": "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql"}} + {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}} ], development=True ) diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md index b637f469..6a349f78 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md @@ -1,13 +1,13 @@ -# Advanced Pipeline Configuration (`extra_settings`) +# Advanced Pipeline Configuration -By default, pipelines are created with **serverless compute and Unity Catalog**. Use the `extra_settings` parameter only for advanced use cases. +By default, pipelines are created with **serverless compute and Unity Catalog**. 
Use advanced configuration options only when needed. -**CRITICAL: Do NOT use `extra_settings` to set `serverless=false` unless the user explicitly requires:** +**CRITICAL: Do NOT set `serverless=false` unless the user explicitly requires:** - R language support - Spark RDD APIs - JAR libraries or Maven coordinates -## When to Use `extra_settings` +## When to Use Advanced Configuration - **Development mode**: Faster iteration with relaxed validation - **Continuous pipelines**: Real-time streaming instead of triggered runs @@ -16,7 +16,9 @@ By default, pipelines are created with **serverless compute and Unity Catalog**. - **Python dependencies**: Install pip packages for serverless pipelines - **Classic clusters** (rare): Only if user explicitly needs R, RDD APIs, or JARs -## `extra_settings` Parameter Reference +## Pipeline JSON Configuration Reference + +These fields can be passed to `databricks pipelines create --json '{...}'` or `databricks pipelines update --json '{...}'`. ### Top-Level Fields @@ -157,198 +159,159 @@ Install pip dependencies for serverless pipelines: ## Configuration Examples +All examples use `databricks pipelines create --json '{...}'`. For updates, use `databricks pipelines update --json '{...}'`. + ### Development Mode Pipeline -Use `manage_pipeline(action="create_or_update")` tool with: -- `name`: "my_dev_pipeline" -- `root_path`: "/Workspace/Users/user@example.com/my_pipeline" -- `catalog`: "dev_catalog" -- `schema`: "dev_schema" -- `workspace_file_paths`: [...] -- `start_run`: true -- `extra_settings`: -```json -{ - "development": true, - "tags": {"environment": "development", "owner": "data-team"} -} +```bash +databricks pipelines create --json '{ + "name": "my_dev_pipeline", + "catalog": "dev_catalog", + "schema": "dev_schema", + "serverless": true, + "development": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "tags": {"environment": "development", "owner": "data-team"} +}' ``` ### Non-Serverless with Dedicated Cluster -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "clusters": [{ - "label": "default", - "num_workers": 4, - "node_type_id": "i3.xlarge", - "custom_tags": {"cost_center": "analytics"} - }], - "photon": true, - "edition": "ADVANCED" -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": false, + "photon": true, + "edition": "ADVANCED", + "clusters": [{ + "label": "default", + "num_workers": 4, + "node_type_id": "i3.xlarge", + "custom_tags": {"cost_center": "analytics"} + }], + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}] +}' ``` ### Continuous Streaming Pipeline -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "continuous": true, - "configuration": { - "spark.sql.shuffle.partitions": "auto" - } -} -``` - -### Using Instance Pool - -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "clusters": [{ - "label": "default", - "instance_pool_id": "0727-104344-hauls13-pool-xyz", - "num_workers": 2, - "custom_tags": {"project": "analytics"} - }] -} -``` - -### Custom Event Log Location - -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "event_log": { - "catalog": "audit_catalog", - "schema": "pipeline_logs", - "name": "my_pipeline_events" - } -} +```bash +databricks 
pipelines create --json '{ + "name": "my_streaming_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "continuous": true, + "configuration": {"spark.sql.shuffle.partitions": "auto"}, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}] +}' ``` ### Pipeline with Email Notifications -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "notifications": [{ - "email_recipients": ["team@example.com", "oncall@example.com"], - "alerts": ["on-update-failure", "on-update-fatal-failure", "on-flow-failure"] - }] -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "notifications": [{ + "email_recipients": ["team@example.com", "oncall@example.com"], + "alerts": ["on-update-failure", "on-update-fatal-failure", "on-flow-failure"] + }] +}' ``` ### Production Pipeline with Autoscaling -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "development": false, - "photon": true, - "edition": "ADVANCED", - "clusters": [{ - "label": "default", - "autoscale": { - "min_workers": 2, - "max_workers": 8, - "mode": "ENHANCED" - }, - "node_type_id": "i3.xlarge", - "spark_conf": { - "spark.sql.adaptive.enabled": "true" - }, - "custom_tags": {"environment": "production"} - }], - "notifications": [{ - "email_recipients": ["data-team@example.com"], - "alerts": ["on-update-failure"] - }] -} +```bash +databricks pipelines create --json '{ + "name": "prod_pipeline", + "catalog": "prod_catalog", + "schema": "prod_schema", + "serverless": false, + "development": false, + "photon": true, + "edition": "ADVANCED", + "clusters": [{ + "label": "default", + "autoscale": {"min_workers": 2, "max_workers": 8, "mode": "ENHANCED"}, + "node_type_id": "i3.xlarge", + "spark_conf": {"spark.sql.adaptive.enabled": "true"}, + "custom_tags": {"environment": "production"} + }], + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "notifications": [{"email_recipients": ["data-team@example.com"], "alerts": ["on-update-failure"]}] +}' ``` -### Run as Service Principal +### Serverless with Python Dependencies -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "run_as": { - "service_principal_name": "00000000-0000-0000-0000-000000000000" - } -} +```bash +databricks pipelines create --json '{ + "name": "ml_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "environment": { + "dependencies": ["scikit-learn==1.3.0", "pandas>=2.0.0", "requests"] + } +}' ``` ### Continuous Pipeline with Restart Window -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "continuous": true, - "restart_window": { - "start_hour": 2, - "days_of_week": ["SATURDAY", "SUNDAY"], - "time_zone_id": "America/Los_Angeles" - } -} +```bash +databricks pipelines create --json '{ + "name": "realtime_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "continuous": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "restart_window": { + "start_hour": 2, + "days_of_week": ["SATURDAY", "SUNDAY"], + "time_zone_id": "America/Los_Angeles" 
+ } +}' ``` -### Serverless with Python Dependencies +### Custom Event Log Location -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": true, - "environment": { - "dependencies": [ - "scikit-learn==1.3.0", - "pandas>=2.0.0", - "requests" - ] - } -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "event_log": { + "catalog": "audit_catalog", + "schema": "pipeline_logs", + "name": "my_pipeline_events" + } +}' ``` -### Update Existing Pipeline by ID +### Update Existing Pipeline -If you have a pipeline ID from the Databricks UI, you can force an update by including `id` in `extra_settings`: -```json -{ - "id": "554f4497-4807-4182-bff0-ffac4bb4f0ce" -} -``` +```bash +# Update pipeline configuration +databricks pipelines update --json '{ + "name": "updated_pipeline_name", + "development": false, + "notifications": [{"email_recipients": ["team@example.com"], "alerts": ["on-update-failure"]}] +}' -### Full JSON Export from Databricks UI - -You can copy pipeline settings from the Databricks UI (Pipeline Settings > JSON) and pass them directly as `extra_settings`. Invalid fields like `pipeline_type` are automatically filtered: - -```json -{ - "id": "554f4497-4807-4182-bff0-ffac4bb4f0ce", - "pipeline_type": "WORKSPACE", - "continuous": false, - "development": true, - "photon": false, - "edition": "ADVANCED", - "channel": "CURRENT", - "clusters": [{ - "label": "default", - "num_workers": 1, - "instance_pool_id": "0727-104344-pool-xyz" - }], - "configuration": { - "catalog": "main", - "schema": "my_schema" - } -} +# Then run it +databricks pipelines start-update --full-refresh ``` -**Note**: Explicit tool parameters (`name`, `root_path`, `catalog`, `schema`, `workspace_file_paths`) always take precedence over values in `extra_settings`. 
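+A minimal sketch of a safer update flow when you only want to change a few fields (assumes `jq` is installed and that `databricks pipelines get` returns the current settings under a `spec` key; verify the field name for your CLI version):
+
+```bash
+# Dump the current spec, edit it locally, then apply it back and re-run
+databricks pipelines get PIPELINE_ID | jq '.spec' > pipeline_spec.json
+# ...edit pipeline_spec.json...
+databricks pipelines update PIPELINE_ID --json "$(cat pipeline_spec.json)"
+databricks pipelines start-update PIPELINE_ID --full-refresh
+```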
- --- ## Multi-Schema Patterns From 5032660710dda3f131c210d72d168a28301a9f60 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Tue, 14 Apr 2026 18:37:49 +0200 Subject: [PATCH 34/66] Refactor PDF generator and fix CLI commands across skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PDF Generator: - Rewrite to separate conversion from upload (local HTMLβ†’PDF, then databricks fs cp) - Add parallel folder conversion with ThreadPool (4 workers) - Smart skip: only reconvert if HTML newer than PDF - Remove --json flag, simplify CLI - Add pdf_eval_questions.json format for KA testing - Update tests for new API CLI Fixes (across all skills): - databricks sql execute β†’ databricks experimental aitools tools query - workspace ls β†’ workspace list - All positional pipeline args (not --pipeline-id) Knowledge Assistants: - Add evaluation questions section referencing pdf_eval_questions.json πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- databricks-skills/README.md | 2 +- .../1-knowledge-assistants.md | 15 + .../databricks-agent-bricks/SKILL.md | 4 +- databricks-skills/databricks-config/SKILL.md | 8 +- databricks-skills/databricks-dbsql/SKILL.md | 2 +- .../databricks-metric-views/SKILL.md | 2 +- .../databricks-model-serving/SKILL.md | 2 +- .../databricks-python-sdk/SKILL.md | 2 +- .../databricks-synthetic-data-gen/SKILL.md | 4 +- .../references/2-troubleshooting.md | 14 +- .../7-data-profiling.md | 2 +- .../databricks-unity-catalog/SKILL.md | 4 +- .../SKILL.md | 229 ++++------- .../scripts/pdf_generator.py | 376 +++++++++--------- .../end-to-end-rag.md | 4 +- 15 files changed, 298 insertions(+), 372 deletions(-) diff --git a/databricks-skills/README.md b/databricks-skills/README.md index ea3f3102..95e9a3f2 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -125,7 +125,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ **Example:** User says "Create a sales dashboard" 1. Claude loads `databricks-aibi-dashboards` skill β†’ learns validation workflow -2. Runs `databricks sql execute` β†’ tests queries +2. Runs `databricks experimental aitools tools query` β†’ tests queries 3. Uses Python SDK to create dashboard via REST API 4. Returns working dashboard URL diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index 9fd1ffd5..f7d0a942 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -51,3 +51,18 @@ Use existing index instead of auto-indexing: - Ensure documents are well-structured - Break large documents into smaller files - Add clear headings and sections + +## Evaluation Questions + +When testing a KA, check if the volume or project contains a `pdf_eval_questions.json` file with test questions: + +```json +{ + "api_errors_guide.pdf": { + "question": "What is the solution for error ERR-4521?", + "expected_fact": "Call /api/v2/auth/refresh with refresh_token before the 3600s TTL expires" + } +} +``` + +Use these questions to validate retrieval accuracy. See [databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md) for generating test PDFs with eval questions. 
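+One way to spot-check a question against a deployed KA (a sketch: the endpoint name pattern and request payload shape are assumptions, so confirm both in your workspace first):
+
+```bash
+# KA tiles are typically served behind an endpoint such as ka-abc123-endpoint
+databricks serving-endpoints query ka-abc123-endpoint \
+  --json '{"messages": [{"role": "user", "content": "What is the solution for error ERR-4521?"}]}'
+# Verify the answer contains the expected_fact recorded for that document
+```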
diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index ce261746..1245a54d 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -1,6 +1,6 @@ --- name: databricks-agent-bricks -description: "Create Knowledge Assistants (KA) for document Q&A and Supervisor Agents for multi-agent orchestration. For Genie Spaces, see databricks-genie skill." +description: "Create Agent Bricks: Knowledge Assistants (KA) for document Q&A and Supervisor Agents for multi-agent orchestration (MAS). For Genie Spaces, see databricks-genie skill." --- # Agent Bricks @@ -91,5 +91,5 @@ databricks genie list-spaces --output json | jq '.[].space_id' | Topic | File | |-------|------| -| KA source types, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | +| KA source types, index, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | | UC functions, MCP servers, examples | [2-supervisor-agents.md](2-supervisor-agents.md) | diff --git a/databricks-skills/databricks-config/SKILL.md b/databricks-skills/databricks-config/SKILL.md index 6ee921ce..21728f19 100644 --- a/databricks-skills/databricks-config/SKILL.md +++ b/databricks-skills/databricks-config/SKILL.md @@ -137,10 +137,10 @@ databricks volumes delete CATALOG_NAME.SCHEMA_NAME.VOLUME_NAME ```bash # Run SQL query -databricks sql execute --warehouse-id WAREHOUSE_ID --query "SELECT * FROM catalog.schema.table LIMIT 10" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT * FROM catalog.schema.table LIMIT 10" # Create objects via SQL (alternative approach) -databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE CATALOG my_catalog" -databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE SCHEMA my_catalog.my_schema" -databricks sql execute --warehouse-id WAREHOUSE_ID --query "CREATE VOLUME my_catalog.my_schema.my_volume" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE CATALOG my_catalog" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE SCHEMA my_catalog.my_schema" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE VOLUME my_catalog.my_schema.my_volume" ``` diff --git a/databricks-skills/databricks-dbsql/SKILL.md b/databricks-skills/databricks-dbsql/SKILL.md index 4def6b57..043228b9 100644 --- a/databricks-skills/databricks-dbsql/SKILL.md +++ b/databricks-skills/databricks-dbsql/SKILL.md @@ -297,4 +297,4 @@ Load these for detailed syntax, full parameter lists, and advanced patterns: - **Star schema in Gold layer** for BI; OBT acceptable in Silver - **Define PK/FK constraints** on dimensional models for query optimization - **Use `COLLATE UTF8_LCASE`** for user-facing string columns that need case-insensitive search -- **Test SQL via CLI** (`databricks sql execute`) or notebooks before deploying +- **Test SQL via CLI** (`databricks experimental aitools tools query`) or notebooks before deploying diff --git a/databricks-skills/databricks-metric-views/SKILL.md b/databricks-skills/databricks-metric-views/SKILL.md index 94db66d0..b395a077 100644 --- a/databricks-skills/databricks-metric-views/SKILL.md +++ b/databricks-skills/databricks-metric-views/SKILL.md @@ -162,7 +162,7 @@ DROP VIEW IF EXISTS catalog.schema.orders_metrics; ```bash # Execute SQL via CLI -databricks sql execute --warehouse-id WAREHOUSE_ID --query " +databricks experimental aitools tools query 
--warehouse WAREHOUSE_ID " CREATE OR REPLACE VIEW catalog.schema.orders_metrics WITH METRICS LANGUAGE YAML diff --git a/databricks-skills/databricks-model-serving/SKILL.md b/databricks-skills/databricks-model-serving/SKILL.md index aed60ee6..bf520b5a 100644 --- a/databricks-skills/databricks-model-serving/SKILL.md +++ b/databricks-skills/databricks-model-serving/SKILL.md @@ -183,7 +183,7 @@ databricks serving-endpoints query sklearn-classifier --json '{ databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent # List workspace files -databricks workspace ls /Workspace/Users/you@company.com/my_agent +databricks workspace list /Workspace/Users/you@company.com/my_agent ``` ### Jobs for Deployment diff --git a/databricks-skills/databricks-python-sdk/SKILL.md b/databricks-skills/databricks-python-sdk/SKILL.md index eaf7cd66..4d03b5ce 100644 --- a/databricks-skills/databricks-python-sdk/SKILL.md +++ b/databricks-skills/databricks-python-sdk/SKILL.md @@ -91,7 +91,7 @@ databricks --profile MY_PROFILE clusters list # Common commands databricks clusters list databricks jobs list -databricks workspace ls /Users/me +databricks workspace list /Users/me ``` --- diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md index 3ba55e87..3e6f5b71 100644 --- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -128,10 +128,10 @@ Show a clear specification with **the business story and your assumptions surfac ### Post-Generation Validation -Use `databricks sql execute` to validate generated data (row counts, distributions, referential integrity). Query parquet files directly: +Use `databricks experimental aitools tools query` to validate generated data (row counts, distributions, referential integrity). Query parquet files directly: ```bash -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT COUNT(*) FROM parquet.\`/Volumes/CATALOG/SCHEMA/raw_data/customers\` " ``` diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md index 0bbdfe9b..793b64f7 100644 --- a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md @@ -293,30 +293,30 @@ WAREHOUSE_ID="your-warehouse-id" VOLUME_PATH="/Volumes/CATALOG/SCHEMA/raw_data" # 1. Check row counts -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT 'customers' as table_name, COUNT(*) as row_count FROM parquet.\`${VOLUME_PATH}/customers\` UNION ALL SELECT 'orders', COUNT(*) FROM parquet.\`${VOLUME_PATH}/orders\` " # 2. Preview schema and sample data -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " DESCRIBE SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` " -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` LIMIT 5 " # 3. 
Verify distributions -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT tier, COUNT(*) as count, ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) as pct FROM parquet.\`${VOLUME_PATH}/customers\` GROUP BY tier ORDER BY tier " # 4. Check amount statistics -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT MIN(amount) as min_amount, MAX(amount) as max_amount, @@ -326,7 +326,7 @@ FROM parquet.\`${VOLUME_PATH}/orders\` " # 5. Check referential integrity -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT COUNT(*) as orphan_orders FROM parquet.\`${VOLUME_PATH}/orders\` o LEFT JOIN parquet.\`${VOLUME_PATH}/customers\` c ON o.customer_id = c.customer_id @@ -334,7 +334,7 @@ WHERE c.customer_id IS NULL " # 6. Verify date range -databricks sql execute --warehouse-id $WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " SELECT MIN(order_date) as min_date, MAX(order_date) as max_date FROM parquet.\`${VOLUME_PATH}/orders\` " diff --git a/databricks-skills/databricks-unity-catalog/7-data-profiling.md b/databricks-skills/databricks-unity-catalog/7-data-profiling.md index 3103adf0..cf6c3ec1 100644 --- a/databricks-skills/databricks-unity-catalog/7-data-profiling.md +++ b/databricks-skills/databricks-unity-catalog/7-data-profiling.md @@ -68,7 +68,7 @@ DROP QUALITY MONITOR catalog.schema.my_table; ### Execute via CLI ```bash -databricks sql execute --warehouse-id WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " CREATE OR REPLACE QUALITY MONITOR catalog.schema.my_table OPTIONS (OUTPUT_SCHEMA 'catalog.schema') " diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 46dd2906..bbc77a6f 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -99,11 +99,11 @@ GROUP BY workspace_id, sku_name; ## SQL Queries via CLI -Use `databricks sql` for system table queries: +Use `databricks experimental aitools tools query` for system table queries: ```bash # Query lineage via CLI -databricks sql execute --warehouse-id WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " SELECT source_table_full_name, target_table_full_name FROM system.access.table_lineage WHERE event_date >= current_date() - 7 diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 1788d73e..5b10479d 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -7,189 +7,108 @@ description: "Generate PDF documents from HTML and upload to Unity Catalog volum Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. -## Overview +## Workflow -Generate PDFs from HTML using `scripts/pdf_generator.py`. You generate the HTML content, write it to a temp file, then run the script to convert and upload. +1. Write HTML files to `./raw_data/html/` (write multiple files in parallel for speed) +2. Convert HTML β†’ PDF using `scripts/pdf_generator.py` (parallel conversion) +3. 
Upload PDFs to Unity Catalog volume using `databricks fs cp` +4. Generate `doc_questions.json` with test questions for each document -## Installation +## Dependencies ```bash uv pip install plutoprint ``` -## Usage - -Run the script via CLI: +## Step 1: Write HTML Files ```bash -# Generate from HTML file (recommended - avoids shell escaping issues) -python scripts/pdf_generator.py generate --html-file ./raw_pdf/content.html \ - --filename report.pdf --catalog my_catalog --schema my_schema +mkdir -p ./raw_data/html +``` -# Generate from inline HTML (for simple content) -python scripts/pdf_generator.py generate --html '

<h1>Hello</h1>

' \ - --filename hello.pdf --catalog my_catalog --schema my_schema +Write HTML documents to `./raw_data/html/filename.html`. Use subdirectories to organize (structure is preserved). -# With subfolder and JSON output -python scripts/pdf_generator.py generate --html-file ./raw_pdf/content.html \ - --filename report.pdf --catalog my_catalog --schema my_schema --folder reports --json -``` +## Step 2: Convert to PDF -**Workflow:** -1. Write HTML content to a local file (e.g., `./raw_pdf/content.html`) -2. Run the script with `--html-file` -3. Script converts to PDF and uploads to Unity Catalog volume +```bash +# Convert entire folder (parallel, 4 workers) +python scripts/pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf +``` -## Quick Start +Skips files where PDF exists and is newer than HTML. Use `--force` to reconvert all. -1. Write HTML to a local file -2. Run the script +## Step 3: Upload to Volume ```bash -# Create folder and write HTML content -mkdir -p ./raw_pdf -cat > ./raw_pdf/report.html << 'EOF' - - - - - - -

-  <h1>Quarterly Report Q1 2024</h1>
-  <p>Revenue increased 15% year-over-year...</p>
- - -EOF - -# Generate and upload PDF -python scripts/pdf_generator.py generate --html-file ./raw_pdf/report.html \ - --filename q1_report.pdf --catalog my_catalog --schema my_schema +databricks fs cp -r ./raw_data/pdf /Volumes/my_catalog/my_schema/raw_data/ +``` + +## Step 4: Generate Test Questions + +Create `./raw_data/pdf/pdf_eval_questions.json` with questions for Knowledge Assistant evaluation or MAS: + +```json +{ + "api_errors_guide.pdf": { + "question": "What is the solution for error ERR-4521?", + "expected_fact": "Call /api/v2/auth/refresh with refresh_token before the 3600s TTL expires" + }, + "installation_manual.pdf": { + "question": "What port does the service use by default?", + "expected_fact": "Port 8443 for HTTPS, configurable via CONFIG_PORT environment variable" + } +} ``` -## HTML Best Practices +This JSON can be used to build KA test cases and validate retrieval accuracy. -### Use Complete HTML5 Structure +## Document Content Guidelines -Always include the full HTML structure: +When generating documents for Knowledge Assistant testing or demos: + +- **Multi-page documents**: Each PDF should be several pages with substantial content +- **Specific error codes and solutions**: Include product-specific error codes, causes, and resolution steps +- **Technical details**: API endpoints, configuration parameters, version numbers, specific commands +- **Simple CSS**: Keep styling minimal for fast HTML creation and reliable PDF conversion +- **Queryable facts**: Include details a KA must read the document to answer (not general knowledge) + +**Good document types:** +- Product user manuals with troubleshooting sections +- API error reference guides (error codes, causes, solutions) +- Installation/configuration guides with specific steps +- Technical specifications with version-specific details + +**Example content:** Instead of generic "Connection failed" errors, write: +- "Error ERR-4521: OAuth token expired. Cause: Token TTL exceeded 3600s default. Solution: Call `/api/v2/auth/refresh` with your refresh_token before expiration. See Section 4.2 for token lifecycle management." + +## CLI Reference -```html - - - - - - - - - ``` +python scripts/pdf_generator.py convert [OPTIONS] -### CSS Features Supported - -PlutoPrint supports modern CSS3: -- Flexbox and Grid layouts -- CSS variables (`--var-name`) -- Web fonts (system fonts recommended) -- Colors, backgrounds, borders -- Tables with styling - -### CSS to Avoid - -- Animations and transitions (static PDF) -- Interactive elements (forms, hover effects) -- External resources (images via URL) - use embedded base64 if needed - -### Professional Document Template - -```html - - - - - - -

-  <h1>Document Title</h1>
-  <h2>Section 1</h2>
-  <p>Content here...</p>
-  <p><strong>Important:</strong> Key information highlighted here.</p>
-  <h2>Data Table</h2>
-  <table>
-    <tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>
-    <tr><td>Data</td><td>Data</td><td>Data</td></tr>
-  </table>
- - - - + --input, -i Input HTML file or folder (required) + --output, -o Output folder for PDFs (required) + --force, -f Force reconvert (ignore timestamps) + --workers, -w Parallel workers (default: 4) ``` -## Prerequisites +## Folder Structure -- Unity Catalog schema must exist -- Volume must exist (default: `raw_data`) -- User must have WRITE permission on the volume -- Databricks CLI must be configured and authenticated +Subfolder structure is preserved: + +``` +./raw_data/html/ ./raw_data/pdf/ +β”œβ”€β”€ report.html β†’ β”œβ”€β”€ report.pdf +β”œβ”€β”€ quarterly/ β”œβ”€β”€ quarterly/ +β”‚ └── q1.html β†’ β”‚ └── q1.pdf +└── legal/ └── legal/ + └── terms.html β†’ └── terms.pdf +``` ## Troubleshooting | Issue | Solution | |-------|----------| -| "Volume does not exist" | Create the volume first or use an existing one | -| "Schema does not exist" | Create the schema or check the name | -| PDF looks wrong | Check HTML/CSS syntax, use supported CSS features | -| Slow generation | Call multiple PDFs in parallel, not sequentially | -| CLI not found | Ensure `databricks` CLI is installed and in PATH | +| "plutoprint not installed" | `uv pip install plutoprint` | +| PDF looks wrong | Check HTML/CSS syntax | +| "Volume does not exist" | `databricks volumes create catalog.schema.volume MANAGED` | diff --git a/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py b/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py index 97ce0725..e7808d13 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py +++ b/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py @@ -1,280 +1,272 @@ #!/usr/bin/env python3 """ -PDF Generator - Self-contained HTML to PDF generation and upload to Unity Catalog volumes. +PDF Generator - Convert HTML files to PDF locally. Usage: - python pdf_generator.py generate --html '...' 
--filename report.pdf --catalog my_catalog --schema my_schema - python pdf_generator.py generate --html-file input.html --filename report.pdf --catalog my_catalog --schema my_schema --volume raw_data --folder docs + # Convert single file + python pdf_generator.py convert --input ./raw_data/html/report.html --output ./raw_data/pdf + + # Convert entire folder (parallel) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf + + # Force reconvert (ignore timestamps) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf --force Requires: plutoprint - pip install plutoprint + uv / pip install plutoprint """ import argparse -import json import logging -import subprocess import sys -import tempfile -from dataclasses import dataclass +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field from pathlib import Path from typing import Optional +logging.basicConfig(level=logging.INFO, format="%(message)s") logger = logging.getLogger(__name__) +MAX_WORKERS = 4 + @dataclass -class PDFResult: - """Result from generating a PDF.""" - success: bool - volume_path: Optional[str] = None +class ConversionResult: + """Result from converting HTML to PDF.""" + html_path: str + pdf_path: Optional[str] = None + success: bool = False + skipped: bool = False error: Optional[str] = None def to_dict(self) -> dict: return { + "html_path": self.html_path, + "pdf_path": self.pdf_path, "success": self.success, - "volume_path": self.volume_path, + "skipped": self.skipped, "error": self.error, } -def _convert_html_to_pdf(html_content: str, output_path: str) -> bool: - """Convert HTML content to PDF using PlutoPrint. +@dataclass +class BatchResult: + """Result from batch conversion.""" + total: int = 0 + converted: int = 0 + skipped: int = 0 + failed: int = 0 + results: list = field(default_factory=list) - Args: - html_content: HTML string to convert - output_path: Path where PDF should be saved + def to_dict(self) -> dict: + return { + "total": self.total, + "converted": self.converted, + "skipped": self.skipped, + "failed": self.failed, + "results": [r.to_dict() for r in self.results], + } - Returns: - True if successful, False otherwise - """ - output_dir = Path(output_path).parent - output_dir.mkdir(parents=True, exist_ok=True) - try: - import plutoprint +def _needs_conversion(html_path: Path, pdf_path: Path) -> bool: + """Check if HTML needs to be converted (PDF missing or older than HTML). - logger.debug(f"Converting HTML to PDF using PlutoPrint: {output_path}") + Args: + html_path: Path to HTML file + pdf_path: Path to output PDF file - book = plutoprint.Book(plutoprint.PAGE_SIZE_A4) - book.load_html(html_content) - book.write_to_pdf(output_path) + Returns: + True if conversion needed, False if PDF is up-to-date + """ + if not pdf_path.exists(): + return True - if Path(output_path).exists(): - file_size = Path(output_path).stat().st_size - logger.info(f"PDF saved: {output_path} (size: {file_size:,} bytes)") - return True - else: - logger.error("PlutoPrint conversion failed - file not created") - return False + html_mtime = html_path.stat().st_mtime + pdf_mtime = pdf_path.stat().st_mtime - except ImportError: - logger.error("PlutoPrint is not installed. 
Install with: pip install plutoprint") - return False - except Exception as e: - logger.error(f"Failed to convert HTML to PDF: {str(e)}", exc_info=True) - return False + return html_mtime > pdf_mtime -def _run_cli(args: list[str], check: bool = True) -> subprocess.CompletedProcess: - """Run a databricks CLI command. +def convert_html_to_pdf( + html_path: Path, + pdf_path: Path, + force: bool = False, +) -> ConversionResult: + """Convert a single HTML file to PDF. Args: - args: Command arguments (without 'databricks' prefix) - check: Whether to raise on non-zero exit code + html_path: Path to HTML file + pdf_path: Path to output PDF file + force: If True, convert even if PDF is up-to-date Returns: - CompletedProcess with stdout/stderr + ConversionResult with success/skip/error status """ - cmd = ["databricks"] + args - logger.debug(f"Running: {' '.join(cmd)}") - return subprocess.run(cmd, capture_output=True, text=True, check=check) + result = ConversionResult(html_path=str(html_path)) + # Check if conversion needed + if not force and not _needs_conversion(html_path, pdf_path): + result.skipped = True + result.success = True + result.pdf_path = str(pdf_path) + logger.debug(f"Skipped (up-to-date): {html_path.name}") + return result -def _validate_volume_exists(catalog: str, schema: str, volume: str) -> Optional[str]: - """Validate that the volume exists using CLI. - - Args: - catalog: Catalog name - schema: Schema name - volume: Volume name - - Returns: - Error message if validation fails, None if successful - """ - # Check volume exists - result = _run_cli(["volumes", "read", f"{catalog}.{schema}.{volume}"], check=False) - if result.returncode != 0: - return f"Volume '{catalog}.{schema}.{volume}' does not exist or is not accessible: {result.stderr}" - return None + # Ensure output directory exists + pdf_path.parent.mkdir(parents=True, exist_ok=True) + try: + import plutoprint -def _upload_to_volume(local_path: str, volume_path: str) -> Optional[str]: - """Upload a file to Unity Catalog volume using CLI. + # Read HTML content + html_content = html_path.read_text(encoding="utf-8") - Args: - local_path: Local file path - volume_path: Volume path (e.g., /Volumes/catalog/schema/volume/file.pdf) + # Convert to PDF + book = plutoprint.Book(plutoprint.PAGE_SIZE_A4) + book.load_html(html_content) + book.write_to_pdf(str(pdf_path)) - Returns: - Error message if upload fails, None if successful - """ - result = _run_cli(["fs", "cp", local_path, volume_path, "--overwrite"], check=False) - if result.returncode != 0: - return f"Failed to upload to {volume_path}: {result.stderr}" - return None + if pdf_path.exists(): + result.success = True + result.pdf_path = str(pdf_path) + logger.info(f"Converted: {html_path.name} -> {pdf_path.name}") + else: + result.error = "PDF file not created" + logger.error(f"Failed: {html_path.name} - PDF not created") + except ImportError: + result.error = "plutoprint not installed. Run: pip install plutoprint" + logger.error(result.error) + except Exception as e: + result.error = str(e) + logger.error(f"Failed: {html_path.name} - {e}") -def _create_volume_directory(volume_path: str) -> None: - """Create a directory in the volume using CLI (best effort). 
+ return result - Args: - volume_path: Volume directory path - """ - # Use fs mkdirs - it's idempotent - _run_cli(["fs", "mkdirs", volume_path], check=False) +def convert_folder( + input_dir: Path, + output_dir: Path, + force: bool = False, + max_workers: int = MAX_WORKERS, +) -> BatchResult: + """Convert all HTML files in a folder to PDF (parallel). -def generate_and_upload_pdf( - html_content: str, - filename: str, - catalog: str, - schema: str, - volume: str = "raw_data", - folder: Optional[str] = None, -) -> PDFResult: - """Convert HTML to PDF and upload to a Unity Catalog volume. + Preserves subfolder structure from input to output. Args: - html_content: Complete HTML document (including , , , - ...

<h1>Hello World</h1>

- ... - ... ''' - >>> result = generate_and_upload_pdf( - ... html_content=html, - ... filename="hello.pdf", - ... catalog="my_catalog", - ... schema="my_schema", - ... ) - >>> print(result.volume_path) - /Volumes/my_catalog/my_schema/raw_data/hello.pdf + BatchResult with counts and per-file results """ - # Ensure filename ends with .pdf - if not filename.lower().endswith(".pdf"): - filename = f"{filename}.pdf" - - # Validate volume exists - error = _validate_volume_exists(catalog, schema, volume) - if error: - return PDFResult(success=False, error=error) - - # Build volume path - if folder: - volume_path = f"/Volumes/{catalog}/{schema}/{volume}/{folder}/{filename}" - else: - volume_path = f"/Volumes/{catalog}/{schema}/{volume}/{filename}" + batch = BatchResult() - try: - with tempfile.TemporaryDirectory() as temp_dir: - local_pdf_path = str(Path(temp_dir) / filename) + # Find all HTML files + html_files = list(input_dir.rglob("*.html")) + batch.total = len(html_files) - # Convert HTML to PDF - if not _convert_html_to_pdf(html_content, local_pdf_path): - return PDFResult(success=False, error="Failed to convert HTML to PDF") + if batch.total == 0: + logger.warning(f"No HTML files found in {input_dir}") + return batch - # Create folder if needed - if folder: - folder_path = f"/Volumes/{catalog}/{schema}/{volume}/{folder}" - _create_volume_directory(folder_path) + logger.info(f"Found {batch.total} HTML file(s) in {input_dir}") - # Upload to volume - error = _upload_to_volume(local_pdf_path, volume_path) - if error: - return PDFResult(success=False, error=error) + def process_file(html_path: Path) -> ConversionResult: + # Compute relative path to preserve folder structure + relative_path = html_path.relative_to(input_dir) + pdf_relative = relative_path.with_suffix(".pdf") + pdf_path = output_dir / pdf_relative - logger.info(f"PDF uploaded to {volume_path}") - return PDFResult(success=True, volume_path=volume_path) + return convert_html_to_pdf(html_path, pdf_path, force=force) - except Exception as e: - error_msg = f"Error generating PDF: {str(e)}" - logger.error(error_msg, exc_info=True) - return PDFResult(success=False, error=error_msg) + # Process files in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(process_file, f): f for f in html_files} + + for future in as_completed(futures): + result = future.result() + batch.results.append(result) + + if result.skipped: + batch.skipped += 1 + elif result.success: + batch.converted += 1 + else: + batch.failed += 1 + + logger.info(f"Done: {batch.converted} converted, {batch.skipped} skipped, {batch.failed} failed") + return batch def main(): """CLI entry point.""" parser = argparse.ArgumentParser( - description="Generate PDFs from HTML and upload to Unity Catalog volumes", + description="Convert HTML files to PDF", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Generate from inline HTML - python pdf_generator.py generate --html '

<h1>Hello</h1>

' \\ - --filename hello.pdf --catalog my_catalog --schema my_schema + # Convert single file + python pdf_generator.py convert --input ./raw_data/html/report.html --output ./raw_data/pdf - # Generate from HTML file - python pdf_generator.py generate --html-file input.html \\ - --filename report.pdf --catalog my_catalog --schema my_schema --folder reports + # Convert entire folder (parallel) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf + + # Force reconvert all + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf --force """, ) subparsers = parser.add_subparsers(dest="command", help="Commands") - # Generate command - gen_parser = subparsers.add_parser("generate", help="Generate PDF from HTML") - gen_parser.add_argument("--html", help="HTML content as string") - gen_parser.add_argument("--html-file", help="Path to HTML file") - gen_parser.add_argument("--filename", required=True, help="Output PDF filename") - gen_parser.add_argument("--catalog", required=True, help="Unity Catalog name") - gen_parser.add_argument("--schema", required=True, help="Schema name") - gen_parser.add_argument("--volume", default="raw_data", help="Volume name (default: raw_data)") - gen_parser.add_argument("--folder", help="Optional folder within volume") - gen_parser.add_argument("--json", action="store_true", help="Output result as JSON") + # Convert command + conv_parser = subparsers.add_parser("convert", help="Convert HTML to PDF") + conv_parser.add_argument("--input", "-i", required=True, help="Input HTML file or folder") + conv_parser.add_argument("--output", "-o", required=True, help="Output folder for PDFs") + conv_parser.add_argument("--force", "-f", action="store_true", help="Force reconvert (ignore timestamps)") + conv_parser.add_argument("--workers", "-w", type=int, default=MAX_WORKERS, help=f"Parallel workers (default: {MAX_WORKERS})") args = parser.parse_args() - if args.command == "generate": - # Get HTML content - if args.html: - html_content = args.html - elif args.html_file: - with open(args.html_file, "r") as f: - html_content = f.read() - else: - print("Error: Either --html or --html-file is required") + if args.command == "convert": + input_path = Path(args.input) + output_path = Path(args.output) + + if not input_path.exists(): + print(f"Error: Input path does not exist: {input_path}") sys.exit(1) - result = generate_and_upload_pdf( - html_content=html_content, - filename=args.filename, - catalog=args.catalog, - schema=args.schema, - volume=args.volume, - folder=args.folder, - ) - - if args.json: - print(json.dumps(result.to_dict(), indent=2)) - else: - if result.success: - print(f"Success: PDF uploaded to {result.volume_path}") + if input_path.is_file(): + # Single file conversion + if not input_path.suffix.lower() == ".html": + print(f"Error: Input file must be .html: {input_path}") + sys.exit(1) + + pdf_path = output_path / input_path.with_suffix(".pdf").name + result = convert_html_to_pdf(input_path, pdf_path, force=args.force) + + if result.skipped: + print(f"Skipped (up-to-date): {result.pdf_path}") + elif result.success: + print(f"Converted: {result.pdf_path}") else: print(f"Error: {result.error}") sys.exit(1) + else: + # Folder conversion + batch = convert_folder( + input_path, + output_path, + force=args.force, + max_workers=args.workers, + ) + + print(f"\nSummary: {batch.converted} converted, {batch.skipped} skipped, {batch.failed} failed") + if batch.failed > 0: + sys.exit(1) else: parser.print_help() sys.exit(1) diff 
--git a/databricks-skills/databricks-vector-search/end-to-end-rag.md b/databricks-skills/databricks-vector-search/end-to-end-rag.md index 4d020c83..60691a2a 100644 --- a/databricks-skills/databricks-vector-search/end-to-end-rag.md +++ b/databricks-skills/databricks-vector-search/end-to-end-rag.md @@ -6,7 +6,7 @@ Build a complete Retrieval-Augmented Generation pipeline: prepare documents, cre | Command | Step | |---------|------| -| `databricks sql execute` | Create source table, insert documents | +| `databricks experimental aitools tools query` | Create source table, insert documents | | `databricks vector-search endpoints create` | Create compute endpoint | | `databricks vector-search indexes create-index` | Create Delta Sync index with managed embeddings | | `databricks vector-search indexes sync-index` | Trigger index sync | @@ -37,7 +37,7 @@ INSERT INTO catalog.schema.knowledge_base VALUES Or via CLI: ```bash -databricks sql execute --warehouse-id WAREHOUSE_ID --query " +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " CREATE TABLE IF NOT EXISTS catalog.schema.knowledge_base ( doc_id STRING, title STRING, From 633ee7aec61a7c31d92ea79a9d055b99e0b7fb2f Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 09:25:58 +0200 Subject: [PATCH 35/66] Dashboard skill: use schema.table format with --dataset-catalog option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove catalog from all query examples (use schema.table format) - Add --dataset-catalog and --dataset-schema CLI options to create command - Update documentation to explain default catalog/schema approach πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-aibi-dashboards/3-filters.md | 2 +- .../databricks-aibi-dashboards/4-examples.md | 6 +++-- .../databricks-aibi-dashboards/SKILL.md | 22 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/3-filters.md b/databricks-skills/databricks-aibi-dashboards/3-filters.md index f1c55088..c981b1a4 100644 --- a/databricks-skills/databricks-aibi-dashboards/3-filters.md +++ b/databricks-skills/databricks-aibi-dashboards/3-filters.md @@ -151,7 +151,7 @@ Place filter widget directly on a `PAGE_TYPE_CANVAS` page (same widget structure { "name": "revenue_by_category", "queryLines": [ - "SELECT category, SUM(revenue) as revenue FROM catalog.schema.orders ", + "SELECT category, SUM(revenue) as revenue FROM sales.orders ", "WHERE order_date BETWEEN :date_range.min AND :date_range.max ", "GROUP BY category" ], diff --git a/databricks-skills/databricks-aibi-dashboards/4-examples.md b/databricks-skills/databricks-aibi-dashboards/4-examples.md index 8c2d0158..c240c70d 100644 --- a/databricks-skills/databricks-aibi-dashboards/4-examples.md +++ b/databricks-skills/databricks-aibi-dashboards/4-examples.md @@ -59,6 +59,8 @@ This example shows a complete dashboard with: - Data table for detailed records - Global filters (date range, region, category) +> **Note**: Queries use `schema.table` format (no catalog). Set the catalog via `--dataset-catalog "my_catalog"` when creating the dashboard. 
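For instance, the dashboard below could be created with the catalog and schema bound once at creation time instead of being repeated in every query (a minimal sketch; the display name, `my_catalog`, and the `dashboard.json` file name are placeholders):

```bash
# Datasets in dashboard.json reference gold.daily_sales and gold.product_performance;
# --dataset-catalog supplies the catalog for all of them at creation time.
databricks lakeview create \
  --display-name "Sales Performance" \
  --warehouse-id "abc123def456" \
  --dataset-catalog "my_catalog" \
  --dataset-schema "gold" \
  --serialized-dashboard "$(cat dashboard.json)"
```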
+ ```json { "datasets": [ @@ -67,7 +69,7 @@ This example shows a complete dashboard with: "displayName": "Daily Sales", "queryLines": [ "SELECT sale_date, region, department, total_orders, total_units, total_revenue, total_cost, profit_margin ", - "FROM catalog.schema.gold_daily_sales ", + "FROM gold.daily_sales ", "ORDER BY sale_date" ] }, @@ -76,7 +78,7 @@ This example shows a complete dashboard with: "displayName": "Product Performance", "queryLines": [ "SELECT product_id, product_name, department, region, units_sold, revenue, cost, profit ", - "FROM catalog.schema.gold_product_performance" + "FROM gold.product_performance" ] } ], diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 0e51d509..e360a69f 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -17,7 +17,7 @@ A dashboard should be showing something relevant for a human, typically some KPI | List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | | Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | | Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | -| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --serialized-dashboard "$(cat file.json)"` | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` | | Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | | Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | | Delete | `databricks lakeview trash DASHBOARD_ID` | @@ -54,7 +54,7 @@ databricks warehouses list ```bash # Get table schemas for designing queries databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SHOW TABLES IN catalog.schema" 2>&1 -# IMPORTANT: Use CATALOG.SCHEMA.TABLE format (full 3-part name required) +# Use CATALOG.SCHEMA.TABLE format for discover-schema (this is for exploration only) databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 # Example: @@ -64,6 +64,8 @@ databricks experimental aitools tools discover-schema samples.nyctaxi.trips main databricks experimental aitools tools query --warehouse WAREHOUSE_ID "" ``` +> **Note**: The `discover-schema` command requires full `catalog.schema.table` paths, but **dashboard queries should use `schema.table` format** with catalog set via `--dataset-catalog` at dashboard creation. 
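To make the split concrete, here is a sketch reusing the `samples.nyctaxi.trips` example above (column names are assumed from the public samples dataset; the dashboard itself would be created with `--dataset-catalog "samples" --dataset-schema "nyctaxi"`):

```bash
# Exploration and query testing use the full three-part name
databricks experimental aitools tools discover-schema samples.nyctaxi.trips
databricks experimental aitools tools query --warehouse WAREHOUSE_ID \
  "SELECT pickup_zip, COUNT(*) AS total_trips FROM samples.nyctaxi.trips GROUP BY pickup_zip LIMIT 10"

# The same query inside the dashboard JSON drops the catalog:
#   "queryLines": ["SELECT pickup_zip, COUNT(*) AS total_trips ", "FROM nyctaxi.trips GROUP BY pickup_zip"]
```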
+ ### Step 3: Verify Data Matches Story The datasets.querylines in the dashboard json (see example below) must be tested to ensure @@ -97,19 +99,15 @@ Before writing JSON, plan your dashboard: Once created, you can edit the file as following: ```bash # Create a dashboard -# IMPORTANT: Use --display-name, --warehouse-id, and --serialized-dashboard (NOT --json @file.json with displayName in it) +# IMPORTANT: Use --dataset-catalog and --dataset-schema to set defaults for all queries +# This way, queries can use schema.table format instead of catalog.schema.table databricks lakeview create \ --display-name "My Dashboard" \ --warehouse-id "abc123def456" \ + --dataset-catalog "my_catalog" \ + --dataset-schema "my_schema" \ --serialized-dashboard "$(cat dashboard.json)" -# Alternative: Use --json with the correct structure -databricks lakeview create --json '{ - "display_name": "My Dashboard", - "warehouse_id": "abc123def456", - "serialized_dashboard": "{\"datasets\":[...],\"pages\":[...]}" -}' - # List all dashboards databricks lakeview list @@ -141,7 +139,7 @@ Every dashboard's `serialized_dashboard` content must follow this exact structur { "name": "ds_x", "displayName": "Dataset X", - "queryLines": ["SELECT col1, col2 ", "FROM catalog.schema.table"] + "queryLines": ["SELECT col1, col2 ", "FROM schema.table"] } ], "pages": [ @@ -211,7 +209,7 @@ Apply unless user specifies otherwise: - **One dataset per domain** (e.g., orders, customers, products). Datasets shared across widgets benefit from the same filters. - **Exactly ONE valid SQL query per dataset** (no multiple queries separated by `;`) -- Always use **fully-qualified table names**: `catalog.schema.table_name` +- **NEVER specify catalog in queries** - use `schema.table` format (e.g., `gold.daily_sales`). 
Set the default catalog and schema via CLI options `--dataset-catalog` and `--dataset-schema` when creating the dashboard - SELECT must include all dimensions needed by widgets and all derived columns via `AS` aliases - Put ALL business logic (CASE/WHEN, COALESCE, ratios) into the dataset SELECT with explicit aliases - **Contract rule**: Every widget `fieldName` must exactly match a dataset column or alias From 72628c114c1f451d1e1226babb60a963f0810f87 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 14:16:02 +0200 Subject: [PATCH 36/66] Fix CLI commands and add resource tagging across skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive CLI audit fixes: - Fix positional arguments vs flags (postgres, database, system-schemas, knowledge-assistants, storage-credentials) - Add cluster/warehouse create examples with tags to execution-compute - Add --cluster-sources UI,API filter to exclude job clusters (faster) - Fix genie export/import commands (use get-space --include-serialized-space) - Standardize tag instructions: "include" for inline JSON, "after creation" for workspace-entity-tag-assignments Resources with tags: - Jobs, Pipelines: inline "tags" in create JSON - Clusters: inline "custom_tags" in create JSON - Warehouses: inline "tags.custom_tags" array in create JSON - Dashboards, Apps, Genie: workspace-entity-tag-assignments API - Serving Endpoints: patch API with add_tags πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-agent-bricks/SKILL.md | 7 ++-- .../databricks-aibi-dashboards/SKILL.md | 4 ++ .../databricks-app-python/6-cli-approach.md | 4 ++ .../databricks-execution-compute/SKILL.md | 39 ++++++++++++++++++- databricks-skills/databricks-genie/SKILL.md | 18 +++++++-- .../databricks-iceberg/4-snowflake-interop.md | 2 +- databricks-skills/databricks-jobs/SKILL.md | 8 +++- .../databricks-jobs/triggers-schedules.md | 4 +- .../databricks-lakebase-autoscale/SKILL.md | 5 +-- .../databricks-lakebase-autoscale/projects.md | 3 +- .../databricks-lakebase-provisioned/SKILL.md | 14 +++---- .../3-genai-agents.md | 2 +- .../6-logging-registration.md | 2 +- .../databricks-model-serving/7-deployment.md | 8 ++-- .../databricks-model-serving/SKILL.md | 13 +++++-- .../references/2-cli-approach.md | 1 + .../references/3-advanced-configuration.md | 2 + .../5-system-tables.md | 9 ++--- .../databricks-vector-search/SKILL.md | 28 ++++++------- .../end-to-end-rag.md | 10 ++--- .../troubleshooting-and-operations.md | 4 +- .../databricks-zerobus-ingest/SKILL.md | 4 +- 22 files changed, 124 insertions(+), 67 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 1245a54d..dc89fa48 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -25,9 +25,10 @@ databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catal # Create KA databricks knowledge-assistants create-knowledge-assistant "Name" "Description" -# Add knowledge source -databricks knowledge-assistants create-knowledge-source "knowledge-assistants/{ka_id}" \ - --json '{"display_name": "Docs", "description": "...", "source_type": "files", "files": {"path": "/Volumes/catalog/schema/volume/"}}' +# Add knowledge source (4 positional args: PARENT DISPLAY_NAME DESCRIPTION SOURCE_TYPE) +databricks knowledge-assistants create-knowledge-source \ + 
"knowledge-assistants/{ka_id}" "Docs" "Documentation files" "files" \ + --json '{"files": {"path": "/Volumes/catalog/schema/volume/"}}' # Sync and check status databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index e360a69f..6e549fc8 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -125,6 +125,10 @@ databricks lakeview unpublish DASHBOARD_ID # Delete (trash) a dashboard databricks lakeview trash DASHBOARD_ID + +# By default, after creation, tag dashboards to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + dashboards DASHBOARD_ID aidevkit_project --tag-value ai-dev-kit ``` --- diff --git a/databricks-skills/databricks-app-python/6-cli-approach.md b/databricks-skills/databricks-app-python/6-cli-approach.md index 01543509..6b6273c1 100644 --- a/databricks-skills/databricks-app-python/6-cli-approach.md +++ b/databricks-skills/databricks-app-python/6-cli-approach.md @@ -24,6 +24,10 @@ databricks apps logs my-dashboard # Delete an app databricks apps delete my-dashboard + +# By default, after creation, tag apps to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + apps my-dashboard aidevkit_project --tag-value ai-dev-kit ``` --- diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md index 151a467d..75a05bd8 100644 --- a/databricks-skills/databricks-execution-compute/SKILL.md +++ b/databricks-skills/databricks-execution-compute/SKILL.md @@ -63,8 +63,8 @@ databricks jobs create --json '{ "environments": [{"environment_key": "default", "spec": {"client": "4"}}] }' -# Run the job -databricks jobs run-now --job-id JOB_ID +# Run the job (JOB_ID is positional) +databricks jobs run-now JOB_ID ``` ### Interactive Cluster β†’ [reference](references/3-interactive-cluster.md) @@ -94,6 +94,41 @@ databricks jobs create --json '{ | `databricks clusters create/start/delete` | Interactive | Manage clusters. 
**COSTLY:** `start` takes 3-8 min | | `databricks warehouses create/list` | SQL | Manage SQL warehouses | +### List Interactive Clusters (exclude job clusters) + +```bash +# List only UI/API clusters (excludes job clusters - much faster) +databricks clusters list --cluster-sources UI,API --output json | jq '.[] | select(.state == "RUNNING")' +``` + +### Create Cluster + +```bash +# Create interactive cluster (SPARK_VERSION is positional) +# By default, include custom_tags to track resources created with this skill +databricks clusters create 15.4.x-scala2.12 --json '{ + "cluster_name": "my-cluster", + "node_type_id": "i3.xlarge", + "num_workers": 2, + "autotermination_minutes": 60, + "custom_tags": {"aidevkit_project": "ai-dev-kit"} +}' +``` + +### Create SQL Warehouse + +```bash +# Create serverless SQL warehouse +# By default, include tags to track resources created with this skill +databricks warehouses create --json '{ + "name": "my-warehouse", + "cluster_size": "Small", + "enable_serverless_compute": true, + "auto_stop_mins": 10, + "tags": {"custom_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}]} +}' +``` + ## Related Skills - **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** β€” Data generation using Spark + Faker diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index a8652ebc..49628a9a 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -50,6 +50,10 @@ databricks genie create-space --json "{ # Get space details (with full config) databricks genie get-space SPACE_ID --include-serialized-space +# By default, after creation, tag the Genie Space to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + geniespaces SPACE_ID aidevkit_project --tag-value ai-dev-kit + # Delete a Genie Space databricks genie trash-space SPACE_ID ``` @@ -63,11 +67,17 @@ If answers are inaccurate or incomplete, improve the space β€” see "Improving a ### Export & Import ```bash -# Export space configuration -databricks genie export-space SPACE_ID > exported.json +# Export space configuration (extract serialized_space from get-space output) +databricks genie get-space SPACE_ID --include-serialized-space -o json | jq '.serialized_space' > genie_space.json -# Import space from exported config -databricks genie import-space --json @exported.json +# Import: Create a new space with the exported serialized_space +databricks genie create-space --json "{ + \"warehouse_id\": \"WAREHOUSE_ID\", + \"title\": \"Sales Analytics\", + \"description\": \"Migrated space\", + \"parent_path\": \"/Workspace/Users/you@company.com/genie_spaces\", + \"serialized_space\": $(cat genie_space.json) +}" ``` ### Improving a Genie Space diff --git a/databricks-skills/databricks-iceberg/4-snowflake-interop.md b/databricks-skills/databricks-iceberg/4-snowflake-interop.md index 2f9d9536..ad6fae3a 100644 --- a/databricks-skills/databricks-iceberg/4-snowflake-interop.md +++ b/databricks-skills/databricks-iceberg/4-snowflake-interop.md @@ -196,7 +196,7 @@ Create a storage credential for the cloud storage where Snowflake stores its Ice ```bash # In Databricks CLI (AWS example) databricks storage-credentials create snowflake_storage_cred \ - --aws-iam-role-arn "arn:aws:iam::123456789012:role/snowflake-data-access" + --json '{"aws_iam_role": {"role_arn": "arn:aws:iam::123456789012:role/snowflake-data-access"}}' ``` ### Step 3: Create an External 
Location diff --git a/databricks-skills/databricks-jobs/SKILL.md b/databricks-skills/databricks-jobs/SKILL.md index 0f60a241..d66f87e5 100644 --- a/databricks-skills/databricks-jobs/SKILL.md +++ b/databricks-skills/databricks-jobs/SKILL.md @@ -46,8 +46,10 @@ print(f"Created job: {job.job_id}") ### CLI ```bash +# By default, add tags to track resources created with this skill databricks jobs create --json '{ "name": "my-etl-job", + "tags": {"aidevkit_project": "ai-dev-kit"}, "tasks": [{ "task_key": "extract", "notebook_task": { @@ -58,6 +60,8 @@ databricks jobs create --json '{ }' ``` +> **Tagging**: By default, add `"tags": {"aidevkit_project": "ai-dev-kit"}` to track resources created with this skill. + ### Asset Bundles (DABs) ```yaml @@ -261,8 +265,8 @@ databricks jobs get 12345 # Run job databricks jobs run-now 12345 -# Run with parameters -databricks jobs run-now 12345 --job-params '{"env": "prod"}' +# Run with parameters (must use --json with job_id inside) +databricks jobs run-now --json '{"job_id": 12345, "job_parameters": {"env": "prod"}}' # Cancel run databricks jobs cancel-run 67890 diff --git a/databricks-skills/databricks-jobs/triggers-schedules.md b/databricks-skills/databricks-jobs/triggers-schedules.md index 9022c715..a0c0fd48 100644 --- a/databricks-skills/databricks-jobs/triggers-schedules.md +++ b/databricks-skills/databricks-jobs/triggers-schedules.md @@ -431,8 +431,8 @@ run_result = w.jobs.run_now_and_wait(job_id=12345) # Run job databricks jobs run-now 12345 -# Run with parameters -databricks jobs run-now 12345 --job-params '{"env": "prod"}' +# Run with parameters (must use --json with job_id inside) +databricks jobs run-now --json '{"job_id": 12345, "job_parameters": {"env": "prod"}}' ``` **DABs:** diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index 848e6e67..9f857882 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -182,9 +182,8 @@ w.postgres.update_endpoint( ### Project Management ```bash -# Create a project -databricks postgres create-project \ - --project-id my-app \ +# Create a project (PROJECT_ID is positional) +databricks postgres create-project my-app \ --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' # List projects diff --git a/databricks-skills/databricks-lakebase-autoscale/projects.md b/databricks-skills/databricks-lakebase-autoscale/projects.md index 659207a4..0522a72e 100644 --- a/databricks-skills/databricks-lakebase-autoscale/projects.md +++ b/databricks-skills/databricks-lakebase-autoscale/projects.md @@ -64,8 +64,7 @@ print(f"Postgres version: {result.status.pg_version}") ### CLI ```bash -databricks postgres create-project \ - --project-id my-app \ +databricks postgres create-project my-app \ --json '{ "spec": { "display_name": "My Application", diff --git a/databricks-skills/databricks-lakebase-provisioned/SKILL.md b/databricks-skills/databricks-lakebase-provisioned/SKILL.md index 2dacbaa2..1d7e0589 100644 --- a/databricks-skills/databricks-lakebase-provisioned/SKILL.md +++ b/databricks-skills/databricks-lakebase-provisioned/SKILL.md @@ -231,25 +231,23 @@ mlflow.langchain.log_model( ### Instance Management ```bash -# Create instance -databricks database create-database-instance \ - --name my-lakebase-instance \ - --capacity CU_1 +# Create instance (NAME is positional) +databricks database create-database-instance my-lakebase-instance --capacity CU_1 # Get instance 
details -databricks database get-database-instance --name my-lakebase-instance +databricks database get-database-instance my-lakebase-instance # List instances databricks database list-database-instances # Stop instance (saves cost) -databricks database stop-database-instance --name my-lakebase-instance +databricks database stop-database-instance my-lakebase-instance # Start instance -databricks database start-database-instance --name my-lakebase-instance +databricks database start-database-instance my-lakebase-instance # Delete instance -databricks database delete-database-instance --name my-lakebase-instance +databricks database delete-database-instance my-lakebase-instance ``` ### OAuth Credentials diff --git a/databricks-skills/databricks-model-serving/3-genai-agents.md b/databricks-skills/databricks-model-serving/3-genai-agents.md index 66647687..1f1f9f8b 100644 --- a/databricks-skills/databricks-model-serving/3-genai-agents.md +++ b/databricks-skills/databricks-model-serving/3-genai-agents.md @@ -226,7 +226,7 @@ Run via CLI: ```bash # Upload and run on Databricks databricks workspace import-dir ./my_agent /Workspace/Users//my_agent -databricks jobs run-now --job-id # Job configured to run test_agent.py +databricks jobs run-now JOB_ID # JOB_ID is positional; job runs test_agent.py ``` ## Logging the Agent diff --git a/databricks-skills/databricks-model-serving/6-logging-registration.md b/databricks-skills/databricks-model-serving/6-logging-registration.md index bfa643b9..2413fd75 100644 --- a/databricks-skills/databricks-model-serving/6-logging-registration.md +++ b/databricks-skills/databricks-model-serving/6-logging-registration.md @@ -65,7 +65,7 @@ Run via CLI: ```bash # Upload and run on Databricks databricks workspace import-dir ./my_agent /Workspace/Users//my_agent -databricks jobs run-now --job-id # Job configured to run log_model.py +databricks jobs run-now JOB_ID # JOB_ID is positional; job runs log_model.py ``` ## Resources for Auto Authentication diff --git a/databricks-skills/databricks-model-serving/7-deployment.md b/databricks-skills/databricks-model-serving/7-deployment.md index 2f503112..14e4f69f 100644 --- a/databricks-skills/databricks-model-serving/7-deployment.md +++ b/databricks-skills/databricks-model-serving/7-deployment.md @@ -46,6 +46,7 @@ Use the Databricks CLI: ```bash databricks jobs create --json '{ "name": "deploy-agent-job", + "tags": {"aidevkit_project": "ai-dev-kit"}, "tasks": [{ "task_key": "deploy", "spark_python_task": { @@ -72,8 +73,8 @@ Save the returned `job_id`. Run the job - returns immediately: ```bash -databricks jobs run-now --job-id \ - --params '{"model_name": "main.agents.my_agent", "version": "1"}' +# Note: job_id is positional, parameters go in --json +databricks jobs run-now --json '{"job_id": , "job_parameters": {"model_name": "main.agents.my_agent", "version": "1"}}' ``` Save the returned `run_id`. @@ -83,7 +84,8 @@ Save the returned `run_id`. 
Check job run status: ```bash -databricks jobs get-run --run-id +# run_id is positional +databricks jobs get-run ``` Or check endpoint directly: diff --git a/databricks-skills/databricks-model-serving/SKILL.md b/databricks-skills/databricks-model-serving/SKILL.md index bf520b5a..448d769f 100644 --- a/databricks-skills/databricks-model-serving/SKILL.md +++ b/databricks-skills/databricks-model-serving/SKILL.md @@ -164,6 +164,11 @@ databricks serving-endpoints list # Get endpoint details and status databricks serving-endpoints get my-agent-endpoint +# By default, after creation, tag serving endpoints to track resources created with this skill +databricks serving-endpoints patch my-agent-endpoint --json '{ + "add_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}] +}' + # Query a chat/agent endpoint databricks serving-endpoints query my-agent-endpoint --json '{ "messages": [{"role": "user", "content": "Hello!"}], @@ -192,11 +197,11 @@ databricks workspace list /Workspace/Users/you@company.com/my_agent # Create a deployment job databricks jobs create --json @deploy_job.json -# Run the deployment job -databricks jobs run-now --job-id JOB_ID +# Run the deployment job (JOB_ID is positional) +databricks jobs run-now JOB_ID -# Check job run status -databricks jobs get-run --run-id RUN_ID +# Check job run status (RUN_ID is positional) +databricks jobs get-run RUN_ID ``` --- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md index 4fdd27df..7c2dbb31 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md @@ -30,6 +30,7 @@ databricks pipelines create --json '{ "libraries": [ {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}} ], + "tags": {"aidevkit_project": "ai-dev-kit"}, "development": true }' diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md index 6a349f78..44435cae 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md @@ -161,6 +161,8 @@ Install pip dependencies for serverless pipelines: All examples use `databricks pipelines create --json '{...}'`. For updates, use `databricks pipelines update --json '{...}'`. +> **Tagging**: By default, add `"tags": {"aidevkit_project": "ai-dev-kit"}` to track resources created with this skill. 
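For example, a minimal create call with the tag included might look like the following (a sketch; the pipeline name and workspace path are placeholders, and other settings such as catalog/target and compute are omitted):

```bash
# Tag is set inline in the create JSON so the pipeline is tracked from creation
databricks pipelines create --json '{
  "name": "my-declarative-pipeline",
  "libraries": [
    {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}
  ],
  "tags": {"aidevkit_project": "ai-dev-kit"},
  "development": true
}'
```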
+ ### Development Mode Pipeline ```bash diff --git a/databricks-skills/databricks-unity-catalog/5-system-tables.md b/databricks-skills/databricks-unity-catalog/5-system-tables.md index e8c9d95f..0fbdb46d 100644 --- a/databricks-skills/databricks-unity-catalog/5-system-tables.md +++ b/databricks-skills/databricks-unity-catalog/5-system-tables.md @@ -48,12 +48,11 @@ w.system_schemas.enable( **CLI:** ```bash -# List system schemas -databricks system-schemas list --metastore-id your-metastore-id +# List system schemas (METASTORE_ID is positional) +databricks system-schemas list your-metastore-id -# Enable system schema -databricks system-schemas enable --metastore-id your-metastore-id \ - --schema-name access +# Enable system schema (METASTORE_ID and SCHEMA_NAME are positional) +databricks system-schemas enable your-metastore-id access ``` --- diff --git a/databricks-skills/databricks-vector-search/SKILL.md b/databricks-skills/databricks-vector-search/SKILL.md index 18cc7679..2bb84d84 100644 --- a/databricks-skills/databricks-vector-search/SKILL.md +++ b/databricks-skills/databricks-vector-search/SKILL.md @@ -268,28 +268,22 @@ scan_result = w.vector_search_indexes.scan_index( ```bash # List endpoints -databricks vector-search endpoints list +databricks vector-search-endpoints list-endpoints -# Create endpoint -databricks vector-search endpoints create \ - --name my-endpoint \ - --endpoint-type STANDARD +# Create endpoint (positional args: NAME ENDPOINT_TYPE) +databricks vector-search-endpoints create-endpoint my-endpoint STANDARD -# List indexes on endpoint -databricks vector-search indexes list-indexes \ - --endpoint-name my-endpoint +# List indexes on endpoint (positional arg: ENDPOINT_NAME) +databricks vector-search-indexes list-indexes my-endpoint -# Get index status -databricks vector-search indexes get-index \ - --index-name catalog.schema.my_index +# Get index status (positional arg: INDEX_NAME) +databricks vector-search-indexes get-index catalog.schema.my_index -# Sync index (for TRIGGERED) -databricks vector-search indexes sync-index \ - --index-name catalog.schema.my_index +# Sync index (positional arg: INDEX_NAME) +databricks vector-search-indexes sync-index catalog.schema.my_index -# Delete index -databricks vector-search indexes delete-index \ - --index-name catalog.schema.my_index +# Delete index (positional arg: INDEX_NAME) +databricks vector-search-indexes delete-index catalog.schema.my_index ``` ## Common Issues diff --git a/databricks-skills/databricks-vector-search/end-to-end-rag.md b/databricks-skills/databricks-vector-search/end-to-end-rag.md index 60691a2a..00959f91 100644 --- a/databricks-skills/databricks-vector-search/end-to-end-rag.md +++ b/databricks-skills/databricks-vector-search/end-to-end-rag.md @@ -7,11 +7,11 @@ Build a complete Retrieval-Augmented Generation pipeline: prepare documents, cre | Command | Step | |---------|------| | `databricks experimental aitools tools query` | Create source table, insert documents | -| `databricks vector-search endpoints create` | Create compute endpoint | -| `databricks vector-search indexes create-index` | Create Delta Sync index with managed embeddings | -| `databricks vector-search indexes sync-index` | Trigger index sync | -| `databricks vector-search indexes get-index` | Check index status | -| `databricks vector-search indexes query-index` | Test similarity search | +| `databricks vector-search-endpoints create-endpoint` | Create compute endpoint | +| `databricks vector-search-indexes create-index` | Create Delta Sync 
index with managed embeddings | +| `databricks vector-search-indexes sync-index` | Trigger index sync | +| `databricks vector-search-indexes get-index` | Check index status | +| `databricks vector-search-indexes query-index` | Test similarity search | --- diff --git a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md index 23385adc..36614a23 100644 --- a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md +++ b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md @@ -4,7 +4,7 @@ Operational guidance for monitoring, cost optimization, capacity planning, and m ## Monitoring Endpoint Status -Use `databricks vector-search endpoints get` (CLI) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. +Use `databricks vector-search-endpoints get-endpoint ENDPOINT_NAME` (CLI) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. ### Endpoint fields @@ -34,7 +34,7 @@ print(f"Indexes: {endpoint.num_indexes}") ## Monitoring Index Status -Use `databricks vector-search indexes get-index` (CLI) or `w.vector_search_indexes.get_index()` (SDK) to check index health. +Use `databricks vector-search-indexes get-index INDEX_NAME` (CLI) or `w.vector_search_indexes.get_index()` (SDK) to check index health. ### Index fields diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index 668a4be2..fdac5f98 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -160,14 +160,14 @@ databricks jobs create --json '{ }] }' -databricks jobs run-now --job-id +databricks jobs run-now JOB_ID ``` **If execution fails:** 1. Read the error from the job run output 2. Edit the local Python file to fix the issue 3. Re-upload: `databricks workspace import-dir ./scripts /Workspace/Users//scripts` -4. Re-run: `databricks jobs run-now --job-id ` +4. 
Re-run: `databricks jobs run-now JOB_ID` --- From efd6e956b993f6d775ac46c0766795bc3e0fe801 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 14:35:49 +0200 Subject: [PATCH 37/66] Restore MCP server and make installation optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Restore databricks-mcp-server/ and databricks-tools-core/ directories - Make MCP server installation optional in install.sh (default: skip) - Add --mcp and --mcp-path CLI options for non-interactive install - Add DEVKIT_INSTALL_MCP and DEVKIT_MCP_PATH env vars - Skills-only install is faster (no venv setup required) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 174 insertions(+), 5 deletions(-) diff --git a/install.sh b/install.sh index 64c322ba..6ef2b780 100644 --- a/install.sh +++ b/install.sh @@ -80,6 +80,8 @@ fi # Installation mode defaults INSTALL_MCP=true INSTALL_SKILLS=true +INSTALL_MCP="${DEVKIT_INSTALL_MCP:-false}" +MCP_INSTALL_PATH="${DEVKIT_MCP_PATH:-$HOME/.ai-dev-kit}" # Minimum required versions MIN_CLI_VERSION="0.278.0" @@ -135,6 +137,8 @@ while [ $# -gt 0 ]; do --skills) USER_SKILLS="$2"; shift 2 ;; --list-skills) LIST_SKILLS=true; shift ;; --silent) SILENT=true; shift ;; + --mcp) INSTALL_MCP=true; shift ;; + --mcp-path) MCP_INSTALL_PATH="$2"; INSTALL_MCP=true; shift 2 ;; --tools) USER_TOOLS="$2"; shift 2 ;; --experimental) CHANNEL="experimental"; shift ;; -f|--force) FORCE=true; shift ;; @@ -156,6 +160,8 @@ while [ $# -gt 0 ]; do echo " --skills LIST Comma-separated skill names to install (overrides profile)" echo " --list-skills List available skills and profiles, then exit" echo " --experimental Install from experimental branch (early access features)" + echo " --mcp Install deprecated MCP server (default: no)" + echo " --mcp-path PATH MCP server install path (default: ~/.ai-dev-kit)" echo " -f, --force Force reinstall" echo " -h, --help Show this help" echo "" @@ -170,6 +176,8 @@ while [ $# -gt 0 ]; do echo " DEVKIT_SKILLS Comma-separated skill names" echo " DEVKIT_SILENT Set to 'true' for silent mode" echo " DEVKIT_CHANNEL 'stable' (default) or 'experimental'" + echo " DEVKIT_INSTALL_MCP Set to 'true' to install MCP server" + echo " DEVKIT_MCP_PATH MCP server install path" echo " AIDEVKIT_HOME Installation directory (default: ~/.ai-dev-kit)" echo "" echo "Examples:" @@ -942,6 +950,157 @@ prompt_custom_skills() { USER_SKILLS=$(echo "$selected" | tr ' ' ',') } +# ─── MCP Server installation prompt ──────────────────────────── +prompt_mcp_install() { + # Skip if already set via env var or flag + if [ "$INSTALL_MCP" = true ]; then + return + fi + + # Skip in silent mode or non-interactive + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + return + fi + + echo "" + echo -e " ${B}Deprecated MCP Server${N}" + echo -e " ${D}Skills now work via CLI for better performance. 
MCP server is optional for backwards compatibility.${N}" + + local -a labels=("Do not install" "Install MCP server") + local -a values=("no" "yes") + local -a hints=("Recommended - skills work without MCP" "Legacy - requires Python venv setup") + local count=2 + local selected=0 + local cursor=0 + + _mcp_draw() { + for i in 0 1; do + local dot="β—‹" + local dot_color="\033[2m" + [ "$i" = "$selected" ] && dot="●" && dot_color="\033[0;32m" + local arrow=" " + [ "$i" = "$cursor" ] && arrow="\033[0;34m❯\033[0m " + local hint_style="\033[2m" + [ "$i" = "$selected" ] && hint_style="\033[0;32m" + printf "\033[2K %b%b%b %-20s %b%s\033[0m\n" "$arrow" "$dot_color" "$dot" "${labels[$i]}" "$hint_style" "${hints[$i]}" > /dev/tty + done + } + + printf "\n \033[2m↑/↓ navigate Β· enter select\033[0m\n\n" > /dev/tty + printf "\033[?25l" > /dev/tty + trap 'printf "\033[?25h" > /dev/tty 2>/dev/null' EXIT + + _mcp_draw + + while true; do + printf "\033[%dA" "$count" > /dev/tty + _mcp_draw + + local key="" + IFS= read -rsn1 key < /dev/tty 2>/dev/null + + if [ "$key" = $'\x1b' ]; then + local s1="" s2="" + read -rsn1 s1 < /dev/tty 2>/dev/null + read -rsn1 s2 < /dev/tty 2>/dev/null + if [ "$s1" = "[" ]; then + case "$s2" in + A) [ "$cursor" -gt 0 ] && cursor=$((cursor - 1)) ;; + B) [ "$cursor" -lt 1 ] && cursor=$((cursor + 1)) ;; + esac + fi + elif [ "$key" = "" ]; then + selected=$cursor + printf "\033[%dA" "$count" > /dev/tty + _mcp_draw + break + elif [ "$key" = " " ]; then + selected=$cursor + fi + done + + printf "\033[?25h" > /dev/tty + trap - EXIT + + if [ "${values[$selected]}" = "yes" ]; then + INSTALL_MCP=true + # Prompt for install path + echo "" + MCP_INSTALL_PATH=$(prompt "MCP server install path" "$MCP_INSTALL_PATH") + fi +} + +# Install MCP server (venv + packages) +install_mcp_server() { + step "Installing MCP server" + + # Check for uv + if ! command -v uv &> /dev/null; then + warn "'uv' is not installed. Install it with: curl -LsSf https://astral.sh/uv/install.sh | sh" + die "MCP server installation requires 'uv'" + fi + ok "uv is installed" + + local mcp_dir="$MCP_INSTALL_PATH" + local tools_core_dir="$REPO_DIR/databricks-tools-core" + local mcp_server_dir="$REPO_DIR/databricks-mcp-server" + + # Verify source directories exist + if [ ! -d "$tools_core_dir" ]; then + die "databricks-tools-core not found at $tools_core_dir" + fi + if [ ! -d "$mcp_server_dir" ]; then + die "databricks-mcp-server not found at $mcp_server_dir" + fi + + # Create install directory + mkdir -p "$mcp_dir" + + # Copy source files + msg "Copying MCP server source files..." + cp -r "$tools_core_dir" "$mcp_dir/" + cp -r "$mcp_server_dir" "$mcp_dir/" + ok "Source files copied to $mcp_dir" + + # Create virtual environment + msg "Creating virtual environment..." + cd "$mcp_dir" + uv venv --python 3.11 -q + ok "Virtual environment created" + + # Install packages + msg "Installing databricks-tools-core..." + uv pip install --python .venv/bin/python -e "$mcp_dir/databricks-tools-core" --quiet + ok "databricks-tools-core installed" + + msg "Installing databricks-mcp-server..." 
+ uv pip install --python .venv/bin/python -e "$mcp_dir/databricks-mcp-server" --quiet + ok "databricks-mcp-server installed" + + # Verify installation + if .venv/bin/python -c "import databricks_mcp_server" 2>/dev/null; then + ok "MCP server verified" + else + warn "MCP server import verification failed" + fi + + # Save MCP config for later reference + echo "$mcp_dir" > "$mcp_dir/.mcp-install-path" + + cd - > /dev/null + + msg "" + msg "${B}MCP server installed at:${N} $mcp_dir" + msg "" + msg "To configure Claude Code, add to your project's .mcp.json:" + msg " {\"mcpServers\": {\"databricks\": {" + msg " \"command\": \"$mcp_dir/.venv/bin/python\"," + msg " \"args\": [\"$mcp_dir/databricks-mcp-server/run_server.py\"]" + msg " }}}" + msg "" + msg "Or use: ${B}claude mcp add-json databricks '{\"command\":\"$mcp_dir/.venv/bin/python\",\"args\":[\"$mcp_dir/databricks-mcp-server/run_server.py\"]}'${N}" +} + # Compare semantic versions (returns 0 if $1 >= $2) version_gte() { printf '%s\n%s' "$2" "$1" | sort -V -C @@ -1781,13 +1940,16 @@ main() { fi fi - # ── Step 5: Interactive MCP path ── + # ── Step 4.5: MCP server installation prompt ── + step "MCP server (deprecated)" + prompt_mcp_install if [ "$INSTALL_MCP" = true ]; then - prompt_mcp_path - ok "MCP path: $INSTALL_DIR" + ok "Will install MCP server to: $MCP_INSTALL_PATH" + else + ok "Skipping MCP server (recommended)" fi - # ── Step 6: Confirm before proceeding ── + # ── Step 5: Confirm before proceeding ── if [ "$SILENT" = false ]; then echo "" echo -e " ${B}Summary${N}" @@ -1806,7 +1968,11 @@ main() { echo -e " Skills: ${G}${SKILLS_PROFILE:-all} ($sk_total skills)${N}" fi fi - [ "$INSTALL_MCP" = true ] && echo -e " MCP config: ${G}yes${N}" + if [ "$INSTALL_MCP" = true ]; then + echo -e " MCP server: ${Y}Yes${N} (legacy) β†’ $MCP_INSTALL_PATH" + else + echo -e " MCP server: ${G}No${N} (recommended)" + fi echo "" fi @@ -1840,6 +2006,9 @@ main() { # Install skills [ "$INSTALL_SKILLS" = true ] && install_skills "$base_dir" + # Install MCP server if requested + [ "$INSTALL_MCP" = true ] && install_mcp_server + # Write GEMINI.md if gemini is selected if echo "$TOOLS" | grep -q gemini; then if [ "$SCOPE" = "global" ]; then From 5204f2dd40d30484fbd96c33f0d1672f7166cfc4 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 14:51:15 +0200 Subject: [PATCH 38/66] Add configuration persistence for faster reinstalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Save installation config (tools, profile, scope, skills, MCP) to .install-config - On reinstall, show recap of previous settings with option to reuse or reconfigure - Use hash-based schema validation: auto-detects when new config fields are added - Silent/non-interactive modes auto-apply previous config when available - Config file stored in scope-appropriate location (project or global) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 265 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 231 insertions(+), 34 deletions(-) diff --git a/install.sh b/install.sh index 6ef2b780..0c177d8d 100644 --- a/install.sh +++ b/install.sh @@ -83,6 +83,13 @@ INSTALL_SKILLS=true INSTALL_MCP="${DEVKIT_INSTALL_MCP:-false}" MCP_INSTALL_PATH="${DEVKIT_MCP_PATH:-$HOME/.ai-dev-kit}" +# Required config fields - if any new field is added here, saved configs become stale +# (hash is computed automatically, no manual version bump needed) +REQUIRED_CONFIG_FIELDS="SAVED_TOOLS 
SAVED_PROFILE SAVED_SCOPE SAVED_SKILLS_PROFILE SAVED_INSTALL_MCP" + +# Flag to track if we're using previous config +USE_PREVIOUS_CONFIG=false + # Minimum required versions MIN_CLI_VERSION="0.278.0" MIN_SDK_VERSION="0.85.0" @@ -496,6 +503,167 @@ radio_select() { echo "${values[$selected]}" } +# ─── Configuration persistence ─────────────────────────────────── +# Saves all user choices to allow quick reinstalls + +# Compute hash of required fields list (auto-detects schema changes) +get_config_schema_hash() { + # Use md5/md5sum depending on platform, truncate for readability + if command -v md5 >/dev/null 2>&1; then + echo "$REQUIRED_CONFIG_FIELDS" | md5 | cut -c1-8 + elif command -v md5sum >/dev/null 2>&1; then + echo "$REQUIRED_CONFIG_FIELDS" | md5sum | cut -c1-8 + else + # Fallback: simple checksum + echo "$REQUIRED_CONFIG_FIELDS" | cksum | cut -d' ' -f1 + fi +} + +# Get config file path (scope-aware) +get_config_file() { + local state_dir + if [ "$SCOPE" = "global" ]; then + state_dir="$INSTALL_DIR" + else + state_dir="$(pwd)/.ai-dev-kit" + fi + echo "$state_dir/.install-config" +} + +# Save current configuration to file +save_config() { + local config_file + config_file=$(get_config_file) + mkdir -p "$(dirname "$config_file")" + + local schema_hash + schema_hash=$(get_config_schema_hash) + + cat > "$config_file" << EOF +# AI Dev Kit installation configuration +# Generated on $(date -u +"%Y-%m-%d %H:%M:%S UTC") +CONFIG_SCHEMA_HASH=$schema_hash +SAVED_TOOLS=$TOOLS +SAVED_PROFILE=$PROFILE +SAVED_SCOPE=$SCOPE +SAVED_SKILLS_PROFILE=${SKILLS_PROFILE:-all} +SAVED_USER_SKILLS=$USER_SKILLS +SAVED_INSTALL_MCP=$INSTALL_MCP +SAVED_MCP_INSTALL_PATH=$MCP_INSTALL_PATH +EOF +} + +# Load and validate previous configuration +# Returns 0 if valid config exists, 1 otherwise +# Sets SAVED_* variables if successful +load_previous_config() { + # Determine where to look for config + # For project scope, check current directory + # For global scope (or auto-detect), check both + local config_file="" + + # First try project-local config + if [ -f "$(pwd)/.ai-dev-kit/.install-config" ]; then + config_file="$(pwd)/.ai-dev-kit/.install-config" + # Then try global config + elif [ -f "$INSTALL_DIR/.install-config" ]; then + config_file="$INSTALL_DIR/.install-config" + fi + + [ -z "$config_file" ] && return 1 + + # Source the config file + # shellcheck disable=SC1090 + source "$config_file" 2>/dev/null || return 1 + + # Validate schema hash matches (if new fields added, hash changes, config is stale) + local expected_hash + expected_hash=$(get_config_schema_hash) + if [ "${CONFIG_SCHEMA_HASH:-}" != "$expected_hash" ]; then + return 1 + fi + + # Validate all required fields are present and non-empty + local field + for field in $REQUIRED_CONFIG_FIELDS; do + eval "local value=\${$field:-}" + [ -z "$value" ] && return 1 + done + + return 0 +} + +# Apply loaded config to current session variables +apply_previous_config() { + TOOLS="$SAVED_TOOLS" + PROFILE="$SAVED_PROFILE" + SCOPE="$SAVED_SCOPE" + SCOPE_EXPLICIT=true + + if [ "$SAVED_SKILLS_PROFILE" = "custom" ] || [[ "$SAVED_USER_SKILLS" == *","* ]] || [[ "$SAVED_USER_SKILLS" == *" "* ]]; then + USER_SKILLS="$SAVED_USER_SKILLS" + else + SKILLS_PROFILE="$SAVED_SKILLS_PROFILE" + fi + + INSTALL_MCP="$SAVED_INSTALL_MCP" + MCP_INSTALL_PATH="${SAVED_MCP_INSTALL_PATH:-$HOME/.ai-dev-kit}" + + USE_PREVIOUS_CONFIG=true +} + +# Display previous config and ask if user wants to use it +prompt_use_previous_config() { + if ! 
load_previous_config; then + return 1 # No valid config, proceed with prompts + fi + + echo "" + echo -e " ${B}Previous installation detected${N}" + echo -e " ────────────────────────────────────" + echo -e " Tools: ${G}$(echo "$SAVED_TOOLS" | tr ' ' ', ')${N}" + echo -e " Profile: ${G}$SAVED_PROFILE${N}" + echo -e " Scope: ${G}$SAVED_SCOPE${N}" + + if [ -n "$SAVED_USER_SKILLS" ]; then + echo -e " Skills: ${G}custom selection${N}" + else + echo -e " Skills: ${G}${SAVED_SKILLS_PROFILE:-all}${N}" + fi + + if [ "$SAVED_INSTALL_MCP" = "true" ]; then + echo -e " MCP server: ${Y}Yes${N} β†’ $SAVED_MCP_INSTALL_PATH" + else + echo -e " MCP server: ${G}No${N}" + fi + echo "" + + if [ "$SILENT" = true ]; then + apply_previous_config + return 0 + fi + + if [ ! -e /dev/tty ]; then + apply_previous_config + return 0 + fi + + # Use radio selector for clear UX + local choice + choice=$(radio_select \ + "Use previous configuration|use|Quick reinstall with same settings" \ + "Reconfigure|new|Change tools, profile, or skills") + + if [ "$choice" = "use" ]; then + apply_previous_config + ok "Using previous configuration" + return 0 + else + msg "Starting fresh configuration..." + return 1 + fi +} + # ─── Tool detection & selection ───────────────────────────────── detect_tools() { # If provided via --tools flag or TOOLS env var, skip detection and prompts @@ -1902,51 +2070,77 @@ main() { step "Checking prerequisites" check_deps - # ── Step 2: Interactive tool selection ── - step "Selecting tools" - detect_tools - ok "Selected: $(echo "$TOOLS" | tr ' ' ', ')" + # ── Step 2: Check for previous configuration ── + # Only prompt if running interactively and no explicit flags were provided + if [ "$SILENT" = false ] && [ -z "$USER_TOOLS" ] && [ "$SCOPE_EXPLICIT" = false ]; then + if prompt_use_previous_config; then + # Config loaded and user chose to use it - skip to skill resolution + # Set state directory based on loaded scope + if [ "$SCOPE" = "global" ]; then + STATE_DIR="$INSTALL_DIR" + else + STATE_DIR="$(pwd)/.ai-dev-kit" + fi + # Resolve skills from loaded config + if [ "$INSTALL_SKILLS" = true ]; then + resolve_skills + fi + # Skip to confirmation (USE_PREVIOUS_CONFIG is true) + fi + fi + + # ── Interactive configuration (skip if using previous config) ── + if [ "$USE_PREVIOUS_CONFIG" = false ]; then + # ── Step 2: Interactive tool selection ── + step "Selecting tools" + detect_tools + ok "Selected: $(echo "$TOOLS" | tr ' ' ', ')" - # ── Step 3: Interactive profile selection ── - step "Databricks profile" - prompt_profile - ok "Profile: $PROFILE" + # ── Step 3: Interactive profile selection ── + step "Databricks profile" + prompt_profile + ok "Profile: $PROFILE" - # ── Step 3.5: Interactive scope selection ── - if [ "$SCOPE_EXPLICIT" = false ]; then - prompt_scope - ok "Scope: $SCOPE" + # ── Step 3.5: Interactive scope selection ── + if [ "$SCOPE_EXPLICIT" = false ]; then + prompt_scope + ok "Scope: $SCOPE" + fi fi # Set state directory based on scope (for profile/manifest storage) + # (Also set when using previous config, but doesn't hurt to ensure it's set) if [ "$SCOPE" = "global" ]; then STATE_DIR="$INSTALL_DIR" else STATE_DIR="$(pwd)/.ai-dev-kit" fi - # ── Step 4: Skill profile selection ── - if [ "$INSTALL_SKILLS" = true ]; then - step "Skill profiles" - prompt_skills_profile - resolve_skills - # Count for display - local sk_count=0 - for _ in $SELECTED_SKILLS $SELECTED_MLFLOW_SKILLS $SELECTED_APX_SKILLS; do sk_count=$((sk_count + 1)); done - if [ -n "$USER_SKILLS" ]; then - ok "Custom selection 
($sk_count skills)" - else - ok "Profile: ${SKILLS_PROFILE:-all} ($sk_count skills)" + # ── Continue interactive configuration (skip if using previous config) ── + if [ "$USE_PREVIOUS_CONFIG" = false ]; then + # ── Step 4: Skill profile selection ── + if [ "$INSTALL_SKILLS" = true ]; then + step "Skill profiles" + prompt_skills_profile + resolve_skills + # Count for display + local sk_count=0 + for _ in $SELECTED_SKILLS $SELECTED_MLFLOW_SKILLS $SELECTED_APX_SKILLS; do sk_count=$((sk_count + 1)); done + if [ -n "$USER_SKILLS" ]; then + ok "Custom selection ($sk_count skills)" + else + ok "Profile: ${SKILLS_PROFILE:-all} ($sk_count skills)" + fi fi - fi - # ── Step 4.5: MCP server installation prompt ── - step "MCP server (deprecated)" - prompt_mcp_install - if [ "$INSTALL_MCP" = true ]; then - ok "Will install MCP server to: $MCP_INSTALL_PATH" - else - ok "Skipping MCP server (recommended)" + # ── Step 4.5: MCP server installation prompt ── + step "MCP server (deprecated)" + prompt_mcp_install + if [ "$INSTALL_MCP" = true ]; then + ok "Will install MCP server to: $MCP_INSTALL_PATH" + else + ok "Skipping MCP server (recommended)" + fi fi # ── Step 5: Confirm before proceeding ── @@ -2023,10 +2217,13 @@ main() { # Save version save_version - + + # Save configuration for quick reinstalls + save_config + # Prompt to run auth prompt_auth - + # Done summary } From 820b4aad45c93b09dfa13abede9360eba3913a7a Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 15:01:10 +0200 Subject: [PATCH 39/66] Consolidate config persistence and pre-select from previous config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Rename .install-config to .ai-dev-kit-install-config - Remove old .skills-profile mechanism (now unified in config file) - Add HAS_PREVIOUS_CONFIG flag for pre-selection mode - Pre-select all prompts from saved values when reconfiguring: - Tools: shows "previous" hint on saved selections - Databricks profile: pre-selects saved profile - Scope: pre-selects project/global - Skills: pre-selects skill profiles - MCP: pre-selects install option - Simplify "Keep this configuration? 
(Y/n)" prompt - Make header/prerequisites more compact (single line) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 226 ++++++++++++++++++++++++++--------------------------- 1 file changed, 109 insertions(+), 117 deletions(-) diff --git a/install.sh b/install.sh index 0c177d8d..9a48188c 100644 --- a/install.sh +++ b/install.sh @@ -87,8 +87,9 @@ MCP_INSTALL_PATH="${DEVKIT_MCP_PATH:-$HOME/.ai-dev-kit}" # (hash is computed automatically, no manual version bump needed) REQUIRED_CONFIG_FIELDS="SAVED_TOOLS SAVED_PROFILE SAVED_SCOPE SAVED_SKILLS_PROFILE SAVED_INSTALL_MCP" -# Flag to track if we're using previous config +# Flags to track config state USE_PREVIOUS_CONFIG=false +HAS_PREVIOUS_CONFIG=false # True if previous config exists (for pre-selecting defaults) # Minimum required versions MIN_CLI_VERSION="0.278.0" @@ -527,7 +528,7 @@ get_config_file() { else state_dir="$(pwd)/.ai-dev-kit" fi - echo "$state_dir/.install-config" + echo "$state_dir/.ai-dev-kit-install-config" } # Save current configuration to file @@ -563,11 +564,11 @@ load_previous_config() { local config_file="" # First try project-local config - if [ -f "$(pwd)/.ai-dev-kit/.install-config" ]; then - config_file="$(pwd)/.ai-dev-kit/.install-config" + if [ -f "$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" ]; then + config_file="$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" # Then try global config - elif [ -f "$INSTALL_DIR/.install-config" ]; then - config_file="$INSTALL_DIR/.install-config" + elif [ -f "$INSTALL_DIR/.ai-dev-kit-install-config" ]; then + config_file="$INSTALL_DIR/.ai-dev-kit-install-config" fi [ -z "$config_file" ] && return 1 @@ -613,53 +614,36 @@ apply_previous_config() { } # Display previous config and ask if user wants to use it +# Returns 0 if user wants to keep previous config (skip prompts) +# Returns 1 if user wants to reconfigure (but SAVED_* are set as defaults) prompt_use_previous_config() { if ! load_previous_config; then return 1 # No valid config, proceed with prompts fi echo "" - echo -e " ${B}Previous installation detected${N}" - echo -e " ────────────────────────────────────" - echo -e " Tools: ${G}$(echo "$SAVED_TOOLS" | tr ' ' ', ')${N}" - echo -e " Profile: ${G}$SAVED_PROFILE${N}" - echo -e " Scope: ${G}$SAVED_SCOPE${N}" - + echo -e " ${B}Previous installation${N}" + echo -e " Tools: ${G}$(echo "$SAVED_TOOLS" | tr ' ' ', ')${N}, Profile: ${G}$SAVED_PROFILE${N}, Scope: ${G}$SAVED_SCOPE${N}" if [ -n "$SAVED_USER_SKILLS" ]; then - echo -e " Skills: ${G}custom selection${N}" - else - echo -e " Skills: ${G}${SAVED_SKILLS_PROFILE:-all}${N}" - fi - - if [ "$SAVED_INSTALL_MCP" = "true" ]; then - echo -e " MCP server: ${Y}Yes${N} β†’ $SAVED_MCP_INSTALL_PATH" + echo -e " Skills: ${G}custom${N}, MCP: ${G}${SAVED_INSTALL_MCP}${N}" else - echo -e " MCP server: ${G}No${N}" - fi - echo "" - - if [ "$SILENT" = true ]; then - apply_previous_config - return 0 + echo -e " Skills: ${G}${SAVED_SKILLS_PROFILE:-all}${N}, MCP: ${G}${SAVED_INSTALL_MCP}${N}" fi - if [ ! -e /dev/tty ]; then + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then apply_previous_config return 0 fi - # Use radio selector for clear UX - local choice - choice=$(radio_select \ - "Use previous configuration|use|Quick reinstall with same settings" \ - "Reconfigure|new|Change tools, profile, or skills") + local keep + keep=$(prompt "Keep this configuration? 
${D}(Y/n)${N}" "y") - if [ "$choice" = "use" ]; then + if [ "$keep" = "y" ] || [ "$keep" = "Y" ] || [ "$keep" = "yes" ] || [ -z "$keep" ]; then apply_previous_config - ok "Using previous configuration" return 0 else - msg "Starting fresh configuration..." + # User wants to reconfigure - SAVED_* values remain set as defaults for prompts + HAS_PREVIOUS_CONFIG=true return 1 fi } @@ -694,17 +678,28 @@ detect_tools() { # Build checkbox items: "Label|value|on_or_off|hint" local claude_state="off" cursor_state="off" codex_state="off" copilot_state="off" gemini_state="off" antigravity_state="off" local claude_hint="not found" cursor_hint="not found" codex_hint="not found" copilot_hint="not found" gemini_hint="not found" antigravity_hint="not found" - [ "$has_claude" = true ] && claude_state="on" && claude_hint="detected" - [ "$has_cursor" = true ] && cursor_state="on" && cursor_hint="detected" - [ "$has_codex" = true ] && codex_state="on" && codex_hint="detected" - [ "$has_copilot" = true ] && copilot_state="on" && copilot_hint="detected" - [ "$has_gemini" = true ] && gemini_state="on" && gemini_hint="detected" - [ "$has_antigravity" = true ] && antigravity_state="on" && antigravity_hint="detected" - # If nothing detected, pre-select claude as default - if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ]; then - claude_state="on" - claude_hint="default" + # If previous config exists, use those selections; otherwise use auto-detection + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_TOOLS" ]; then + [[ " $SAVED_TOOLS " == *" claude "* ]] && claude_state="on" && claude_hint="previous" + [[ " $SAVED_TOOLS " == *" cursor "* ]] && cursor_state="on" && cursor_hint="previous" + [[ " $SAVED_TOOLS " == *" codex "* ]] && codex_state="on" && codex_hint="previous" + [[ " $SAVED_TOOLS " == *" copilot "* ]] && copilot_state="on" && copilot_hint="previous" + [[ " $SAVED_TOOLS " == *" gemini "* ]] && gemini_state="on" && gemini_hint="previous" + [[ " $SAVED_TOOLS " == *" antigravity "* ]] && antigravity_state="on" && antigravity_hint="previous" + else + [ "$has_claude" = true ] && claude_state="on" && claude_hint="detected" + [ "$has_cursor" = true ] && cursor_state="on" && cursor_hint="detected" + [ "$has_codex" = true ] && codex_state="on" && codex_hint="detected" + [ "$has_copilot" = true ] && copilot_state="on" && copilot_hint="detected" + [ "$has_gemini" = true ] && gemini_state="on" && gemini_hint="detected" + [ "$has_antigravity" = true ] && antigravity_state="on" && antigravity_hint="detected" + + # If nothing detected, pre-select claude as default + if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ]; then + claude_state="on" + claude_hint="default" + fi fi # Interactive or fallback @@ -749,6 +744,7 @@ prompt_profile() { # Skip in silent mode or non-interactive if [ "$SILENT" = true ] || [ ! 
-e /dev/tty ]; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_PROFILE" ] && PROFILE="$SAVED_PROFILE" return fi @@ -769,24 +765,29 @@ prompt_profile() { echo -e " ${B}Select Databricks profile${N}" if [ ${#profiles[@]} -gt 0 ] && [ -e /dev/tty ]; then + # Determine which profile to pre-select + local preselect="DEFAULT" + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_PROFILE" ] && preselect="$SAVED_PROFILE" + # Build radio items: "Label|value|on_or_off|hint" local -a items=() + local found_preselect=false for p in "${profiles[@]}"; do local state="off" local hint="" - [ "$p" = "DEFAULT" ] && state="on" && hint="default" + if [ "$p" = "$preselect" ]; then + state="on" + hint="previous" + found_preselect=true + fi items+=("${p}|${p}|${state}|${hint}") done - + # Add custom profile option at the end items+=("Custom profile name...|__CUSTOM__|off|Enter a custom profile name") - # If no DEFAULT profile exists, pre-select the first one - local has_default=false - for p in "${profiles[@]}"; do - [ "$p" = "DEFAULT" ] && has_default=true - done - if [ "$has_default" = false ]; then + # If preselect not found, select first one + if [ "$found_preselect" = false ]; then items[0]=$(echo "${items[0]}" | sed 's/|off|/|on|/') fi @@ -922,29 +923,10 @@ prompt_skills_profile() { # Skip in silent mode or non-interactive if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then - SKILLS_PROFILE="all" + SKILLS_PROFILE="${SAVED_SKILLS_PROFILE:-all}" return fi - # Check for previous selection (scope-local first, then global fallback for upgrades) - local profile_file="$STATE_DIR/.skills-profile" - [ ! -f "$profile_file" ] && [ "$SCOPE" = "project" ] && profile_file="$INSTALL_DIR/.skills-profile" - if [ -f "$profile_file" ]; then - local prev_profile - prev_profile=$(cat "$profile_file") - if [ "$FORCE" != true ]; then - echo "" - local display_profile - display_profile=$(echo "$prev_profile" | tr ',' ', ') - local keep - keep=$(prompt "Previous skill profile: ${B}${display_profile}${N}. Keep? ${D}(Y/n)${N}" "y") - if [ "$keep" = "y" ] || [ "$keep" = "Y" ] || [ "$keep" = "yes" ] || [ -z "$keep" ]; then - SKILLS_PROFILE="$prev_profile" - return - fi - fi - fi - echo "" echo -e " ${B}Select skill profile(s)${N}" @@ -952,7 +934,20 @@ prompt_skills_profile() { local -a p_labels=("All Skills" "Data Engineer" "Business Analyst" "AI/ML Engineer" "App Developer" "Custom") local -a p_values=("all" "data-engineer" "analyst" "ai-ml-engineer" "app-developer" "custom") local -a p_hints=("Install everything (34 skills)" "Pipelines, Spark, Jobs, Streaming (14 skills)" "Dashboards, SQL, Genie, Metrics (8 skills)" "Agents, RAG, Vector Search, MLflow (17 skills)" "Apps, Lakebase, Deployment (10 skills)" "Pick individual skills") - local -a p_states=(1 0 0 0 0 0) # "All" selected by default + + # Pre-select based on previous config if available + local -a p_states=(0 0 0 0 0 0) + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SKILLS_PROFILE" ]; then + # Parse comma-separated profiles and set states + IFS=',' read -ra prev_profiles <<< "$SAVED_SKILLS_PROFILE" + for prev in "${prev_profiles[@]}"; do + for i in "${!p_values[@]}"; do + [ "${p_values[$i]}" = "$prev" ] && p_states[$i]=1 + done + done + else + p_states[0]=1 # Default to "All" + fi local p_count=6 local p_cursor=0 local p_total_rows=$((p_count + 2)) @@ -1127,6 +1122,7 @@ prompt_mcp_install() { # Skip in silent mode or non-interactive if [ "$SILENT" = true ] || [ ! 
-e /dev/tty ]; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_INSTALL_MCP" = "true" ] && INSTALL_MCP=true return fi @@ -1138,8 +1134,11 @@ prompt_mcp_install() { local -a values=("no" "yes") local -a hints=("Recommended - skills work without MCP" "Legacy - requires Python venv setup") local count=2 + + # Pre-select based on previous config local selected=0 - local cursor=0 + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_INSTALL_MCP" = "true" ] && selected=1 + local cursor=$selected _mcp_draw() { for i in 0 1; do @@ -1280,15 +1279,16 @@ check_cli_version() { cli_version=$(databricks --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) if [ -z "$cli_version" ]; then - warn "Could not determine Databricks CLI version" - return + PREREQ_WARNINGS+=("Could not determine Databricks CLI version") + return 1 fi if version_gte "$cli_version" "$MIN_CLI_VERSION"; then - ok "Databricks CLI v${cli_version}" + PREREQS+=("Databricks CLI v${cli_version}") + return 0 else - warn "Databricks CLI v${cli_version} is outdated (minimum: v${MIN_CLI_VERSION})" - msg " ${B}Upgrade:${N} curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh" + PREREQ_WARNINGS+=("Databricks CLI v${cli_version} outdated (min: v${MIN_CLI_VERSION}). Upgrade: curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh") + return 1 fi } @@ -1310,51 +1310,51 @@ check_sdk_version() { fi } -# Check prerequisites +# Check prerequisites (prints inline) check_deps() { + PREREQS=() + PREREQ_WARNINGS=() + command -v git >/dev/null 2>&1 || die "git required" - ok "git" + PREREQS+=("git") if command -v databricks >/dev/null 2>&1; then check_cli_version else - warn "Databricks CLI not found. Install: ${B}curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh${N}" - msg "${D}You can still install, but authentication will require the CLI later.${N}" + PREREQ_WARNINGS+=("Databricks CLI not found. Install: curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh") fi if [ "$INSTALL_MCP" = true ]; then if command -v uv >/dev/null 2>&1; then - PKG="uv" - ok "$PKG ($(uv --version 2>/dev/null || echo 'unknown version'))" + PREREQS+=("uv $(uv --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo '')") else die "uv is required but not found on your PATH. Install it with: ${B}curl -LsSf https://astral.sh/uv/install.sh | sh${N} Then re-run this installer." fi fi + + # Print inline + if [ "$SILENT" = false ] && [ ${#PREREQS[@]} -gt 0 ]; then + local prereq_list + prereq_list=$(IFS=', '; echo "${PREREQS[*]}") + echo -e "${G}βœ“${N} ${prereq_list}" + fi + + # Print warnings on separate lines + for w in "${PREREQ_WARNINGS[@]}"; do + warn "$w" + done } # Check if update needed check_version() { local ver_file="$INSTALL_DIR/version" [ "$SCOPE" = "project" ] && ver_file=".ai-dev-kit/version" - + [ ! -f "$ver_file" ] && return [ "$FORCE" = true ] && return - # Skip version gate if user explicitly wants a different skill profile - if [ -n "$SKILLS_PROFILE" ] || [ -n "$USER_SKILLS" ]; then - local saved_profile_file="$STATE_DIR/.skills-profile" - [ ! 
-f "$saved_profile_file" ] && [ "$SCOPE" = "project" ] && saved_profile_file="$INSTALL_DIR/.skills-profile" - if [ -f "$saved_profile_file" ]; then - local saved_profile - saved_profile=$(cat "$saved_profile_file") - local requested="${USER_SKILLS:+custom:$USER_SKILLS}" - [ -z "$requested" ] && requested="$SKILLS_PROFILE" - [ "$saved_profile" != "$requested" ] && return - fi - fi - local local_ver=$(cat "$ver_file") # Use -f to fail on HTTP errors (like 404) local remote_ver=$(curl -fsSL "$RAW_URL/VERSION" 2>/dev/null || echo "") @@ -1363,7 +1363,7 @@ check_version() { if [ -n "$remote_ver" ] && [[ ! "$remote_ver" =~ (404|Not Found|error) ]]; then if [ "$local_ver" = "$remote_ver" ]; then ok "Already up to date (v${local_ver})" - msg "${D}Use --force to reinstall or --skills-profile to change profiles${N}" + msg "${D}Use --force to reinstall${N}" exit 0 fi fi @@ -1525,13 +1525,6 @@ install_skills() { # Save manifest of installed skills (for cleanup on profile change) mv "$manifest.tmp" "$manifest" - - # Save selected profile for future reinstalls (scope-local) - if [ -n "$USER_SKILLS" ]; then - echo "custom:$USER_SKILLS" > "$STATE_DIR/.skills-profile" - else - echo "${SKILLS_PROFILE:-all}" > "$STATE_DIR/.skills-profile" - fi } # Write MCP configs @@ -1884,19 +1877,23 @@ summary() { # Prompt for installation scope prompt_scope() { if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && SCOPE="${SAVED_SCOPE:-project}" return fi echo "" echo -e " ${B}Select installation scope${N}" - + # Simple radio selector without Confirm button local -a labels=("Project" "Global") local -a values=("project" "global") local -a hints=("Install in current directory (.cursor/, .claude/, .gemini/)" "Install in home directory (~/.cursor/, ~/.claude/, ~/.gemini/)") local count=2 + + # Pre-select based on previous config local selected=0 - local cursor=0 + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_SCOPE" = "global" ] && selected=1 + local cursor=$selected _scope_draw() { for i in 0 1; do @@ -2057,17 +2054,12 @@ prompt_auth() { # Main main() { - if [ "$SILENT" = false ]; then - echo "" - echo -e "${B}Databricks AI Dev Kit Installer${N}" - echo "────────────────────────────────" - fi - + [ "$SILENT" = false ] && echo -e "\n${B}Databricks AI Dev Kit Installer${N}" + # ── Step 1: Release channel selection (may re-exec from experimental branch) ── prompt_channel # Check dependencies - step "Checking prerequisites" check_deps # ── Step 2: Check for previous configuration ── From 23a2495a560ff1daf69bb7b79de65ec558541f97 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 15:10:12 +0200 Subject: [PATCH 40/66] Polish installer UX: remove version check, add previous hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove version check (always reinstall) - Remove extra blank line after experimental download message - Add "previous" hint to all pre-selected options from saved config: - Scope, MCP install, skill profiles πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) diff --git a/install.sh b/install.sh index 9a48188c..79d2caae 100644 --- a/install.sh +++ b/install.sh @@ -935,14 +935,17 @@ prompt_skills_profile() { local -a p_values=("all" "data-engineer" "analyst" "ai-ml-engineer" "app-developer" "custom") local -a p_hints=("Install everything (34 skills)" 
"Pipelines, Spark, Jobs, Streaming (14 skills)" "Dashboards, SQL, Genie, Metrics (8 skills)" "Agents, RAG, Vector Search, MLflow (17 skills)" "Apps, Lakebase, Deployment (10 skills)" "Pick individual skills") - # Pre-select based on previous config if available + # Pre-select based on previous config if available and add "previous" hint local -a p_states=(0 0 0 0 0 0) if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SKILLS_PROFILE" ]; then - # Parse comma-separated profiles and set states + # Parse comma-separated profiles and set states + hints IFS=',' read -ra prev_profiles <<< "$SAVED_SKILLS_PROFILE" for prev in "${prev_profiles[@]}"; do for i in "${!p_values[@]}"; do - [ "${p_values[$i]}" = "$prev" ] && p_states[$i]=1 + if [ "${p_values[$i]}" = "$prev" ]; then + p_states[$i]=1 + p_hints[$i]="previous" + fi done done else @@ -1135,9 +1138,12 @@ prompt_mcp_install() { local -a hints=("Recommended - skills work without MCP" "Legacy - requires Python venv setup") local count=2 - # Pre-select based on previous config + # Pre-select based on previous config and add "previous" hint local selected=0 - [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_INSTALL_MCP" = "true" ] && selected=1 + if [ "$HAS_PREVIOUS_CONFIG" = true ]; then + [ "$SAVED_INSTALL_MCP" = "true" ] && selected=1 + hints[$selected]="previous" + fi local cursor=$selected _mcp_draw() { @@ -1347,28 +1353,6 @@ check_deps() { done } -# Check if update needed -check_version() { - local ver_file="$INSTALL_DIR/version" - [ "$SCOPE" = "project" ] && ver_file=".ai-dev-kit/version" - - [ ! -f "$ver_file" ] && return - [ "$FORCE" = true ] && return - - local local_ver=$(cat "$ver_file") - # Use -f to fail on HTTP errors (like 404) - local remote_ver=$(curl -fsSL "$RAW_URL/VERSION" 2>/dev/null || echo "") - - # Validate remote version format (should not contain "404" or other error text) - if [ -n "$remote_ver" ] && [[ ! 
"$remote_ver" =~ (404|Not Found|error) ]]; then - if [ "$local_ver" = "$remote_ver" ]; then - ok "Already up to date (v${local_ver})" - msg "${D}Use --force to reinstall${N}" - exit 0 - fi - fi -} - # Setup MCP server setup_mcp() { step "Setting up MCP server" @@ -1890,9 +1874,12 @@ prompt_scope() { local -a hints=("Install in current directory (.cursor/, .claude/, .gemini/)" "Install in home directory (~/.cursor/, ~/.claude/, ~/.gemini/)") local count=2 - # Pre-select based on previous config + # Pre-select based on previous config and add "previous" hint local selected=0 - [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_SCOPE" = "global" ] && selected=1 + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SCOPE" ]; then + [ "$SAVED_SCOPE" = "global" ] && selected=1 + hints[$selected]="previous" + fi local cursor=$selected _scope_draw() { @@ -1985,7 +1972,6 @@ prompt_channel() { echo -e " Discussions: ${BL}https://github.com/databricks-solutions/ai-dev-kit/discussions${N}" echo "" echo -e " ${D}Downloading installer from experimental branch...${N}" - echo "" # Build the command with all current flags preserved local args="--experimental" @@ -2172,9 +2158,6 @@ main() { fi fi - # ── Step 7: Version check (may exit early if up to date) ── - check_version - # Determine base directory local base_dir [ "$SCOPE" = "global" ] && base_dir="$HOME" || base_dir="$(pwd)" From 2284568bb712154f68a87f30e743c390f7d79e3f Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 15:19:53 +0200 Subject: [PATCH 41/66] Fix comma-space separator in prerequisites list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install.sh b/install.sh index 79d2caae..ca9b5b76 100644 --- a/install.sh +++ b/install.sh @@ -1343,7 +1343,7 @@ check_deps() { # Print inline if [ "$SILENT" = false ] && [ ${#PREREQS[@]} -gt 0 ]; then local prereq_list - prereq_list=$(IFS=', '; echo "${PREREQS[*]}") + prereq_list=$(printf '%s, ' "${PREREQS[@]}" | sed 's/, $//') echo -e "${G}βœ“${N} ${prereq_list}" fi From f4da8c60a42e365ffadf29d3ddb4b170c293e448 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 15:26:13 +0200 Subject: [PATCH 42/66] Make config persistence robust: quote values, safe loading MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Quote all values in save_config to handle spaces correctly - Replace source with grep-based parsing (no code execution risk) - Any config error silently falls back to fresh install πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 58 +++++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/install.sh b/install.sh index ca9b5b76..0b1ea9d3 100644 --- a/install.sh +++ b/install.sh @@ -543,53 +543,53 @@ save_config() { cat > "$config_file" << EOF # AI Dev Kit installation configuration # Generated on $(date -u +"%Y-%m-%d %H:%M:%S UTC") -CONFIG_SCHEMA_HASH=$schema_hash -SAVED_TOOLS=$TOOLS -SAVED_PROFILE=$PROFILE -SAVED_SCOPE=$SCOPE -SAVED_SKILLS_PROFILE=${SKILLS_PROFILE:-all} -SAVED_USER_SKILLS=$USER_SKILLS -SAVED_INSTALL_MCP=$INSTALL_MCP -SAVED_MCP_INSTALL_PATH=$MCP_INSTALL_PATH +CONFIG_SCHEMA_HASH="$schema_hash" +SAVED_TOOLS="$TOOLS" +SAVED_PROFILE="$PROFILE" +SAVED_SCOPE="$SCOPE" 
+SAVED_SKILLS_PROFILE="${SKILLS_PROFILE:-all}" +SAVED_USER_SKILLS="$USER_SKILLS" +SAVED_INSTALL_MCP="$INSTALL_MCP" +SAVED_MCP_INSTALL_PATH="$MCP_INSTALL_PATH" EOF } # Load and validate previous configuration # Returns 0 if valid config exists, 1 otherwise # Sets SAVED_* variables if successful +# Robust: any error silently falls back to fresh install load_previous_config() { - # Determine where to look for config - # For project scope, check current directory - # For global scope (or auto-detect), check both local config_file="" - # First try project-local config + # First try project-local config, then global if [ -f "$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" ]; then config_file="$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" - # Then try global config elif [ -f "$INSTALL_DIR/.ai-dev-kit-install-config" ]; then config_file="$INSTALL_DIR/.ai-dev-kit-install-config" fi - [ -z "$config_file" ] && return 1 - # Source the config file - # shellcheck disable=SC1090 - source "$config_file" 2>/dev/null || return 1 - - # Validate schema hash matches (if new fields added, hash changes, config is stale) + # Safely read config using grep instead of source (avoids code execution) + CONFIG_SCHEMA_HASH=$(grep -E '^CONFIG_SCHEMA_HASH=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_TOOLS=$(grep -E '^SAVED_TOOLS=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_PROFILE=$(grep -E '^SAVED_PROFILE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_SCOPE=$(grep -E '^SAVED_SCOPE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_SKILLS_PROFILE=$(grep -E '^SAVED_SKILLS_PROFILE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_INSTALL_MCP=$(grep -E '^SAVED_INSTALL_MCP=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_USER_SKILLS=$(grep -E '^SAVED_USER_SKILLS=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || true + SAVED_MCP_INSTALL_PATH=$(grep -E '^SAVED_MCP_INSTALL_PATH=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || true + + # Validate schema hash matches local expected_hash expected_hash=$(get_config_schema_hash) - if [ "${CONFIG_SCHEMA_HASH:-}" != "$expected_hash" ]; then - return 1 - fi - - # Validate all required fields are present and non-empty - local field - for field in $REQUIRED_CONFIG_FIELDS; do - eval "local value=\${$field:-}" - [ -z "$value" ] && return 1 - done + [ "${CONFIG_SCHEMA_HASH:-}" != "$expected_hash" ] && return 1 + + # Validate required fields are present + [ -z "$SAVED_TOOLS" ] && return 1 + [ -z "$SAVED_PROFILE" ] && return 1 + [ -z "$SAVED_SCOPE" ] && return 1 + [ -z "$SAVED_SKILLS_PROFILE" ] && return 1 + [ -z "$SAVED_INSTALL_MCP" ] && return 1 return 0 } From 395e79d5131c1c0f9e460ded445945d0ea9e2409 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 17:23:57 +0200 Subject: [PATCH 43/66] Fix installer bugs: dedup mcp-path, remove double install, use Python 3.12 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove duplicate --mcp-path case in argument parser - Remove dead INSTALL_MCP=true assignment (was immediately overwritten) - Remove duplicate MCP server line in summary - Remove redundant install_mcp_server call (setup_mcp already handles it) - Use Python 3.12 instead of 3.11 for venv creation - Add --allow-existing to install_mcp_server venv creation πŸ€– Generated with [Claude Code](https://claude.com/claude-code) 
Co-Authored-By: Claude --- install.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/install.sh b/install.sh index 0b1ea9d3..9ac4ca20 100644 --- a/install.sh +++ b/install.sh @@ -78,7 +78,6 @@ else fi # Installation mode defaults -INSTALL_MCP=true INSTALL_SKILLS=true INSTALL_MCP="${DEVKIT_INSTALL_MCP:-false}" MCP_INSTALL_PATH="${DEVKIT_MCP_PATH:-$HOME/.ai-dev-kit}" @@ -140,13 +139,12 @@ while [ $# -gt 0 ]; do -b|--branch) BRANCH="$2"; shift 2 ;; --skills-only) INSTALL_MCP=false; shift ;; --mcp-only) INSTALL_SKILLS=false; shift ;; - --mcp-path) USER_MCP_PATH="$2"; shift 2 ;; + --mcp-path) USER_MCP_PATH="$2"; MCP_INSTALL_PATH="$2"; INSTALL_MCP=true; shift 2 ;; --skills-profile) SKILLS_PROFILE="$2"; shift 2 ;; --skills) USER_SKILLS="$2"; shift 2 ;; --list-skills) LIST_SKILLS=true; shift ;; --silent) SILENT=true; shift ;; --mcp) INSTALL_MCP=true; shift ;; - --mcp-path) MCP_INSTALL_PATH="$2"; INSTALL_MCP=true; shift 2 ;; --tools) USER_TOOLS="$2"; shift 2 ;; --experimental) CHANNEL="experimental"; shift ;; -f|--force) FORCE=true; shift ;; @@ -1238,7 +1236,7 @@ install_mcp_server() { # Create virtual environment msg "Creating virtual environment..." cd "$mcp_dir" - uv venv --python 3.11 -q + uv venv --python 3.12 --allow-existing -q 2>/dev/null || uv venv --allow-existing -q ok "Virtual environment created" # Install packages @@ -1382,7 +1380,7 @@ setup_mcp() { fi msg "Installing Python dependencies..." - $arch_prefix uv venv --python 3.11 --allow-existing "$VENV_DIR" -q 2>/dev/null || $arch_prefix uv venv --allow-existing "$VENV_DIR" -q + $arch_prefix uv venv --python 3.12 --allow-existing "$VENV_DIR" -q 2>/dev/null || $arch_prefix uv venv --allow-existing "$VENV_DIR" -q $arch_prefix uv pip install --python "$VENV_PYTHON" -e "$REPO_DIR/databricks-tools-core" -e "$REPO_DIR/databricks-mcp-server" -q "$VENV_PYTHON" -c "import databricks_mcp_server" 2>/dev/null || die "MCP server install failed" @@ -2130,7 +2128,6 @@ main() { echo -e " Tools: ${G}$(echo "$TOOLS" | tr ' ' ', ')${N}" echo -e " Profile: ${G}${PROFILE}${N}" echo -e " Scope: ${G}${SCOPE}${N}" - [ "$INSTALL_MCP" = true ] && echo -e " MCP server: ${G}${INSTALL_DIR}${N}" if [ "$INSTALL_SKILLS" = true ]; then if [ -n "$USER_SKILLS" ]; then echo -e " Skills: ${G}custom selection${N}" @@ -2175,9 +2172,6 @@ main() { # Install skills [ "$INSTALL_SKILLS" = true ] && install_skills "$base_dir" - # Install MCP server if requested - [ "$INSTALL_MCP" = true ] && install_mcp_server - # Write GEMINI.md if gemini is selected if echo "$TOOLS" | grep -q gemini; then if [ "$SCOPE" = "global" ]; then From 091c327072b944441969a0053536e19da07ca64a Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 15 Apr 2026 18:15:41 +0200 Subject: [PATCH 44/66] Clean up installer: remove dead code, refactor UI, add profile env support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanup (~200 lines removed): - Remove dead functions: install_mcp_server(), check_sdk_version(), prompt_mcp_path() - Refactor prompt_scope() and prompt_mcp_install() to use radio_select() New feature - Claude profile env: - Add write_claude_env() to set DATABRICKS_CONFIG_PROFILE in .claude/settings.json - Only prompt for profile when Claude + project scope (not global) - Reorder flow: tools β†’ scope β†’ profile (so we know scope before asking profile) πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- install.sh | 353 +++++++++++++++++------------------------------------ 1 
file changed, 110 insertions(+), 243 deletions(-) diff --git a/install.sh b/install.sh index 9ac4ca20..379dda51 100644 --- a/install.sh +++ b/install.sh @@ -760,7 +760,8 @@ prompt_profile() { fi echo "" - echo -e " ${B}Select Databricks profile${N}" + echo -e " ${B}Which Databricks profile for this project?${N}" + echo -e " ${D}This will be set in .claude/settings.json for Claude Code to use.${N}" if [ ${#profiles[@]} -gt 0 ] && [ -e /dev/tty ]; then # Determine which profile to pre-select @@ -810,32 +811,6 @@ prompt_profile() { fi } -# ─── MCP path selection ──────────────────────────────────────── -prompt_mcp_path() { - # If provided via --mcp-path flag, skip prompt - if [ -n "$USER_MCP_PATH" ]; then - INSTALL_DIR="$USER_MCP_PATH" - elif [ "$SILENT" = false ] && [ -e /dev/tty ]; then - [ "$SILENT" = false ] && echo "" - [ "$SILENT" = false ] && echo -e " ${B}MCP server location${N}" - [ "$SILENT" = false ] && echo -e " ${D}The MCP server runtime (Python venv + source) will be installed here.${N}" - [ "$SILENT" = false ] && echo -e " ${D}Shared across all your projects β€” only the config files are per-project.${N}" - [ "$SILENT" = false ] && echo "" - - local selected - selected=$(prompt "Install path" "$INSTALL_DIR") - - # Expand ~ to $HOME - INSTALL_DIR="${selected/#\~/$HOME}" - fi - - # Update derived paths - REPO_DIR="$INSTALL_DIR/repo" - VENV_DIR="$INSTALL_DIR/.venv" - VENV_PYTHON="$VENV_DIR/bin/python" - MCP_ENTRY="$REPO_DIR/databricks-mcp-server/run_server.py" -} - # ─── Skill profile selection ────────────────────────────────── # Resolve selected skills from profile names or explicit skill list resolve_skills() { @@ -1131,69 +1106,27 @@ prompt_mcp_install() { echo -e " ${B}Deprecated MCP Server${N}" echo -e " ${D}Skills now work via CLI for better performance. 
MCP server is optional for backwards compatibility.${N}" - local -a labels=("Do not install" "Install MCP server") - local -a values=("no" "yes") - local -a hints=("Recommended - skills work without MCP" "Legacy - requires Python venv setup") - local count=2 + # Build radio items with previous config pre-selection + local skip_state="on" skip_hint="Recommended - skills work without MCP" + local install_state="off" install_hint="Legacy - requires Python venv setup" - # Pre-select based on previous config and add "previous" hint - local selected=0 if [ "$HAS_PREVIOUS_CONFIG" = true ]; then - [ "$SAVED_INSTALL_MCP" = "true" ] && selected=1 - hints[$selected]="previous" - fi - local cursor=$selected - - _mcp_draw() { - for i in 0 1; do - local dot="β—‹" - local dot_color="\033[2m" - [ "$i" = "$selected" ] && dot="●" && dot_color="\033[0;32m" - local arrow=" " - [ "$i" = "$cursor" ] && arrow="\033[0;34m❯\033[0m " - local hint_style="\033[2m" - [ "$i" = "$selected" ] && hint_style="\033[0;32m" - printf "\033[2K %b%b%b %-20s %b%s\033[0m\n" "$arrow" "$dot_color" "$dot" "${labels[$i]}" "$hint_style" "${hints[$i]}" > /dev/tty - done - } - - printf "\n \033[2m↑/↓ navigate Β· enter select\033[0m\n\n" > /dev/tty - printf "\033[?25l" > /dev/tty - trap 'printf "\033[?25h" > /dev/tty 2>/dev/null' EXIT - - _mcp_draw - - while true; do - printf "\033[%dA" "$count" > /dev/tty - _mcp_draw - - local key="" - IFS= read -rsn1 key < /dev/tty 2>/dev/null - - if [ "$key" = $'\x1b' ]; then - local s1="" s2="" - read -rsn1 s1 < /dev/tty 2>/dev/null - read -rsn1 s2 < /dev/tty 2>/dev/null - if [ "$s1" = "[" ]; then - case "$s2" in - A) [ "$cursor" -gt 0 ] && cursor=$((cursor - 1)) ;; - B) [ "$cursor" -lt 1 ] && cursor=$((cursor + 1)) ;; - esac - fi - elif [ "$key" = "" ]; then - selected=$cursor - printf "\033[%dA" "$count" > /dev/tty - _mcp_draw - break - elif [ "$key" = " " ]; then - selected=$cursor + if [ "$SAVED_INSTALL_MCP" = "true" ]; then + skip_state="off" + install_state="on" + install_hint="previous" + else + skip_hint="previous" fi - done + fi - printf "\033[?25h" > /dev/tty - trap - EXIT + local selected + selected=$(radio_select \ + "Do not install|no|${skip_state}|${skip_hint}" \ + "Install MCP server|yes|${install_state}|${install_hint}" \ + ) - if [ "${values[$selected]}" = "yes" ]; then + if [ "$selected" = "yes" ]; then INSTALL_MCP=true # Prompt for install path echo "" @@ -1201,77 +1134,6 @@ prompt_mcp_install() { fi } -# Install MCP server (venv + packages) -install_mcp_server() { - step "Installing MCP server" - - # Check for uv - if ! command -v uv &> /dev/null; then - warn "'uv' is not installed. Install it with: curl -LsSf https://astral.sh/uv/install.sh | sh" - die "MCP server installation requires 'uv'" - fi - ok "uv is installed" - - local mcp_dir="$MCP_INSTALL_PATH" - local tools_core_dir="$REPO_DIR/databricks-tools-core" - local mcp_server_dir="$REPO_DIR/databricks-mcp-server" - - # Verify source directories exist - if [ ! -d "$tools_core_dir" ]; then - die "databricks-tools-core not found at $tools_core_dir" - fi - if [ ! -d "$mcp_server_dir" ]; then - die "databricks-mcp-server not found at $mcp_server_dir" - fi - - # Create install directory - mkdir -p "$mcp_dir" - - # Copy source files - msg "Copying MCP server source files..." - cp -r "$tools_core_dir" "$mcp_dir/" - cp -r "$mcp_server_dir" "$mcp_dir/" - ok "Source files copied to $mcp_dir" - - # Create virtual environment - msg "Creating virtual environment..." 
- cd "$mcp_dir" - uv venv --python 3.12 --allow-existing -q 2>/dev/null || uv venv --allow-existing -q - ok "Virtual environment created" - - # Install packages - msg "Installing databricks-tools-core..." - uv pip install --python .venv/bin/python -e "$mcp_dir/databricks-tools-core" --quiet - ok "databricks-tools-core installed" - - msg "Installing databricks-mcp-server..." - uv pip install --python .venv/bin/python -e "$mcp_dir/databricks-mcp-server" --quiet - ok "databricks-mcp-server installed" - - # Verify installation - if .venv/bin/python -c "import databricks_mcp_server" 2>/dev/null; then - ok "MCP server verified" - else - warn "MCP server import verification failed" - fi - - # Save MCP config for later reference - echo "$mcp_dir" > "$mcp_dir/.mcp-install-path" - - cd - > /dev/null - - msg "" - msg "${B}MCP server installed at:${N} $mcp_dir" - msg "" - msg "To configure Claude Code, add to your project's .mcp.json:" - msg " {\"mcpServers\": {\"databricks\": {" - msg " \"command\": \"$mcp_dir/.venv/bin/python\"," - msg " \"args\": [\"$mcp_dir/databricks-mcp-server/run_server.py\"]" - msg " }}}" - msg "" - msg "Or use: ${B}claude mcp add-json databricks '{\"command\":\"$mcp_dir/.venv/bin/python\",\"args\":[\"$mcp_dir/databricks-mcp-server/run_server.py\"]}'${N}" -} - # Compare semantic versions (returns 0 if $1 >= $2) version_gte() { printf '%s\n%s' "$2" "$1" | sort -V -C @@ -1296,24 +1158,6 @@ check_cli_version() { fi } -# Check Databricks SDK version in the MCP venv -check_sdk_version() { - local sdk_version - sdk_version=$("$VENV_PYTHON" -c "from databricks.sdk.version import __version__; print(__version__)" 2>/dev/null) - - if [ -z "$sdk_version" ]; then - warn "Could not determine Databricks SDK version" - return - fi - - if version_gte "$sdk_version" "$MIN_SDK_VERSION"; then - ok "Databricks SDK v${sdk_version}" - else - warn "Databricks SDK v${sdk_version} is outdated (minimum: v${MIN_SDK_VERSION})" - msg " ${B}Upgrade:${N} $VENV_PYTHON -m pip install --upgrade databricks-sdk" - fi -} - # Check prerequisites (prints inline) check_deps() { PREREQS=() @@ -1672,6 +1516,57 @@ GEMINIEOF ok "GEMINI.md" } +# Write DATABRICKS_CONFIG_PROFILE to Claude settings.json env section +# Safely merges with existing settings using Python or jq +write_claude_env() { + local path=$1 + local profile=$2 + mkdir -p "$(dirname "$path")" + + # Try Python first (most reliable for JSON manipulation) + if command -v python3 >/dev/null 2>&1; then + python3 -c " +import json +path = '$path' +profile = '$profile' +try: + with open(path) as f: cfg = json.load(f) +except: cfg = {} +env = cfg.setdefault('env', {}) +env['DATABRICKS_CONFIG_PROFILE'] = profile +with open(path, 'w') as f: json.dump(cfg, f, indent=2); f.write('\n') +" 2>/dev/null && return 0 + fi + + # Fallback: jq if available + if command -v jq >/dev/null 2>&1; then + if [ -f "$path" ]; then + local tmp="${path}.tmp" + jq --arg p "$profile" '.env = (.env // {}) | .env.DATABRICKS_CONFIG_PROFILE = $p' "$path" > "$tmp" && mv "$tmp" "$path" + else + echo "{\"env\":{\"DATABRICKS_CONFIG_PROFILE\":\"$profile\"}}" | jq '.' > "$path" + fi + return 0 + fi + + # Last resort: create new file only if it doesn't exist + if [ ! -f "$path" ]; then + cat > "$path" << EOF +{ + "env": { + "DATABRICKS_CONFIG_PROFILE": "$profile" + } +} +EOF + return 0 + fi + + # Can't safely merge without Python or jq + warn "Cannot update $path without python3 or jq. 
Add manually:" + msg " \"env\": {\"DATABRICKS_CONFIG_PROFILE\": \"$profile\"}" + return 1 +} + write_claude_hook() { local path=$1 local script=$2 @@ -1866,70 +1761,24 @@ prompt_scope() { echo "" echo -e " ${B}Select installation scope${N}" - # Simple radio selector without Confirm button - local -a labels=("Project" "Global") - local -a values=("project" "global") - local -a hints=("Install in current directory (.cursor/, .claude/, .gemini/)" "Install in home directory (~/.cursor/, ~/.claude/, ~/.gemini/)") - local count=2 + # Build radio items with previous config pre-selection + local project_state="on" project_hint="Install in current directory" + local global_state="off" global_hint="Install in home directory" - # Pre-select based on previous config and add "previous" hint - local selected=0 if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SCOPE" ]; then - [ "$SAVED_SCOPE" = "global" ] && selected=1 - hints[$selected]="previous" - fi - local cursor=$selected - - _scope_draw() { - for i in 0 1; do - local dot="β—‹" - local dot_color="\033[2m" - [ "$i" = "$selected" ] && dot="●" && dot_color="\033[0;32m" - local arrow=" " - [ "$i" = "$cursor" ] && arrow="\033[0;34m❯\033[0m " - local hint_style="\033[2m" - [ "$i" = "$selected" ] && hint_style="\033[0;32m" - printf "\033[2K %b%b%b %-20s %b%s\033[0m\n" "$arrow" "$dot_color" "$dot" "${labels[$i]}" "$hint_style" "${hints[$i]}" > /dev/tty - done - } - - printf "\n \033[2m↑/↓ navigate Β· enter select\033[0m\n\n" > /dev/tty - printf "\033[?25l" > /dev/tty - trap 'printf "\033[?25h" > /dev/tty 2>/dev/null' EXIT - - _scope_draw - - while true; do - printf "\033[%dA" "$count" > /dev/tty - _scope_draw - - local key="" - IFS= read -rsn1 key < /dev/tty 2>/dev/null - - if [ "$key" = $'\x1b' ]; then - local s1="" s2="" - read -rsn1 s1 < /dev/tty 2>/dev/null - read -rsn1 s2 < /dev/tty 2>/dev/null - if [ "$s1" = "[" ]; then - case "$s2" in - A) [ "$cursor" -gt 0 ] && cursor=$((cursor - 1)) ;; - B) [ "$cursor" -lt 1 ] && cursor=$((cursor + 1)) ;; - esac - fi - elif [ "$key" = "" ]; then - selected=$cursor - printf "\033[%dA" "$count" > /dev/tty - _scope_draw - break - elif [ "$key" = " " ]; then - selected=$cursor + if [ "$SAVED_SCOPE" = "global" ]; then + project_state="off" + global_state="on" + global_hint="previous" + else + project_hint="previous" fi - done - - printf "\033[?25h" > /dev/tty - trap - EXIT - - SCOPE="${values[$selected]}" + fi + + SCOPE=$(radio_select \ + "Project|project|${project_state}|${project_hint}" \ + "Global|global|${global_state}|${global_hint}" \ + ) } # Prompt for release channel (stable vs experimental) @@ -1989,8 +1838,13 @@ prompt_channel() { fi } -# Prompt to run auth +# Prompt to run auth (only for Claude + project scope) prompt_auth() { + # Skip if not Claude or if global scope + if ! echo "$TOOLS" | grep -qw "claude" || [ "$SCOPE" = "global" ]; then + return + fi + if [ "$SILENT" = true ] || [ ! 
-e /dev/tty ]; then return fi @@ -2072,16 +1926,20 @@ main() { detect_tools ok "Selected: $(echo "$TOOLS" | tr ' ' ', ')" - # ── Step 3: Interactive profile selection ── - step "Databricks profile" - prompt_profile - ok "Profile: $PROFILE" - - # ── Step 3.5: Interactive scope selection ── + # ── Step 3: Interactive scope selection ── if [ "$SCOPE_EXPLICIT" = false ]; then prompt_scope ok "Scope: $SCOPE" fi + + # ── Step 4: Interactive profile selection (only if Claude + project scope) ── + # Profile is set in .claude/settings.json env, so only for project-scoped installs + # to avoid messing with global settings that affect all projects + if echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ]; then + step "Databricks profile for this project" + prompt_profile + ok "Profile: $PROFILE" + fi fi # Set state directory based on scope (for profile/manifest storage) @@ -2126,8 +1984,9 @@ main() { echo -e " ────────────────────────────────────" [ "$CHANNEL" = "experimental" ] && echo -e " Channel: ${Y}experimental πŸ§ͺ${N}" echo -e " Tools: ${G}$(echo "$TOOLS" | tr ' ' ', ')${N}" - echo -e " Profile: ${G}${PROFILE}${N}" echo -e " Scope: ${G}${SCOPE}${N}" + # Only show profile for Claude + project scope (where it's actually used) + echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ] && echo -e " Profile: ${G}${PROFILE}${N}" if [ "$INSTALL_SKILLS" = true ]; then if [ -n "$USER_SKILLS" ]; then echo -e " Skills: ${G}custom selection${N}" @@ -2172,6 +2031,14 @@ main() { # Install skills [ "$INSTALL_SKILLS" = true ] && install_skills "$base_dir" + # Write Databricks profile to Claude settings.json (project scope only) + if echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ]; then + local claude_settings="$base_dir/.claude/settings.json" + if write_claude_env "$claude_settings" "$PROFILE"; then + ok "Claude env: DATABRICKS_CONFIG_PROFILE=$PROFILE" + fi + fi + # Write GEMINI.md if gemini is selected if echo "$TOOLS" | grep -q gemini; then if [ "$SCOPE" = "global" ]; then From 7a2ca949eac0c9e741972b3c585ed5f74b9352c0 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 20 Apr 2026 14:26:05 +0200 Subject: [PATCH 45/66] Clarify catalog/schema behavior in AI/BI dashboard skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The --dataset-catalog and --dataset-schema CLI flags only fill in missing parts of a query β€” they do NOT override catalog/schema hardcoded in the FROM clause. Dashboard queries must use bare table names only (e.g., "FROM trips", not "FROM nyctaxi.trips"). 
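For concreteness, a minimal sketch of the corrected creation call described above. The warehouse ID, the dashboard file name, and the samples/nyctaxi catalog and schema values are placeholders chosen to mirror the table used elsewhere in this skill; they are not part of the patch:

```bash
# Sketch only: WAREHOUSE_ID, dashboard.json, and the samples/nyctaxi values are placeholders.
# Inside dashboard.json, dataset queries reference the bare table name, for example:
#   "queryLines": ["SELECT pickup_zip, COUNT(*) AS trip_count ", "FROM trips GROUP BY pickup_zip"]
# The flags below supply the catalog and schema for such queries; they do NOT rewrite
# a catalog or schema that is hardcoded in a FROM clause.
databricks lakeview create \
  --display-name "Trips Overview" \
  --warehouse-id "WAREHOUSE_ID" \
  --dataset-catalog "samples" \
  --dataset-schema "nyctaxi" \
  --serialized-dashboard "$(cat dashboard.json)"
```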
- SKILL.md: rewrite note with βœ…/❌ examples and a "why" explanation - 4-examples.md: update example queries to use bare table names - 3-filters.md: update example query to use bare table name πŸ€– Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- .../databricks-aibi-dashboards/3-filters.md | 2 +- .../databricks-aibi-dashboards/4-examples.md | 6 +++--- .../databricks-aibi-dashboards/SKILL.md | 21 +++++++++++++------ 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/3-filters.md b/databricks-skills/databricks-aibi-dashboards/3-filters.md index c981b1a4..f5fd55de 100644 --- a/databricks-skills/databricks-aibi-dashboards/3-filters.md +++ b/databricks-skills/databricks-aibi-dashboards/3-filters.md @@ -151,7 +151,7 @@ Place filter widget directly on a `PAGE_TYPE_CANVAS` page (same widget structure { "name": "revenue_by_category", "queryLines": [ - "SELECT category, SUM(revenue) as revenue FROM sales.orders ", + "SELECT category, SUM(revenue) as revenue FROM orders ", "WHERE order_date BETWEEN :date_range.min AND :date_range.max ", "GROUP BY category" ], diff --git a/databricks-skills/databricks-aibi-dashboards/4-examples.md b/databricks-skills/databricks-aibi-dashboards/4-examples.md index c240c70d..a44e83df 100644 --- a/databricks-skills/databricks-aibi-dashboards/4-examples.md +++ b/databricks-skills/databricks-aibi-dashboards/4-examples.md @@ -59,7 +59,7 @@ This example shows a complete dashboard with: - Data table for detailed records - Global filters (date range, region, category) -> **Note**: Queries use `schema.table` format (no catalog). Set the catalog via `--dataset-catalog "my_catalog"` when creating the dashboard. +> **Note**: Queries reference bare table names only (no catalog, no schema). Catalog and schema are set via `--dataset-catalog "my_catalog" --dataset-schema "gold"` when creating the dashboard. These flags only apply when the query omits catalog/schema β€” they will NOT override anything you hardcode in the `FROM` clause. 
```json { @@ -69,7 +69,7 @@ This example shows a complete dashboard with: "displayName": "Daily Sales", "queryLines": [ "SELECT sale_date, region, department, total_orders, total_units, total_revenue, total_cost, profit_margin ", - "FROM gold.daily_sales ", + "FROM daily_sales ", "ORDER BY sale_date" ] }, @@ -78,7 +78,7 @@ This example shows a complete dashboard with: "displayName": "Product Performance", "queryLines": [ "SELECT product_id, product_name, department, region, units_sold, revenue, cost, profit ", - "FROM gold.product_performance" + "FROM product_performance" ] } ], diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 6e549fc8..0efb71ef 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -17,7 +17,7 @@ A dashboard should be showing something relevant for a human, typically some KPI | List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | | Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | | Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | -| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` (always set `--dataset-catalog` and `--dataset-schema` β€” queries MUST use bare table names only) | | Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | | Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | | Delete | `databricks lakeview trash DASHBOARD_ID` | @@ -64,7 +64,14 @@ databricks experimental aitools tools discover-schema samples.nyctaxi.trips main databricks experimental aitools tools query --warehouse WAREHOUSE_ID "" ``` -> **Note**: The `discover-schema` command requires full `catalog.schema.table` paths, but **dashboard queries should use `schema.table` format** with catalog set via `--dataset-catalog` at dashboard creation. +> **Note**: The `discover-schema` command needs the full `catalog.schema.table` path (it's a separate exploration tool). +> +> **Dashboard queries are different** β€” inside the dashboard JSON, the `FROM` clause must reference ONLY the table name, with no catalog or schema prefix: +> - βœ… Correct: `FROM trips` +> - ❌ Wrong: `FROM nyctaxi.trips` +> - ❌ Wrong: `FROM samples.nyctaxi.trips` +> +> The catalog and schema are supplied separately via the `--dataset-catalog` and `--dataset-schema` flags when you run `databricks lakeview create`. These flags do NOT rewrite the query β€” they only fill in the catalog/schema when the query omits them. If you hardcode a catalog or schema in the `FROM` clause, the flags are ignored for that query and the dashboard won't be portable across environments. 
### Step 3: Verify Data Matches Story @@ -99,8 +106,10 @@ Before writing JSON, plan your dashboard: Once created, you can edit the file as following: ```bash # Create a dashboard -# IMPORTANT: Use --dataset-catalog and --dataset-schema to set defaults for all queries -# This way, queries can use schema.table format instead of catalog.schema.table +# IMPORTANT: Use --dataset-catalog and --dataset-schema to set the catalog/schema for all queries +# Queries in the JSON MUST use bare table names only (e.g., "FROM trips"), +# NOT "FROM schema.trips" and NOT "FROM catalog.schema.trips". +# The CLI flags only provide a default β€” they do NOT override hardcoded catalog/schema in queries. databricks lakeview create \ --display-name "My Dashboard" \ --warehouse-id "abc123def456" \ @@ -143,7 +152,7 @@ Every dashboard's `serialized_dashboard` content must follow this exact structur { "name": "ds_x", "displayName": "Dataset X", - "queryLines": ["SELECT col1, col2 ", "FROM schema.table"] + "queryLines": ["SELECT col1, col2 ", "FROM my_table"] } ], "pages": [ @@ -213,7 +222,7 @@ Apply unless user specifies otherwise: - **One dataset per domain** (e.g., orders, customers, products). Datasets shared across widgets benefit from the same filters. - **Exactly ONE valid SQL query per dataset** (no multiple queries separated by `;`) -- **NEVER specify catalog in queries** - use `schema.table` format (e.g., `gold.daily_sales`). Set the default catalog and schema via CLI options `--dataset-catalog` and `--dataset-schema` when creating the dashboard +- **Queries must use bare table names only** β€” no catalog, no schema prefix. Example: `FROM orders`, never `FROM gold.orders` or `FROM main.gold.orders`. The catalog and schema come from the `--dataset-catalog` and `--dataset-schema` flags at creation time. These flags only fill in missing parts β€” they do NOT override any catalog/schema written in the query. - SELECT must include all dimensions needed by widgets and all derived columns via `AS` aliases - Put ALL business logic (CASE/WHEN, COALESCE, ratios) into the dataset SELECT with explicit aliases - **Contract rule**: Every widget `fieldName` must exactly match a dataset column or alias From 9f41acf191b9e04a21d047f850ec02e7032fd851 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 23 Apr 2026 23:56:42 +0200 Subject: [PATCH 46/66] Refactor Lakebase Autoscaling skill to CLI-first SKILL.md becomes a dense hub with one worked CLI example per concept (projects, branches, endpoints, credentials, reverse ETL). Deep-dive subfiles cover internals, limits, and advanced CLI, with an "SDK equivalents" section at the bottom of each. connection-patterns.md stays SDK-based since in-process OAuth token refresh is the one legitimate runtime SDK use case. Also fixes CLI bugs found during live testing: create-project and generate-database-credential take positional args (not flags); default endpoint is named primary (not ep-primary); duration fields use suspend_timeout_duration / history_retention_duration (not _seconds). 
Co-authored-by: Isaac --- .../databricks-lakebase-autoscale/SKILL.md | 375 +++++++----------- .../databricks-lakebase-autoscale/branches.md | 263 +++++------- .../databricks-lakebase-autoscale/computes.md | 261 +++++------- .../connection-patterns.md | 18 +- .../databricks-lakebase-autoscale/projects.md | 249 ++++-------- .../reverse-etl.md | 219 +++++----- 6 files changed, 525 insertions(+), 860 deletions(-) diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index 9f857882..fdcbb28f 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -5,293 +5,202 @@ description: "Patterns and best practices for Lakebase Autoscaling (next-gen man # Lakebase Autoscaling -Patterns and best practices for using Lakebase Autoscaling, the next-generation managed PostgreSQL on Databricks with autoscaling compute, branching, scale-to-zero, and instant restore. +Next-generation managed PostgreSQL on Databricks β€” autoscaling compute (0.5-112 CU, ~2 GB/CU), Git-like branching, scale-to-zero, and point-in-time restore (up to 35 days). -## When to Use +**Interface: Databricks CLI (`databricks postgres ...`).** Every admin command below uses the CLI. The only place SDK is preferred is inside an application that needs to refresh 1-hour OAuth tokens β€” see [connection-patterns.md](connection-patterns.md). -Use this skill when: -- Building applications that need a PostgreSQL database with autoscaling compute -- Working with database branching for dev/test/staging workflows -- Adding persistent state to applications with scale-to-zero cost savings -- Implementing reverse ETL from Delta Lake to an operational database via synced tables -- Managing Lakebase Autoscaling projects, branches, computes, or credentials +## Hierarchy -## Overview +``` +Project β†’ Branch(es) β†’ Endpoint(s) (compute) + Database(s) (Postgres DBs) +``` -Lakebase Autoscaling is Databricks' next-generation managed PostgreSQL service for OLTP workloads. It provides autoscaling compute, Git-like branching, scale-to-zero, and instant point-in-time restore. +A new project includes by default: a `production` branch, a primary R/W endpoint named `primary` (1 CU min/max, autoscaling on, scale-to-zero off), a `databricks_postgres` database, and a Postgres role for the creating user. -| Feature | Description | -|---------|-------------| -| **Autoscaling Compute** | 0.5-112 CU with 2 GB RAM per CU; scales dynamically based on load | -| **Scale-to-Zero** | Compute suspends after configurable inactivity timeout | -| **Branching** | Create isolated database environments (like Git branches) for dev/test | -| **Instant Restore** | Point-in-time restore from any moment within the configured window (up to 35 days) | -| **OAuth Authentication** | Token-based auth via Databricks SDK (1-hour expiry) | -| **Reverse ETL** | Sync data from Delta tables to PostgreSQL via synced tables | +Resource names are hierarchical paths: `projects/{id}/branches/{id}/endpoints/{id}`. IDs are 1-63 chars, lowercase/digits/hyphens, no leading or trailing hyphen, immutable after creation. -**Available Regions (AWS):** us-east-1, us-east-2, eu-central-1, eu-west-1, eu-west-2, ap-south-1, ap-southeast-1, ap-southeast-2 +**Regions (AWS):** us-east-1, us-east-2, eu-central-1, eu-west-1, eu-west-2, ap-south-1, ap-southeast-1, ap-southeast-2. **Azure (Beta):** eastus2, westeurope, westus. 
-**Available Regions (Azure Beta):** eastus2, westeurope, westus +All CLI create/update/delete operations are long-running β€” the CLI blocks until the LRO completes and returns the final resource. -## Project Hierarchy +--- -Understanding the hierarchy is essential for working with Lakebase Autoscaling: +## Projects -``` -Project (top-level container) - └── Branch(es) (isolated database environments) - β”œβ”€β”€ Compute (primary R/W endpoint) - β”œβ”€β”€ Read Replica(s) (optional, read-only) - β”œβ”€β”€ Role(s) (Postgres roles) - └── Database(s) (Postgres databases) - └── Schema(s) -``` +Top-level container. One per application or environment grouping. -| Object | Description | -|--------|-------------| -| **Project** | Top-level container. Created via `w.postgres.create_project()`. | -| **Branch** | Isolated database environment with copy-on-write storage. Default branch is `production`. | -| **Compute** | Postgres server powering a branch. Configurable CU sizing and autoscaling. | -| **Database** | Standard Postgres database within a branch. Default is `databricks_postgres`. | - -## Quick Start - -Create a project and connect: - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Project, ProjectSpec - -w = WorkspaceClient() - -# Create a project (long-running operation) -operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" -) -result = operation.wait() -print(f"Created project: {result.name}") +```bash +# Create β€” variants: pg_version 16 | 17. PROJECT_ID is positional. +databricks postgres create-project my-app \ + --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' + +# Get / list +databricks postgres get-project projects/my-app +databricks postgres list-projects + +# Update β€” positional arg is the field mask +databricks postgres update-project projects/my-app spec.display_name \ + --json '{"spec": {"display_name": "Renamed App"}}' + +# Delete (irreversible β€” wipes all branches/data; drop UC catalogs/synced tables first) +databricks postgres delete-project projects/my-app ``` -## Common Patterns +β†’ Defaults, limits table, LRO mechanics, SDK equivalents: [projects.md](projects.md). + +--- -### Generate OAuth Token +## Branches -```python -from databricks.sdk import WorkspaceClient +Isolated DB environments sharing storage with their parent via copy-on-write. The default branch is `production` (cannot be deleted). Branches can be TTL-expiring or permanent. 
-w = WorkspaceClient() +```bash +# Create β€” variants: "ttl": "604800s" (TTL in seconds) | "no_expiry": true (permanent) +databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' -# Generate database credential for connecting (optionally scoped to an endpoint) -cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" -) -token = cred.token # Use as password in connection string -# Token expires after 1 hour -``` +# Get / list +databricks postgres get-branch projects/my-app/branches/development +databricks postgres list-branches projects/my-app -### Connect from Notebook - -```python -import psycopg -from databricks.sdk import WorkspaceClient - -w = WorkspaceClient() - -# Get endpoint details -endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary" -) -host = endpoint.status.hosts.host - -# Generate token (scoped to endpoint) -cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" -) - -# Connect using psycopg3 -conn_string = ( - f"host={host} " - f"dbname=databricks_postgres " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" -) -with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) -``` +# Protect (protected branches can't be deleted/reset/archived) +databricks postgres update-branch projects/my-app/branches/production \ + spec.is_protected --json '{"spec": {"is_protected": true}}' -### Create a Branch for Development - -```python -from databricks.sdk.service.postgres import Branch, BranchSpec, Duration - -# Create a dev branch with 7-day expiration -branch = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" -).wait() -print(f"Branch created: {branch.name}") -``` +# Reset to parent's latest state (destroys local changes; not for root/protected/parent branches) +databricks postgres reset-branch projects/my-app/branches/development -### Resize Compute (Autoscaling) - -```python -from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask - -# Update compute to autoscale between 2-8 CU -w.postgres.update_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - endpoint=Endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - autoscaling_limit_max_cu=8.0 - ) - ), - update_mask=FieldMask(field_mask=[ - "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) -).wait() +# Delete (children must be deleted first; protection must be removed first) +databricks postgres delete-branch projects/my-app/branches/development ``` -## Reference Files +β†’ Copy-on-write internals, TTL rules (max 30 days), reset constraints, SDK equivalents: [branches.md](branches.md). 
-- [projects.md](projects.md) - Project management patterns and settings -- [branches.md](branches.md) - Branching workflows, protection, and expiration -- [computes.md](computes.md) - Compute sizing, autoscaling, and scale-to-zero -- [connection-patterns.md](connection-patterns.md) - Connection patterns for different use cases -- [reverse-etl.md](reverse-etl.md) - Synced tables from Delta Lake to Lakebase +--- -## CLI Commands +## Endpoints (Compute) -### Project Management +A compute runs Postgres for one branch. One R/W endpoint per branch (plus optional read replicas). Autoscale range: 0.5-32 CU with max-min ≀ 8 CU. Large fixed sizes: 36-112 CU. ```bash -# Create a project (PROJECT_ID is positional) -databricks postgres create-project my-app \ - --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' +# Create an R/W endpoint β€” replace RW with ENDPOINT_TYPE_READ_ONLY for read replicas +databricks postgres create-endpoint \ + projects/my-app/branches/production my-compute \ + --json '{"spec": {"endpoint_type": "ENDPOINT_TYPE_READ_WRITE", + "autoscaling_limit_min_cu": 0.5, + "autoscaling_limit_max_cu": 4.0}}' + +# Get host, state, CU range +databricks postgres get-endpoint projects/my-app/branches/production/endpoints/primary + +# Resize β€” mask is a comma-separated positional; JSON holds new values +databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' -# List projects -databricks postgres list-projects +# Delete +databricks postgres delete-endpoint projects/my-app/branches/production/endpoints/my-compute +``` -# Get project details -databricks postgres get-project projects/my-app +**Scale-to-zero:** off on `production` by default, configurable elsewhere (min 60s, default 5min). Reactivation takes a few hundred ms; session context (temp tables, prepared statements, in-memory cache) is **reset** on wake. -# Delete a project -databricks postgres delete-project projects/my-app +β†’ CU sizing table, autoscaling math, scale-to-zero internals, SDK equivalents: [computes.md](computes.md). + +--- + +## Credentials & Connecting + +OAuth tokens are 1-hour TTL and used as the Postgres password with `sslmode=require`. + +```bash +# Generate a scoped OAuth token (use as PGPASSWORD). ENDPOINT is positional. 
+databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary ``` -### Branch Management +Full connection-string recipe (use with `psql`, psycopg, or any Postgres client): ```bash -# Create a branch with TTL -databricks postgres create-branch projects/my-app development \ - --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": {"seconds": 604800}}}' +ENDPOINT="projects/my-app/branches/production/endpoints/primary" +HOST=$(databricks postgres get-endpoint "$ENDPOINT" | jq -r '.status.hosts.host') +USER=$(databricks current-user me | jq -r '.userName') +TOKEN=$(databricks postgres generate-database-credential "$ENDPOINT" | jq -r '.token') -# Create a branch with no expiry -databricks postgres create-branch projects/my-app development \ - --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' +# psycopg keyword form: +echo "host=$HOST dbname=databricks_postgres user=$USER password=$TOKEN sslmode=require" -# List branches -databricks postgres list-branches projects/my-app +# Postgres URI form (the user is an email, so URL-encode @ as %40): +echo "postgresql://${USER/@/%40}:$TOKEN@$HOST:5432/databricks_postgres?sslmode=require" -# Delete a branch -databricks postgres delete-branch projects/my-app/branches/development +# Connect with psql: +PGPASSWORD="$TOKEN" psql "host=$HOST dbname=databricks_postgres user=$USER sslmode=require" ``` -### Endpoint Management +Token TTL is ~1 hour. For app deployment, store **only the endpoint path** as config and generate the token at startup (and every ~50 min thereafter) β€” never bake the token into env files. -```bash -# Get endpoint details -databricks postgres get-endpoint projects/my-app/branches/production/endpoints/ep-primary +Application code is the one place to use the SDK β€” tokens expire hourly and must be refreshed in-process. -# Update endpoint autoscaling limits -databricks postgres update-endpoint projects/my-app/branches/production/endpoints/ep-primary \ - --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' -``` +β†’ Runtime connection patterns (minimal SDK snippet, SQLAlchemy pooling, async refresh loop, macOS DNS workaround, static-URL local dev): [connection-patterns.md](connection-patterns.md). -### OAuth Credentials +--- + +## Reverse ETL (Synced Tables) + +Syncs Unity Catalog Delta tables into Lakebase as Postgres tables via managed Lakeflow pipelines. Modes: `SNAPSHOT` (one-shot) | `TRIGGERED` (scheduled, needs CDF) | `CONTINUOUS` (~15s latency, needs CDF). ```bash -# Generate database credential (for connections) -databricks postgres generate-database-credential \ - --endpoint projects/my-app/branches/production/endpoints/ep-primary +# Create a synced table β€” swap scheduling_policy to SNAPSHOT | TRIGGERED | CONTINUOUS +databricks database create-synced-database-table \ + --json '{"name": "lakebase_catalog.schema.synced_table", + "spec": {"source_table_full_name": "analytics.gold.user_profiles", + "primary_key_columns": ["user_id"], + "scheduling_policy": "TRIGGERED", + "new_pipeline_spec": {"storage_catalog": "lakebase_catalog", + "storage_schema": "staging"}}}' + +# Status (detailed_state shows sync progress) +databricks database get-synced-database-table lakebase_catalog.schema.synced_table ``` -## Key Differences from Lakebase Provisioned +Enable CDF on the source for TRIGGERED/CONTINUOUS: `ALTER TABLE ... SET TBLPROPERTIES (delta.enableChangeDataFeed = true)`. 
Each synced table uses up to 16 connections and counts against per-branch limits. -| Aspect | Provisioned | Autoscaling | -|--------|-------------|-------------| -| SDK module | `w.database` | `w.postgres` | -| Top-level resource | Instance | Project | -| Capacity | CU_1, CU_2, CU_4, CU_8 (16 GB/CU) | 0.5-112 CU (2 GB/CU) | -| Branching | Not supported | Full branching support | -| Scale-to-zero | Not supported | Configurable timeout | -| Operations | Synchronous | Long-running operations (LRO) | -| Read replicas | Readable secondaries | Dedicated read-only endpoints | +β†’ Mode comparison, type mapping (UC β†’ Postgres), capacity planning, schema evolution rules, SDK equivalents: [reverse-etl.md](reverse-etl.md). + +--- ## Common Issues | Issue | Solution | |-------|----------| -| **Token expired during long query** | Implement token refresh loop; tokens expire after 1 hour | -| **Connection refused after scale-to-zero** | Compute wakes automatically on connection; reactivation takes a few hundred ms; implement retry logic | -| **DNS resolution fails on macOS** | Use `dig` command to resolve hostname, pass `hostaddr` to psycopg | -| **Branch deletion blocked** | Delete child branches first; cannot delete branches with children | -| **Autoscaling range too wide** | Max - min cannot exceed 8 CU (e.g., 8-16 CU is valid, 0.5-32 CU is not) | -| **SSL required error** | Always use `sslmode=require` in connection string | -| **Update mask required** | All update operations require an `update_mask` specifying fields to modify | -| **Connection closed after 24h idle** | All connections have a 24-hour idle timeout and 3-day max lifetime; implement retry logic | - -## Current Limitations - -These features are NOT yet supported in Lakebase Autoscaling: -- High availability with readable secondaries (use read replicas instead) -- Databricks Apps UI integration (Apps can connect manually via credentials) -- Feature Store integration -- Stateful AI agents (LangChain memory) -- Postgres-to-Delta sync (only Delta-to-Postgres reverse ETL) -- Custom billing tags and serverless budget policies -- Direct migration from Lakebase Provisioned (use pg_dump/pg_restore or reverse ETL) - -## SDK Version Requirements - -- **Databricks SDK for Python**: >= 0.81.0 (for `w.postgres` module) -- **psycopg**: 3.x (supports `hostaddr` parameter for DNS workaround) -- **SQLAlchemy**: 2.x with `postgresql+psycopg` driver - -```python -%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy -``` +| Token expired during long query | Refresh tokens every ~50 min (1h TTL) | +| Connection refused after scale-to-zero | Compute wakes on connect (~few hundred ms); add retry logic | +| DNS resolution fails on macOS | Pass `hostaddr` (resolved via `dig`) alongside `host` to psycopg | +| Branch delete blocked | Delete child branches first; remove protection first | +| Autoscaling range rejected | max-min must be ≀ 8 CU (e.g., 8-16 valid; 0.5-32 invalid) | +| SSL required error | Always `sslmode=require` | +| Update mask required | CLI `update-*` commands take the mask as a positional arg | +| Connection closed after 24h idle | 24h idle timeout, 3-day max lifetime β€” add retry | + +## Not Yet Supported + +HA readable secondaries (use read replicas), Databricks Apps UI integration (manual connect works), Feature Store, stateful AI agents (LangChain memory), Postgresβ†’Delta sync, custom billing tags / serverless budget policies, direct Provisionedβ†’Autoscaling migration (use `pg_dump`/`pg_restore` or reverse ETL). 
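+
+The migration workaround named above can stay on standard Postgres tooling. A minimal sketch, assuming `SRC_HOST`, `SRC_USER`, and `SRC_PASSWORD` describe your existing Provisioned instance, that both sides use the default `databricks_postgres` database, and that the target is the example project used on this page:
+
+```bash
+ENDPOINT="projects/my-app/branches/production/endpoints/primary"
+DST_HOST=$(databricks postgres get-endpoint "$ENDPOINT" | jq -r '.status.hosts.host')
+DST_USER=$(databricks current-user me | jq -r '.userName')
+DST_TOKEN=$(databricks postgres generate-database-credential "$ENDPOINT" | jq -r '.token')
+
+# Dump the provisioned database in custom format, then restore it into the autoscaling branch
+PGSSLMODE=require PGPASSWORD="$SRC_PASSWORD" pg_dump -h "$SRC_HOST" -U "$SRC_USER" \
+  -d databricks_postgres --format=custom --file=migration.pgdump
+PGSSLMODE=require PGPASSWORD="$DST_TOKEN" pg_restore -h "$DST_HOST" -U "$DST_USER" \
+  -d databricks_postgres --no-owner migration.pgdump
+```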
-## Notes +## Provisioned vs Autoscaling -- **Compute Units** in Autoscaling provide ~2 GB RAM each (vs 16 GB in Provisioned). -- **Resource naming** follows hierarchical paths: `projects/{id}/branches/{id}/endpoints/{id}`. -- All create/update/delete operations are **long-running** -- use `.wait()` in the SDK. -- Tokens are short-lived (1 hour) -- production apps MUST implement token refresh. -- **Postgres versions** 16 and 17 are supported. +| Aspect | Provisioned | Autoscaling | +|--------|-------------|-------------| +| CLI group | `databricks database` | `databricks postgres` | +| Top-level | Instance | Project | +| Capacity | CU_1/2/4/8 (16 GB/CU) | 0.5-112 CU (2 GB/CU) | +| Branching | No | Yes | +| Scale-to-zero | No | Yes | +| Operations | Synchronous | LRO | +| Read replicas | Readable secondaries | Dedicated read-only endpoints | ## Related Skills -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - fixed-capacity managed PostgreSQL (predecessor) -- **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence -- **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend -- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for project management and token generation -- **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps with Lakebase resources -- **[databricks-jobs](../databricks-jobs/SKILL.md)** - scheduling reverse ETL sync jobs +- [databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md) β€” fixed-capacity predecessor +- [databricks-app-apx](../databricks-app-apx/SKILL.md), [databricks-app-python](../databricks-app-python/SKILL.md) β€” apps using Lakebase +- [databricks-bundles](../databricks-bundles/SKILL.md) β€” bundle deploys with Lakebase resources +- [databricks-jobs](../databricks-jobs/SKILL.md) β€” scheduling reverse ETL pipelines diff --git a/databricks-skills/databricks-lakebase-autoscale/branches.md b/databricks-skills/databricks-lakebase-autoscale/branches.md index f44f7234..ade6428a 100644 --- a/databricks-skills/databricks-lakebase-autoscale/branches.md +++ b/databricks-skills/databricks-lakebase-autoscale/branches.md @@ -1,212 +1,133 @@ -# Lakebase Autoscaling Branches +# Lakebase Autoscaling β€” Branches (deep dive) -## Overview +Deep dive for the Branches concept. Basic CLI is in [SKILL.md](SKILL.md). -Branches in Lakebase Autoscaling are isolated database environments that share storage with their parent through copy-on-write. They enable Git-like workflows for databases: create isolated dev/test environments, test schema changes safely, and recover from mistakes. +## How Branching Works -## Branch Types +A branch is a logical Postgres instance whose storage is a **copy-on-write** fork of its parent at a specific LSN (point in the parent's WAL history). Reads hit the shared base until a page is modified; writes create branch-local copies. This is why branches are cheap to create and diverge gradually. -| Option | Description | Use Case | -|--------|-------------|----------| -| **Current data** | Branch from latest state of parent | Development, testing with current data | -| **Past data** | Branch from a specific point in time | Point-in-time recovery, historical analysis | +Consequences: +- Creating a branch is nearly instant regardless of parent size. +- Storage grows with write volume on the branch, not with the parent's size. 
+- Resetting a branch drops its CoW layer and re-points at the parent's current state. -## Creating a Branch +## Branch Sources -### With Expiration (TTL) +When creating a branch you pick a source LSN implicitly: -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Branch, BranchSpec, Duration +| Option | `spec` field | Use case | +|--------|--------------|----------| +| Current data | `source_branch` only | Dev/test with up-to-date data | +| Past data | `source_branch` + `source_lsn` or `source_time` | Point-in-time recovery, reproduce a bug | -w = WorkspaceClient() +Past-data branching is bounded by the project's `history_retention_seconds` (default 7 days, max 35). -# Create branch with 7-day expiration -result = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" -).wait() +## TTL & Permanence -print(f"Branch created: {result.name}") -print(f"Expires: {result.status.expire_time}") -``` +Branches are either ephemeral (TTL) or permanent (`no_expiry: true`). Max TTL is 30 days from creation. You **cannot** set TTL on: +- Protected branches +- The default branch (`production`) +- Branches that have children -### Permanent Branch (No Expiration) +When a TTL branch expires, its endpoints and data are deleted. -```python -result = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - no_expiry=True - ) - ), - branch_id="staging" -).wait() -``` +## Protection -### CLI - -```bash -# With TTL -databricks postgres create-branch projects/my-app development \ - --json '{ - "spec": { - "source_branch": "projects/my-app/branches/production", - "ttl": "604800s" - } - }' - -# Permanent -databricks postgres create-branch projects/my-app staging \ - --json '{ - "spec": { - "source_branch": "projects/my-app/branches/production", - "no_expiry": true - } - }' -``` +A protected branch cannot be deleted, reset, or archived. Only 1 branch per project can be protected. Typically this is `production`. Protection is stored on the branch spec and toggled with `update-branch`. -## Getting Branch Details +## Reset -```python -branch = w.postgres.get_branch( - name="projects/my-app/branches/development" -) - -print(f"Branch: {branch.name}") -print(f"Protected: {branch.status.is_protected}") -print(f"Default: {branch.status.default}") -print(f"State: {branch.status.current_state}") -print(f"Size: {branch.status.logical_size_bytes} bytes") -``` +`reset-branch` replaces a branch's CoW layer with a fresh fork from its parent's current head. Effect: +- All local schema and data changes are discarded +- Active connections are interrupted briefly +- Cannot run on: root branches (no parent), protected branches, parents-of-others -## Listing Branches +Use reset when your dev branch has drifted and you want fresh data without recreating the branch (preserves the branch name and any downstream config). 
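+
+A quick sketch of that flow, reusing the `development` branch from SKILL.md:
+
+```bash
+databricks postgres reset-branch projects/my-app/branches/development
+
+# Confirm the branch is back in a healthy state before pointing tests at it
+databricks postgres get-branch projects/my-app/branches/development | jq -r '.status.current_state'
+```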
-```python -branches = list(w.postgres.list_branches( - parent="projects/my-app" -)) - -for branch in branches: - print(f"Branch: {branch.name}") - print(f" Default: {branch.status.default}") - print(f" Protected: {branch.status.is_protected}") -``` +## Constraints Cheat-Sheet -## Protecting a Branch +| Action | Blocked when | +|--------|-------------| +| Delete | Has children; is protected; is default | +| Reset | Is root; has children; is protected | +| TTL/expire | Is protected; is default; has children | +| Archive | Is protected | -Protected branches cannot be deleted, reset, or archived. +## Advanced CLI -```python -from databricks.sdk.service.postgres import Branch, BranchSpec, FieldMask +Past-data branch from LSN: -w.postgres.update_branch( - name="projects/my-app/branches/production", - branch=Branch( - name="projects/my-app/branches/production", - spec=BranchSpec(is_protected=True) - ), - update_mask=FieldMask(field_mask=["spec.is_protected"]) -).wait() +```bash +databricks postgres create-branch projects/my-app debug-bug-1234 \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", + "source_lsn": "0/1A2B3C4D", + "no_expiry": true}}' ``` -To remove protection: +Past-data branch from timestamp: -```python -w.postgres.update_branch( - name="projects/my-app/branches/production", - branch=Branch( - name="projects/my-app/branches/production", - spec=BranchSpec(is_protected=False) - ), - update_mask=FieldMask(field_mask=["spec.is_protected"]) -).wait() +```bash +databricks postgres create-branch projects/my-app pre-incident \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", + "source_time": "2026-04-20T14:30:00Z", + "ttl": "86400s"}}' ``` -## Updating Branch Expiration +Extend or drop a TTL: -```python +```bash # Extend to 14 days -w.postgres.update_branch( - name="projects/my-app/branches/development", - branch=Branch( - name="projects/my-app/branches/development", - spec=BranchSpec( - is_protected=False, - ttl=Duration(seconds=1209600) # 14 days - ) - ), - update_mask=FieldMask(field_mask=["spec.is_protected", "spec.expiration"]) -).wait() +databricks postgres update-branch projects/my-app/branches/development \ + spec.expiration --json '{"spec": {"ttl": "1209600s"}}' -# Remove expiration -w.postgres.update_branch( - name="projects/my-app/branches/development", - branch=Branch( - name="projects/my-app/branches/development", - spec=BranchSpec(no_expiry=True) - ), - update_mask=FieldMask(field_mask=["spec.expiration"]) -).wait() +# Convert to permanent +databricks postgres update-branch projects/my-app/branches/development \ + spec.expiration --json '{"spec": {"no_expiry": true}}' ``` -## Resetting a Branch from Parent +## Best Practices -Reset completely replaces a branch's data and schema with the latest from its parent. Local changes are lost. +- TTL everything ephemeral β€” dev/CI branches accumulate fast against the 10-unarchived limit. +- Protect `production` at project creation time, not "eventually". +- Prefer reset over recreate when you just need fresh data β€” it preserves the branch name and downstream references. +- Compare schemas between branches (`pg_dump --schema-only`) before merging changes back upstream. 
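+
+That last comparison can stay on the CLI end to end. A sketch, assuming each branch has an R/W endpoint named `primary` (create one per the SKILL.md Endpoints section if it does not) and reusing the connection recipe from SKILL.md:
+
+```bash
+for BR in production development; do
+  EP="projects/my-app/branches/$BR/endpoints/primary"
+  HOST=$(databricks postgres get-endpoint "$EP" | jq -r '.status.hosts.host')
+  TOKEN=$(databricks postgres generate-database-credential "$EP" | jq -r '.token')
+  # Schema-only dump per branch, then compare the two dumps
+  PGSSLMODE=require PGPASSWORD="$TOKEN" pg_dump --schema-only -h "$HOST" \
+    -U "$(databricks current-user me | jq -r '.userName')" -d databricks_postgres \
+    > "$BR.schema.sql"
+done
+diff production.schema.sql development.schema.sql
+```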
-```python -w.postgres.reset_branch( - name="projects/my-app/branches/development" -).wait() -``` +## Typical TTL Envelopes -**Constraints:** -- Root branches (like `production`) cannot be reset (no parent) -- Branches with children cannot be reset (delete children first) -- Connections are temporarily interrupted during reset +| Workload | TTL | +|----------|-----| +| CI run | 2-4 h | +| Demo | 24-48 h | +| Feature branch | 1-7 days | +| Long-lived test env | up to 30 days | -## Deleting a Branch +## SDK Equivalents ```python -w.postgres.delete_branch( - name="projects/my-app/branches/development" -).wait() -``` - -**Constraints:** -- Cannot delete branches with child branches (delete children first) -- Cannot delete protected branches (remove protection first) -- Cannot delete the default branch - -## Branch Expiration - -Branch expiration sets an automatic deletion timestamp. Useful for: -- **CI/CD environments**: 2-4 hours -- **Demos**: 24-48 hours -- **Feature development**: 1-7 days -- **Long-term testing**: up to 30 days - -**Maximum expiration period:** 30 days from current time. +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import Branch, BranchSpec, Duration, FieldMask -### Expiration Restrictions +w = WorkspaceClient() -- Cannot expire protected branches -- Cannot expire default branches -- Cannot expire branches that have children -- When a branch expires, all compute resources are also deleted +# Create (TTL or permanent) +w.postgres.create_branch( + parent="projects/my-app", + branch=Branch(spec=BranchSpec( + source_branch="projects/my-app/branches/production", + ttl=Duration(seconds=604800), # or: no_expiry=True + )), + branch_id="development", +).wait() -## Best Practices +# Protect +w.postgres.update_branch( + name="projects/my-app/branches/production", + branch=Branch(name="projects/my-app/branches/production", + spec=BranchSpec(is_protected=True)), + update_mask=FieldMask(field_mask=["spec.is_protected"]), +).wait() -1. **Use TTL for ephemeral branches**: Set expiration for dev/test branches to avoid accumulation -2. **Protect production branches**: Prevent accidental deletion or reset -3. **Reset instead of recreate**: Use reset from parent when you need fresh data without new branch overhead -4. **Schema diff before merge**: Compare schemas between branches before applying changes to production -5. **Monitor unarchived limit**: Only 10 unarchived branches are allowed per project +# Reset / delete +w.postgres.reset_branch(name="projects/my-app/branches/development").wait() +w.postgres.delete_branch(name="projects/my-app/branches/development").wait() +``` diff --git a/databricks-skills/databricks-lakebase-autoscale/computes.md b/databricks-skills/databricks-lakebase-autoscale/computes.md index 0f53d50c..eb0bc62e 100644 --- a/databricks-skills/databricks-lakebase-autoscale/computes.md +++ b/databricks-skills/databricks-lakebase-autoscale/computes.md @@ -1,208 +1,143 @@ -# Lakebase Autoscaling Computes +# Lakebase Autoscaling β€” Computes (deep dive) -## Overview +Deep dive for Endpoints (computes). Basic CLI is in [SKILL.md](SKILL.md). -A compute is a virtualized service that runs Postgres for a branch. Each branch has one primary read-write compute and can have optional read replicas. Computes support autoscaling, scale-to-zero, and granular sizing from 0.5 to 112 CU. +## What an Endpoint Is -## Compute Sizing +An endpoint is a Postgres server instance attached to one branch. 
Each branch has exactly one R/W endpoint (conventionally `primary`) and may have additional read-only replicas. The endpoint owns the hostname clients connect to and the CU budget that determines concurrency and RAM.
 
-Each Compute Unit (CU) allocates approximately 2 GB of RAM.
+## Compute Units
 
-### Available Sizes
+1 CU ≈ 2 GB RAM (vs ~16 GB/CU on Lakebase Provisioned — the autoscaling tier trades per-unit RAM for finer scaling granularity).
 
-| Category | Range | Notes |
-|----------|-------|-------|
-| **Autoscale computes** | 0.5-32 CU | Dynamic scaling within range (max-min <= 8 CU) |
-| **Large fixed-size** | 36-112 CU | Fixed size, no autoscaling |
+| CU | RAM | Max connections |
+|----|-----|-----------------|
+| 0.5 | ~1 GB | 104 |
+| 1 | ~2 GB | 209 |
+| 4 | ~8 GB | 839 |
+| 8 | ~16 GB | 1,678 |
+| 16 | ~32 GB | 3,357 |
+| 32 | ~64 GB | 4,000 |
+| 64 | ~128 GB | 4,000 |
+| 112 | ~224 GB | 4,000 |
 
-### Representative Sizes
+Max connections flattens at 4,000 above 32 CU — scale up past 32 CU for memory/CPU, not for connection headroom.
 
-| Compute Units | RAM | Max Connections |
-|--------------|-----|-----------------|
-| 0.5 CU | ~1 GB | 104 |
-| 1 CU | ~2 GB | 209 |
-| 4 CU | ~8 GB | 839 |
-| 8 CU | ~16 GB | 1,678 |
-| 16 CU | ~32 GB | 3,357 |
-| 32 CU | ~64 GB | 4,000 |
-| 64 CU | ~128 GB | 4,000 |
-| 112 CU | ~224 GB | 4,000 |
+## Sizing Categories
 
-**Note:** Lakebase Provisioned used ~16 GB per CU. Autoscaling uses ~2 GB per CU for more granular scaling.
+| Category | Range | Behavior |
+|----------|-------|----------|
+| Autoscale | 0.5-32 CU | Dynamic scaling; `max − min ≤ 8 CU` |
+| Large fixed | 36-112 CU | Fixed size, no autoscaling |
 
-## Creating a Compute
+**Autoscaling window constraint.** The spread between `autoscaling_limit_min_cu` and `autoscaling_limit_max_cu` cannot exceed 8 CU:
+- Valid: 4-8, 8-16, 16-24
+- Invalid: 0.5-32 (31.5 CU spread), 0.5-16 (15.5 CU spread)
 
-```python
-from databricks.sdk import WorkspaceClient
-from databricks.sdk.service.postgres import Endpoint, EndpointSpec, EndpointType
+Set the minimum high enough to keep your working set in memory — traffic that lands after a scale-up pays a cache-warm penalty until hot pages are faulted back in.
 
-w = WorkspaceClient()
+## Scale-to-Zero
 
-# Create a read-write compute endpoint
-result = w.postgres.create_endpoint(
-    parent="projects/my-app/branches/production",
-    endpoint=Endpoint(
-        spec=EndpointSpec(
-            endpoint_type=EndpointType.ENDPOINT_TYPE_READ_WRITE,
-            autoscaling_limit_min_cu=0.5,
-            autoscaling_limit_max_cu=4.0
-        )
-    ),
-    endpoint_id="my-compute"
-).wait()
+When enabled, an endpoint suspends after an inactivity window (min 60 s, default 5 min). Default state per branch:
 
-print(f"Endpoint created: {result.name}")
-print(f"Host: {result.status.hosts.host}")
-```
+| Branch | Default |
+|--------|---------|
+| `production` | Scale-to-zero **off** (always active) |
+| Others | Scale-to-zero configurable |
 
-### CLI
+
+### Wake-up
+
+Incoming connections to a suspended endpoint trigger reactivation. Expected latency is a few hundred ms, but:
+- First connection may see a timeout — applications must retry.
+- Endpoint resumes at the **minimum** of its autoscaling range; expect cache-cold performance until load ramps up.
+- All session-scoped state is lost: in-memory stats, temp tables, prepared statements, session GUCs, active transactions.
+
+If your app keeps session state server-side (e.g., Postgres advisory locks, prepared statements you don't re-prepare), disable scale-to-zero.
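+
+To check whether an endpoint is currently suspended (for example before a latency-sensitive job), read its state. A sketch, assuming a `development` branch whose R/W endpoint is named `primary`:
+
+```bash
+databricks postgres get-endpoint \
+  projects/my-app/branches/development/endpoints/primary | jq -r '.status.current_state'
+```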
+ +## Advanced CLI + +Create a read replica: ```bash databricks postgres create-endpoint \ - projects/my-app/branches/production my-compute \ - --json '{ - "spec": { - "endpoint_type": "ENDPOINT_TYPE_READ_WRITE", - "autoscaling_limit_min_cu": 0.5, - "autoscaling_limit_max_cu": 4.0 - } - }' + projects/my-app/branches/production ep-readonly-1 \ + --json '{"spec": {"endpoint_type": "ENDPOINT_TYPE_READ_ONLY", + "autoscaling_limit_min_cu": 1.0, + "autoscaling_limit_max_cu": 4.0}}' ``` -**Important:** Each branch can have only one read-write compute. +Change scale-to-zero timeout (durations are strings ending in `s`): -## Getting Compute Details +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/development/endpoints/primary \ + spec.suspend_timeout_duration \ + --json '{"spec": {"suspend_timeout_duration": "1800s"}}' # 30 min +``` -```python -endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/my-compute" -) +Disable scale-to-zero on a non-default branch (`"0s"` = off): -print(f"Endpoint: {endpoint.name}") -print(f"Type: {endpoint.status.endpoint_type}") -print(f"State: {endpoint.status.current_state}") -print(f"Host: {endpoint.status.hosts.host}") -print(f"Min CU: {endpoint.status.autoscaling_limit_min_cu}") -print(f"Max CU: {endpoint.status.autoscaling_limit_max_cu}") +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/staging/endpoints/primary \ + spec.suspend_timeout_duration \ + --json '{"spec": {"suspend_timeout_duration": "0s"}}' ``` -## Listing Computes +Convert from autoscale to a large fixed size (e.g., 64 CU): -```python -endpoints = list(w.postgres.list_endpoints( - parent="projects/my-app/branches/production" -)) - -for ep in endpoints: - print(f"Endpoint: {ep.name}") - print(f" Type: {ep.status.endpoint_type}") - print(f" CU Range: {ep.status.autoscaling_limit_min_cu}-{ep.status.autoscaling_limit_max_cu}") +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 64.0, "autoscaling_limit_max_cu": 64.0}}' ``` -## Resizing a Compute +## Sizing Guidance -Use `update_mask` to specify which fields to update: +| Factor | Recommendation | +|--------|----------------| +| Query complexity | Complex analytical queries benefit from larger computes | +| Concurrent connections | Scale up until 32 CU; past that you're buying CPU/RAM, not connections | +| Working-set size | Min CU should hold your hot data in RAM | +| Latency-sensitive apps | Disable scale-to-zero or accept wake-up retries | + +## SDK Equivalents ```python -from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import ( + Endpoint, EndpointSpec, EndpointType, FieldMask, +) -# Update min and max CU +w = WorkspaceClient() + +# Create R/W endpoint +w.postgres.create_endpoint( + parent="projects/my-app/branches/production", + endpoint=Endpoint(spec=EndpointSpec( + endpoint_type=EndpointType.ENDPOINT_TYPE_READ_WRITE, + autoscaling_limit_min_cu=0.5, + autoscaling_limit_max_cu=4.0, + )), + endpoint_id="my-compute", +).wait() + +# Resize w.postgres.update_endpoint( name="projects/my-app/branches/production/endpoints/my-compute", endpoint=Endpoint( name="projects/my-app/branches/production/endpoints/my-compute", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - 
autoscaling_limit_max_cu=8.0 - ) + spec=EndpointSpec(autoscaling_limit_min_cu=2.0, autoscaling_limit_max_cu=8.0), ), update_mask=FieldMask(field_mask=[ "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) + "spec.autoscaling_limit_max_cu", + ]), ).wait() -``` - -### CLI -```bash -# Update single field -databricks postgres update-endpoint \ - projects/my-app/branches/production/endpoints/my-compute \ - spec.autoscaling_limit_max_cu \ - --json '{"spec": {"autoscaling_limit_max_cu": 8.0}}' - -# Update multiple fields -databricks postgres update-endpoint \ - projects/my-app/branches/production/endpoints/my-compute \ - "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ - --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' -``` - -## Deleting a Compute - -```python +# Delete w.postgres.delete_endpoint( name="projects/my-app/branches/production/endpoints/my-compute" ).wait() ``` - -## Autoscaling - -Autoscaling dynamically adjusts compute resources based on workload demand. - -### Configuration - -- **Range:** 0.5-32 CU -- **Constraint:** Max - Min cannot exceed 8 CU -- **Valid examples:** 4-8 CU, 8-16 CU, 16-24 CU -- **Invalid example:** 0.5-32 CU (range of 31.5 CU) - -### Best Practices - -- Set minimum CU large enough to cache your working set in memory -- Performance may be degraded until compute scales up and caches data -- Connection limits are based on the maximum CU in the range - -## Scale-to-Zero - -Automatically suspends compute after a period of inactivity. - -| Setting | Description | -|---------|-------------| -| **Enabled** | Compute suspends after inactivity timeout (saves cost) | -| **Disabled** | Always-active compute (eliminates wake-up latency) | - -**Default behavior:** -- `production` branch: Scale-to-zero **disabled** (always active) -- Other branches: Scale-to-zero can be configured - -**Default inactivity timeout:** 5 minutes -**Minimum inactivity timeout:** 60 seconds - -### Wake-up Behavior - -When a connection arrives on a suspended compute: -1. Compute starts automatically (reactivation takes a few hundred milliseconds) -2. The connection request is handled transparently once active -3. Compute restarts at minimum autoscaling size (if autoscaling enabled) -4. Applications should implement connection retry logic for the brief reactivation period - -### Session Context After Reactivation - -When a compute suspends and reactivates, session context is **reset**: -- In-memory statistics and cache contents are cleared -- Temporary tables and prepared statements are lost -- Session-specific configuration settings reset -- Connection pools and active transactions are terminated - -If your application requires persistent session data, consider disabling scale-to-zero. 
- -## Sizing Guidance - -| Factor | Recommendation | -|--------|---------------| -| Query complexity | Complex analytical queries benefit from larger computes | -| Concurrent connections | More connections need more CPU and memory | -| Data volume | Larger datasets may need more memory for performance | -| Response time | Critical apps may require larger computes | diff --git a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md b/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md index 398862b3..111238e7 100644 --- a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md +++ b/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md @@ -1,8 +1,20 @@ -# Lakebase Autoscaling Connection Patterns +# Lakebase Autoscaling β€” Connection Patterns (deep dive) -## Overview +Deep dive for the application-runtime connection layer. Basic credential generation and a minimal Python snippet are in [SKILL.md](SKILL.md#credentials--connecting). -This document covers different connection patterns for Lakebase Autoscaling, from simple scripts to production applications with token refresh. +**Why this file uses the SDK and the others don't.** OAuth tokens are 1-hour TTL and must be refreshed from inside the running process β€” shelling out to the CLI per refresh is slow, fragile, and awkward to embed in a pool. All admin operations (project, branch, endpoint, synced-table lifecycle) stay on the CLI; only runtime token rotation and connection pooling live here. + +This document covers connection patterns from simple scripts to production applications with token refresh. + +## Requirements + +```python +%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy +``` + +- `databricks-sdk >= 0.81.0` β€” required for the `w.postgres` module +- `psycopg 3.x` β€” supports the `hostaddr` parameter for the macOS DNS workaround +- `sqlalchemy 2.x` with the `postgresql+psycopg` driver ## Authentication Methods diff --git a/databricks-skills/databricks-lakebase-autoscale/projects.md b/databricks-skills/databricks-lakebase-autoscale/projects.md index 0522a72e..1555f697 100644 --- a/databricks-skills/databricks-lakebase-autoscale/projects.md +++ b/databricks-skills/databricks-lakebase-autoscale/projects.md @@ -1,203 +1,122 @@ -# Lakebase Autoscaling Projects +# Lakebase Autoscaling β€” Projects (deep dive) -## Overview +Deep dive for the Projects concept. Basic CLI is in [SKILL.md](SKILL.md). -A project is the top-level container for Lakebase Autoscaling resources, including branches, computes, databases, and roles. Each project is isolated and contains its own Postgres version, compute defaults, and restore window settings. +## What a Project Is -## Project Structure +A project is the top-level isolation boundary: its own Postgres major version, compute defaults, restore window, and hierarchical namespace for branches/endpoints/databases/roles. Projects do not share storage or compute with each other. -``` -Project - └── Branches (production, development, staging, etc.) 
- β”œβ”€β”€ Computes (R/W compute, read replicas) - β”œβ”€β”€ Roles (Postgres roles) - └── Databases (Postgres databases) -``` +A freshly created project contains: +- Default branch `production` (cannot be deleted; default scale-to-zero **off**) +- Primary R/W endpoint named `primary` (1 CU min/max by default, autoscaling on) +- Database `databricks_postgres` +- A Postgres role mapped to the creator's Databricks identity -When a project is created, it includes by default: -- A `production` branch (the default branch) -- A primary read-write compute (8-32 CU, autoscaling enabled, scale-to-zero disabled) -- A `databricks_postgres` database -- A Postgres role for the creating user's Databricks identity +## Naming Rules -## Resource Naming - -Projects follow a hierarchical naming convention: -``` -projects/{project_id} -``` +Path: `projects/{project_id}`. The `project_id` must be 1-63 chars, `[a-z0-9-]`, no leading/trailing hyphen, and is immutable. -**Resource ID requirements:** -- 1-63 characters long -- Lowercase letters, digits, and hyphens only -- Cannot start or end with a hyphen -- Cannot be changed after creation +## Defaults & Settings -## Creating a Project +| Setting | Default | Configurable range | +|---------|---------|--------------------| +| Postgres version | 17 | 16 or 17 | +| Scale-to-zero (production) | off | off always; cannot be enabled on default branch | +| Scale-to-zero (other branches) | 5 min | β‰₯ 60 s | +| Primary endpoint CU range | 1-1 (creator can override) | 0.5-32 (autoscale) or 36-112 (fixed) | +| Restore window | 1 day | 2-35 days | -### Python SDK +Longer restore windows increase storage cost. -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Project, ProjectSpec +## Limits -w = WorkspaceClient() +| Resource | Limit | +|----------|-------| +| Projects per workspace | 1,000 | +| Concurrently active computes per project | 20 | +| Branches per project | 500 | +| Unarchived branches | 10 | +| Root branches | 3 | +| Protected branches | 1 | +| Roles / databases per branch | 500 / 500 | +| Logical data size per branch | 8 TB | +| Snapshots | 10 | +| Max history retention | 35 days | +| Min scale-to-zero time | 60 s | -# Create a project (long-running operation) -operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" -) - -# Wait for completion -result = operation.wait() -print(f"Created project: {result.name}") -print(f"Display name: {result.status.display_name}") -print(f"Postgres version: {result.status.pg_version}") -``` +## Long-Running Operations -### CLI +Every create/update/delete returns an LRO. 
The CLI blocks until `done=true` by default; programmatic callers can poll: ```bash -databricks postgres create-project my-app \ - --json '{ - "spec": { - "display_name": "My Application", - "pg_version": "17" - } - }' +# Start async and get an operation name back (advanced; the default CLI behaviour is blocking) +databricks postgres list-operations projects/my-app +databricks postgres get-operation projects/my-app/operations/op- ``` -## Getting Project Details - -### Python SDK - -```python -project = w.postgres.get_project(name="projects/my-app") - -print(f"Project: {project.name}") -print(f"Display name: {project.status.display_name}") -print(f"Postgres version: {project.status.pg_version}") -``` +## Advanced CLI -### CLI +Full update with multiple fields (durations are ISO8601-style strings like `"2592000s"`): ```bash -databricks postgres get-project projects/my-app +databricks postgres update-project projects/my-app \ + "spec.display_name,spec.history_retention_duration" \ + --json '{"spec": {"display_name": "My App (prod)", + "history_retention_duration": "2592000s"}}' # 30 days ``` -**Note:** The `spec` field is not populated for GET operations. All properties are returned in the `status` field. +Deleting a project with active synced tables fails β€” drop the UC synced tables and catalogs first, then delete. -## Listing Projects +## Field Reference -```python -projects = w.postgres.list_projects() +Returned fields on `get-project` live under `status`, **not** `spec`: -for project in projects: - print(f"Project: {project.name}") - print(f" Display name: {project.status.display_name}") - print(f" Postgres version: {project.status.pg_version}") ``` - -## Updating a Project - -Updates require an `update_mask` specifying which fields to modify: - -```python -from databricks.sdk.service.postgres import Project, ProjectSpec, FieldMask - -# Update display name -operation = w.postgres.update_project( - name="projects/my-app", - project=Project( - name="projects/my-app", - spec=ProjectSpec( - display_name="My Updated Application" - ) - ), - update_mask=FieldMask(field_mask=["spec.display_name"]) -) -result = operation.wait() +status.display_name +status.pg_version +status.history_retention_duration # e.g. "86400s" +status.enable_pg_native_login +status.owner +status.default_endpoint_settings.autoscaling_limit_min_cu +status.default_endpoint_settings.autoscaling_limit_max_cu +status.default_endpoint_settings.suspend_timeout_duration # scale-to-zero; "0s" = off +create_time +update_time +uid ``` -### CLI +## SDK Equivalents -```bash -databricks postgres update-project projects/my-app spec.display_name \ - --json '{ - "spec": { - "display_name": "My Updated Application" - } - }' -``` - -## Deleting a Project - -**WARNING:** Deleting a project is permanent and also deletes all branches, computes, databases, roles, and data. - -Delete all Unity Catalog catalogs and synced tables before deleting the project. +Use these only inside Python automation where shelling out to the CLI is awkward. Requires `databricks-sdk>=0.81.0`. ```python -operation = w.postgres.delete_project(name="projects/my-app") -# This is a long-running operation -``` - -### CLI - -```bash -databricks postgres delete-project projects/my-app -``` - -## Project Settings - -### Compute Defaults - -Default settings for new primary computes: -- Compute size range (0.5-112 CU) -- Scale-to-zero timeout (default: 5 minutes) - -### Instant Restore - -Configure the restore window length (2-35 days). Longer windows increase storage costs. 
- -### Postgres Version - -Supports Postgres 16 and Postgres 17. - -## Project Limits +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import Project, ProjectSpec, FieldMask -| Resource | Limit | -|----------|-------| -| Concurrently active computes | 20 | -| Branches per project | 500 | -| Postgres roles per branch | 500 | -| Postgres databases per branch | 500 | -| Logical data size per branch | 8 TB | -| Projects per workspace | 1000 | -| Protected branches | 1 | -| Root branches | 3 | -| Unarchived branches | 10 | -| Snapshots | 10 | -| Maximum history retention | 35 days | -| Minimum scale-to-zero time | 60 seconds | +w = WorkspaceClient() -## Long-Running Operations +# Create +w.postgres.create_project( + project=Project(spec=ProjectSpec(display_name="My App", pg_version="17")), + project_id="my-app", +).wait() -All create, update, and delete operations return a long-running operation (LRO). Use `.wait()` in the SDK to block until completion: +# Get / list +w.postgres.get_project(name="projects/my-app") +list(w.postgres.list_projects()) -```python -# Start operation -operation = w.postgres.create_project(...) +# Update +w.postgres.update_project( + name="projects/my-app", + project=Project(name="projects/my-app", + spec=ProjectSpec(display_name="Renamed App")), + update_mask=FieldMask(field_mask=["spec.display_name"]), +).wait() -# Wait for completion -result = operation.wait() +# Delete +w.postgres.delete_project(name="projects/my-app").wait() -# Or check status manually -op_status = w.postgres.get_operation(name=operation.name) -print(f"Done: {op_status.done}") +# Poll an LRO manually instead of .wait() +op = w.postgres.create_project(...) +w.postgres.get_operation(name=op.name).done ``` diff --git a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md b/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md index f983eebb..ce7fa3e9 100644 --- a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md +++ b/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md @@ -1,113 +1,44 @@ -# Reverse ETL with Lakebase Autoscaling +# Lakebase Autoscaling β€” Reverse ETL (deep dive) -## Overview +Deep dive for synced tables. Basic CLI is in [SKILL.md](SKILL.md). -Reverse ETL allows you to sync data from Unity Catalog Delta tables into Lakebase Autoscaling as PostgreSQL tables. This enables OLTP access patterns on data processed in the Lakehouse. +## How Synced Tables Work -## How It Works +A synced table is a managed replica of a Unity Catalog Delta table, materialized into Lakebase as a queryable Postgres table. Two artifacts are created: -Synced tables create a managed copy of Unity Catalog data in Lakebase: +1. A **Unity Catalog synced table** (read-only from the user's perspective, owned by the sync pipeline) +2. A **Postgres table** inside the Lakebase database, updated by the same pipeline -1. A new Unity Catalog table (read-only, managed by the sync pipeline) -2. A Postgres table in Lakebase (queryable by applications) +The pipeline is a managed Lakeflow Spark Declarative Pipeline that runs on Databricks compute β€” the user doesn't provision it directly. -The sync pipeline uses managed Lakeflow Spark Declarative Pipelines to continuously update both tables. 
+## Performance Envelope -### Performance +| Mode | Throughput (per CU) | Latency | +|------|---------------------|---------| +| Continuous writes | ~1,200 rows/s | seconds | +| Bulk writes (initial load / snapshot) | ~15,000 rows/s | minutes-hours depending on size | -- **Continuous writes:** ~1,200 rows/sec per CU -- **Bulk writes:** ~15,000 rows/sec per CU -- **Connections used:** Up to 16 per synced table +Each synced table uses **up to 16 connections** on the target endpoint. Account for this when sizing concurrency-heavy apps alongside reverse ETL on the same endpoint. ## Sync Modes -| Mode | Description | Best For | Notes | -|------|-------------|----------|-------| -| **Snapshot** | One-time full copy | Initial setup, historical analysis | 10x more efficient if modifying >10% of data | -| **Triggered** | Scheduled updates on demand | Dashboards updated hourly/daily | Requires CDF on source table | -| **Continuous** | Real-time streaming (seconds of latency) | Live applications | Highest cost, minimum 15s intervals, requires CDF | +| Mode | Mechanism | Best for | Requires CDF | +|------|-----------|----------|--------------| +| `SNAPSHOT` | One-shot full copy | Initial load, historical analysis, small reference tables; use when you'd modify >10% of data anyway | No | +| `TRIGGERED` | Scheduled incremental updates | Dashboards refreshed hourly/daily | Yes | +| `CONTINUOUS` | Streaming, min 15 s intervals | Real-time features, live apps (highest cost) | Yes | -**Note:** Triggered and Continuous modes require Change Data Feed (CDF) enabled on the source table: +Enable CDF on the source before creating TRIGGERED or CONTINUOUS synced tables: ```sql ALTER TABLE your_catalog.your_schema.your_table SET TBLPROPERTIES (delta.enableChangeDataFeed = true) ``` -## Creating Synced Tables +## Data Type Mapping (UC β†’ Postgres) -### Using Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, - SyncedTableSpec, - NewPipelineSpec, - SyncedTableSchedulingPolicy, -) - -w = WorkspaceClient() - -# Create a synced table -synced_table = w.database.create_synced_database_table( - SyncedDatabaseTable( - name="lakebase_catalog.schema.synced_table", - spec=SyncedTableSpec( - source_table_full_name="analytics.gold.user_profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - new_pipeline_spec=NewPipelineSpec( - storage_catalog="lakebase_catalog", - storage_schema="staging" - ) - ), - ) -) -print(f"Created synced table: {synced_table.name}") -``` - -### Using CLI - -```bash -databricks database create-synced-database-table \ - --json '{ - "name": "lakebase_catalog.schema.synced_table", - "spec": { - "source_table_full_name": "analytics.gold.user_profiles", - "primary_key_columns": ["user_id"], - "scheduling_policy": "TRIGGERED", - "new_pipeline_spec": { - "storage_catalog": "lakebase_catalog", - "storage_schema": "staging" - } - } - }' -``` - -## Checking Synced Table Status - -```python -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") -print(f"State: {status.data_synchronization_status.detailed_state}") -print(f"Message: {status.data_synchronization_status.message}") -``` - -## Deleting a Synced Table - -Delete from both Unity Catalog and Postgres: - -1. **Unity Catalog:** Delete from Catalog Explorer or SDK -2. 
**Postgres:** Drop the table to free storage - -```sql -DROP TABLE your_database.your_schema.your_table; -``` - -## Data Type Mapping - -| Unity Catalog Type | Postgres Type | -|-------------------|---------------| +| Unity Catalog | Postgres | +|---------------|----------| | BIGINT | BIGINT | | BINARY | BYTEA | | BOOLEAN | BOOLEAN | @@ -122,56 +53,94 @@ DROP TABLE your_database.your_schema.your_table; | TIMESTAMP | TIMESTAMP WITH TIME ZONE | | TIMESTAMP_NTZ | TIMESTAMP WITHOUT TIME ZONE | | TINYINT | SMALLINT | -| ARRAY | JSONB | -| MAP | JSONB | -| STRUCT | JSONB | +| ARRAY / MAP / STRUCT | JSONB | + +**Unsupported:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT. -**Unsupported types:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT +## Naming & Schema Rules + +- Database, schema, and table names: `[A-Za-z0-9_]+` only. +- Schema evolution on TRIGGERED/CONTINUOUS: **additive only** (new columns). Breaking changes require dropping and re-creating the synced table. +- Primary key column(s) on the source are required and must be unique. ## Capacity Planning -- **Connection usage:** Each synced table uses up to 16 connections -- **Size limits:** 2 TB total across all synced tables; recommend < 1 TB per table -- **Naming:** Database, schema, and table names only allow `[A-Za-z0-9_]+` -- **Schema evolution:** Only additive changes (e.g., adding columns) for Triggered/Continuous modes +| Dimension | Limit / guideline | +|-----------|-------------------| +| Connections used per synced table | up to 16 | +| Total synced-table size per branch | 2 TB | +| Recommended size per synced table | < 1 TB | +| CDF retention on source | Longer than the sync pipeline's commit cadence | -## Use Cases +## Deletion -### Product Catalog for Web App +Synced tables exist in two places β€” drop both: -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ecommerce_catalog.public.products", - spec=SyncedTableSpec( - source_table_full_name="gold.products.catalog", - primary_key_columns=["product_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) +1. Remove the UC synced table (Catalog Explorer, CLI, or SDK). +2. Drop the Postgres table to free branch storage: + +```sql +DROP TABLE your_database.your_schema.your_table; ``` -### Real-time Feature Serving +## Best Practices + +- Enable CDF **before** creating TRIGGERED/CONTINUOUS tables, not after β€” the initial snapshot won't include incremental updates otherwise. +- Pick the cheapest mode that meets latency: most dashboards are fine on TRIGGERED hourly. +- Index Postgres targets for your query patterns β€” synced tables arrive without indexes. +- Monitor pipeline status (`data_synchronization_status`) β€” it surfaces backpressure and schema mismatches before they affect readers. +- Keep reverse-ETL pipelines on their own branch (or at least a sized-up endpoint) when running alongside OLTP traffic. 
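+
+For the indexing bullet above, a minimal sketch of what that looks like on the Postgres side. The table and column names here are illustrative assumptions, not part of any real schema; match them to your own synced table and query patterns:
+
+```sql
+-- Hypothetical synced table and lookup column; adjust names to your schema.
+-- Run against the Lakebase database that holds the synced Postgres table.
+CREATE INDEX idx_user_profiles_email ON your_schema.user_profiles (email);
+```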
+ +## Example Use Cases + +Product catalog for a web app (hourly refresh is plenty): + +```bash +databricks database create-synced-database-table \ + --json '{"name": "ecommerce_catalog.public.products", + "spec": {"source_table_full_name": "gold.products.catalog", + "primary_key_columns": ["product_id"], + "scheduling_policy": "TRIGGERED"}}' +``` + +Real-time feature serving for ML (needs CDF on `ml.features.user_features`): + +```bash +databricks database create-synced-database-table \ + --json '{"name": "ml_catalog.public.user_features", + "spec": {"source_table_full_name": "ml.features.user_features", + "primary_key_columns": ["user_id"], + "scheduling_policy": "CONTINUOUS"}}' +``` + +## SDK Equivalents + +Synced tables use the `w.database` SDK module (not `w.postgres`): ```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.database import ( + SyncedDatabaseTable, SyncedTableSpec, + NewPipelineSpec, SyncedTableSchedulingPolicy, +) + +w = WorkspaceClient() + w.database.create_synced_database_table( SyncedDatabaseTable( - name="ml_catalog.public.user_features", + name="lakebase_catalog.schema.synced_table", spec=SyncedTableSpec( - source_table_full_name="ml.features.user_features", + source_table_full_name="analytics.gold.user_profiles", primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, + scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, + new_pipeline_spec=NewPipelineSpec( + storage_catalog="lakebase_catalog", + storage_schema="staging", + ), ), ) ) -``` - -## Best Practices -1. **Enable CDF** on source tables before creating Triggered or Continuous synced tables -2. **Choose appropriate sync mode**: Snapshot for small tables, Triggered for hourly/daily, Continuous for real-time -3. **Monitor sync status**: Check for failures and latency via Catalog Explorer -4. **Index target tables**: Create appropriate indexes in Postgres for your query patterns -5. **Handle schema changes**: Only additive changes are supported for streaming modes -6. **Account for connection limits**: Each synced table uses up to 16 connections +status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") +print(status.data_synchronization_status.detailed_state) +``` From e56087a9ede5fb2fd6e65cd0fa662626d6335131 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 23 Apr 2026 23:56:57 +0200 Subject: [PATCH 47/66] Rewrite Execution & Compute skill around pure CLI jobs submit flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SKILL.md and references/2-serverless-job.md now lead with databricks jobs submit (the one-shot create+run CLI primitive) instead of the defunct MCP execute_code wrapper that the reference file used to point at. Full flow documented: upload β†’ submit β†’ poll get-run β†’ fetch get-run-output, including the non-obvious gotcha that get-run-output takes the task run_id (.tasks[0].run_id), not the parent run_id from submit. scripts/compute.py gains --environments flag with dict-or-typed normalization so the standalone script can install pip dependencies (previously impossible from CLI β€” "client": "4" deps had no path). Interactive cluster section reduced to an "avoid by default" callout in SKILL.md; the raw-CLI cluster list and create patterns move into references/3-interactive-cluster.md alongside the existing script wrappers. 
SQL Warehouses section in SKILL.md expanded from create-only to the full CRUD surface (create, list, find, get, start, stop, edit, delete) with live-verified min_num_clusters/max_num_clusters and --no-wait gotchas. Co-authored-by: Isaac --- .../databricks-execution-compute/SKILL.md | 141 +++++++++++------- .../references/2-serverless-job.md | 106 +++++++++---- .../references/3-interactive-cluster.md | 26 ++++ .../scripts/compute.py | 101 +++++++++++-- 4 files changed, 276 insertions(+), 98 deletions(-) diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md index 75a05bd8..30ebdd40 100644 --- a/databricks-skills/databricks-execution-compute/SKILL.md +++ b/databricks-skills/databricks-execution-compute/SKILL.md @@ -27,7 +27,7 @@ Run code on Databricks. Three execution modesβ€”choose based on workload. ### Decision Flow -Prefer Databricks Connect for all spark-based workload. +Prefer Databricks Connect for all spark-based workload, then serverless. ``` Spark-based code? β†’ Databricks Connect (fastest) └─ Python 3.12 missing? β†’ Install it + databricks-connect @@ -43,92 +43,125 @@ Scala/R? β†’ Interactive Cluster (list and ask which one to use) **Read the reference file for your chosen mode before proceeding.** -### Databricks Connect (run locally) β†’ [reference](references/1-databricks-connect.md) +### Databricks Connect (run locally, prefer when it's pure spark code) β†’ [reference](references/1-databricks-connect.md) ```bash +from databricks.connect import DatabricksSession +... +spark = DatabricksSession.builder.profile("my-local-profile").serverless(True).getOrCreate() + + python my_spark_script.py ``` ### Serverless Job β†’ [reference](references/2-serverless-job.md) -```bash -# Create and run a job with serverless compute -databricks jobs create --json '{ - "name": "my-script-job", - "tasks": [{ - "task_key": "main", - "spark_python_task": {"python_file": "/Workspace/Users/me/script.py"}, - "environment_key": "default" - }], - "environments": [{"environment_key": "default", "spec": {"client": "4"}}] -}' +Pure CLI flow: upload a local file as a workspace notebook, fire a one-time run with `databricks jobs submit` (create + run in one call, ephemeral β€” no Jobs UI entry, no retry), then poll + fetch the result. The local file must be a Databricks source notebook β€” top line `# Databricks notebook source` (Python) or `-- Databricks notebook source` (SQL). -# Run the job (JOB_ID is positional) -databricks jobs run-now JOB_ID -``` +**1. Upload the local file as a workspace notebook.** `TARGET_PATH` is positional; `--file` is the local path. -### Interactive Cluster β†’ [reference](references/3-interactive-cluster.md) +`databricks workspace import /Workspace/Users//.ai_dev_kit/train --file /local/path/to/train.py --format SOURCE --language PYTHON --overwrite` -```bash -# List running clusters -databricks clusters list --output json | jq '.[] | select(.state == "RUNNING")' +**2. Submit the run.** Use `--no-wait` to get `{"run_id": N}` back immediately; drop it to block until terminated. **`"client": "4"` is required** for `dependencies` to install (`"1"` silently ignores them). 
-# Run a notebook or script on a cluster -databricks workspace import /Workspace/Users/me/script.py --file ./script.py -databricks jobs create --json '{ - "name": "cluster-job", +`databricks jobs submit --no-wait --json @submit.json` + +```json +{ + "run_name": "train-run", "tasks": [{ "task_key": "main", - "existing_cluster_id": "CLUSTER_ID", - "spark_python_task": {"python_file": "/Workspace/Users/me/script.py"} + "notebook_task": {"notebook_path": "/Workspace/Users//.ai_dev_kit/train"}, + "environment_key": "ml_env" + }], + "environments": [{ + "environment_key": "ml_env", + "spec": {"client": "4", "dependencies": ["scikit-learn==1.5.2", "mlflow==2.22.0"]} }] -}' +} ``` -## CLI Commands +**3. Check state / wait for completion.** Life-cycle: `PENDING` β†’ `RUNNING` β†’ `TERMINATED` (or `SKIPPED` / `INTERNAL_ERROR`). Only read `.state.result_state` (`SUCCESS` / `FAILED` / `CANCELED`) once life-cycle is `TERMINATED`. -| Command | For | Purpose | -|---------|-----|---------| -| `databricks jobs create/run-now` | Serverless, Cluster | Run code remotely | -| `databricks clusters list` | Interactive | List clusters, check status | -| `databricks clusters create/start/delete` | Interactive | Manage clusters. **COSTLY:** `start` takes 3-8 min | -| `databricks warehouses create/list` | SQL | Manage SQL warehouses | +`databricks jobs get-run | jq '{state: .state.life_cycle_state, result: .state.result_state, duration_ms: .execution_duration, url: .run_page_url, task_run_id: .tasks[0].run_id}'` -### List Interactive Clusters (exclude job clusters) +**4. Fetch the output / error.** **Gotcha:** `get-run-output` takes the **task** run_id (`.tasks[0].run_id`), NOT the parent `run_id` from submit. `notebook_output.result` is the string passed to `dbutils.notebook.exit()`. -```bash -# List only UI/API clusters (excludes job clusters - much faster) -databricks clusters list --cluster-sources UI,API --output json | jq '.[] | select(.state == "RUNNING")' -``` +`databricks jobs get-run-output | jq '{result: .notebook_output.result, error, error_trace}'` -### Create Cluster +Always use `dbutils.notebook.exit()` in the notebook β€” `print()` is not captured by `get-run-output`. For JSON results: `dbutils.notebook.exit(json.dumps({...}))` then parse `.notebook_output.result` client-side. 
-```bash -# Create interactive cluster (SPARK_VERSION is positional) -# By default, include custom_tags to track resources created with this skill -databricks clusters create 15.4.x-scala2.12 --json '{ - "cluster_name": "my-cluster", - "node_type_id": "i3.xlarge", - "num_workers": 2, - "autotermination_minutes": 60, - "custom_tags": {"aidevkit_project": "ai-dev-kit"} -}' -``` +**Convenience wrapper.** `scripts/compute.py execute-code` does upload + submit + wait + cleanup in one command and returns a single tidy JSON: -### Create SQL Warehouse +`python scripts/compute.py execute-code --file /local/path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0"]}}]' | jq '{success, state, output, error, run_id, run_page_url, execution_duration_ms}'` + +### Interactive Cluster β†’ [reference](references/3-interactive-cluster.md) + +**Avoid by default β€” prefer Serverless Job.** Only use an interactive cluster when: +- you have an existing classic cluster already running and available, or +- you need live, stateful execution across multiple calls (debugging via an execution context), or +- the user explicitly asks for it. + +Interactive clusters are **slow to start (3-8 min)** and cost money while running. Don't start one implicitly. + +## CLI Commands + +| Command | Purpose | +|---------|---------| +| `python scripts/compute.py execute-code` | Run code on serverless or an existing cluster | +| `python scripts/compute.py list-compute` | List clusters, node types, Spark versions | +| `python scripts/compute.py manage-cluster` | Create/start/terminate/delete clusters (see [3-interactive-cluster.md](references/3-interactive-cluster.md)) | +| `databricks warehouses create/list` | Manage SQL warehouses | + +### SQL Warehouses + +All `ID`-taking commands use positional arg (no `--id` flag). Use `databricks warehouses list` to find an ID. ```bash -# Create serverless SQL warehouse -# By default, include tags to track resources created with this skill +# Create a serverless SQL warehouse. min_num_clusters + max_num_clusters are REQUIRED +# (the server rejects the default 0). Keep the aidevkit_project tag for resource tracking. databricks warehouses create --json '{ "name": "my-warehouse", "cluster_size": "Small", "enable_serverless_compute": true, "auto_stop_mins": 10, + "min_num_clusters": 1, + "max_num_clusters": 1, "tags": {"custom_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}]} }' + +# List / find β€” trim to id, name, state with jq +databricks warehouses list -o json | jq '.[] | {id, name, state, size: .cluster_size}' + +# Find by name +databricks warehouses list -o json | jq '.[] | select(.name == "my-warehouse")' + +# Get one warehouse's full config +databricks warehouses get + +# Start / stop (both are LROs; add --no-wait to return immediately) +databricks warehouses start +databricks warehouses stop + +# Resize / reconfigure β€” pass the FULL desired config (omitted fields revert to defaults, +# so always re-state min_num_clusters/max_num_clusters). Use --no-wait if the warehouse +# is STOPPED, otherwise edit blocks trying to reach RUNNING and errors out (the mutation +# itself still applies). When the warehouse is already RUNNING, --no-wait is optional. 
+databricks warehouses edit --no-wait --json '{ + "name": "my-warehouse", + "cluster_size": "Medium", + "enable_serverless_compute": true, + "auto_stop_mins": 15, + "min_num_clusters": 1, + "max_num_clusters": 1 +}' + +# Delete (irreversible) +databricks warehouses delete ``` +**Sizes:** `2X-Small`, `X-Small`, `Small`, `Medium`, `Large`, `X-Large`, `2X-Large`, `3X-Large`, `4X-Large`. **Types:** set `"warehouse_type": "PRO"` (default) or `"CLASSIC"` in the JSON body. + ## Related Skills - **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** β€” Data generation using Spark + Faker diff --git a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md index 6cc29fd9..3b777174 100644 --- a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md +++ b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md @@ -1,6 +1,6 @@ # Serverless Job Execution -**Use when:** Running intensive Python code remotely (ML training, heavy processing) that doesn't need Spark, or when code shouldn't depend on local machine staying connected. +**Use when:** Running intensive Python code remotely (ML training, heavy processing) that doesn't need Spark, or when code shouldn't depend on the local machine staying connected. ## When to Choose Serverless Job @@ -15,55 +15,99 @@ |-----|-----| | No cluster to manage | ~25-50s cold start each invocation | | Up to 30 min timeout | No state preserved between calls | -| Independent execution | print() unreliableβ€”use `dbutils.notebook.exit()` | +| Independent execution | `print()` unreliable β€” use `dbutils.notebook.exit()` | -## Executing code -### Prefer running from a Local File (edit the local file then run it) +## Pure CLI flow -```python -execute_code( - file_path="/local/path/to/train_model.py", - compute_type="serverless" -) -``` +`databricks jobs submit` is the "create + run" primitive for ephemeral runs (no Jobs UI entry, no retry). The local file must be a Databricks source notebook β€” first line `# Databricks notebook source` (Python) or `-- Databricks notebook source` (SQL). -## Jobs with Custom Dependencies +### 1. Upload the local file as a workspace notebook -Use `job_extra_params` to install pip packages: +`TARGET_PATH` is positional; `--file` is the local path; `--language` is required when `--format SOURCE`. -```python -execute_code( - file_path="/path/to/train.py", - job_extra_params={ - "environments": [{ - "environment_key": "ml_env", - "spec": {"client": "4", "dependencies": ["scikit-learn", "pandas", "mlflow"]} - }] - } -) +`databricks workspace import /Workspace/Users//.ai_dev_kit/train --file /local/path/to/train.py --format SOURCE --language PYTHON --overwrite` + +### 2. Submit the run + +`--no-wait` returns `{"run_id": N}` immediately. Drop it to block until terminated. **`"client": "4"` is required** for `dependencies` to install β€” `"1"` silently ignores them. + +`databricks jobs submit --no-wait --json @submit.json` + +Where `submit.json`: + +```json +{ + "run_name": "train-run", + "tasks": [{ + "task_key": "main", + "notebook_task": {"notebook_path": "/Workspace/Users//.ai_dev_kit/train"}, + "environment_key": "ml_env" + }], + "environments": [{ + "environment_key": "ml_env", + "spec": {"client": "4", "dependencies": ["scikit-learn==1.5.2", "mlflow==2.22.0"]} + }] +} ``` -**CRITICAL:** Use `"client": "4"` in the spec. `"client": "1"` won't install dependencies. +### 3. 
Check status + +One-shot trim to the fields that matter: + +`databricks jobs get-run | jq '{state: .state.life_cycle_state, result: .state.result_state, duration_ms: .execution_duration, url: .run_page_url}'` + +Life-cycle states: `PENDING` β†’ `RUNNING` β†’ `TERMINATED` (or `SKIPPED` / `INTERNAL_ERROR`). Only read `.state.result_state` (`SUCCESS` / `FAILED` / `CANCELED`) once `life_cycle_state == TERMINATED`. + +### 4. Fetch the output / error + +**Gotcha:** `get-run-output` takes the **task** run_id (`.tasks[0].run_id`), not the parent `run_id` from submit. -## Output Handling +`databricks jobs get-run-output | jq '{result: .notebook_output.result, error, error_trace}'` + +`notebook_output.result` is whatever `dbutils.notebook.exit()` passed. `error` / `error_trace` populate on failure. + +### 5. (Optional) Delete the temp notebook + +`databricks workspace delete /Workspace/Users//.ai_dev_kit/train` + +## Output handling in the notebook ```python -# ❌ BAD - print() may not be captured +# BAD β€” print() output isn't returned by get-run-output print("Training complete!") -# βœ… GOOD - Use dbutils.notebook.exit() +# GOOD β€” dbutils.notebook.exit() populates notebook_output.result import json -results = {"accuracy": 0.95, "model_path": "/Volumes/..."} -dbutils.notebook.exit(json.dumps(results)) +dbutils.notebook.exit(json.dumps({"accuracy": 0.95, "model_path": "/Volumes/..."})) ``` +Max output size is 5 MB. Larger results should be written to a Volume/object store and referenced by path. + +## Convenience wrapper + +`scripts/compute.py execute-code` does upload + submit + wait + cleanup in one command and returns a single JSON with `success`, `state`, `output` (the `dbutils.notebook.exit` payload), `error`, `run_id`, `run_page_url`, `execution_duration_ms`. 
+ +Minimal: + +`python scripts/compute.py execute-code --file train.py --compute-type serverless` + +With dependencies: + +`python scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0","xgboost==2.1.3"]}}]'` + +Long dependency list from a file: + +`python scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --environments @env.json` + ## Common Issues | Issue | Solution | |-------|----------| -| print() output missing | Use `dbutils.notebook.exit()` | -| `ModuleNotFoundError` | Add to environments spec with `"client": "4"` | -| Job times out | Max is 1800s; split into smaller tasks | +| `print()` output missing | Use `dbutils.notebook.exit()` β€” `print` isn't captured by `get-run-output` | +| `ModuleNotFoundError` | Add the package to the environments spec with `"client": "4"` | +| Dependencies listed but not installed | `"client": "1"` silently drops `dependencies`; use `"client": "4"` | +| `get-run-output` returns empty `notebook_output` | You passed the parent run_id, not `.tasks[0].run_id` | +| Job times out | Default 1800 s on the script wrapper; raise `--timeout` or use `jobs submit --no-wait` + your own polling | ## When NOT to Use diff --git a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md index fbff2469..4af64ed7 100644 --- a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md +++ b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md @@ -125,6 +125,8 @@ python scripts/compute.py execute-code \ ## Managing Clusters +Two equivalent paths: the standalone script (convenience wrapper) or the raw `databricks` CLI (more fields exposed). Prefer the script for the common operations listed here. + ```bash # List all clusters python scripts/compute.py list-compute --resource clusters @@ -142,6 +144,30 @@ python scripts/compute.py manage-cluster --action terminate --cluster-id "1234-5 python scripts/compute.py manage-cluster --action create --name "my-cluster" --num-workers 2 ``` +### Filter running interactive clusters only (raw CLI) + +Useful before asking the user which cluster to reuse. 
`--cluster-sources UI,API` excludes job clusters (which would otherwise dominate the list on busy workspaces): + +```bash +databricks clusters list --cluster-sources UI,API --output json \ + | jq '.[] | select(.state == "RUNNING")' +``` + +### Create with a full spec (raw CLI) + +The script's `manage-cluster --action create` is fine for quick defaults; for full control (DBR version, instance type, tags) use the raw CLI: + +```bash +# SPARK_VERSION is positional; custom_tags recommended for resource tracking +databricks clusters create 15.4.x-scala2.12 --json '{ + "cluster_name": "my-cluster", + "node_type_id": "i3.xlarge", + "num_workers": 2, + "autotermination_minutes": 60, + "custom_tags": {"aidevkit_project": "ai-dev-kit"} +}' +``` + ## Common Issues | Issue | Solution | diff --git a/databricks-skills/databricks-execution-compute/scripts/compute.py b/databricks-skills/databricks-execution-compute/scripts/compute.py index 0e584f3c..e90a4ac7 100644 --- a/databricks-skills/databricks-execution-compute/scripts/compute.py +++ b/databricks-skills/databricks-execution-compute/scripts/compute.py @@ -292,8 +292,22 @@ def run_code_on_serverless( code: str, language: str = "python", timeout: int = 1800, + environments: Optional[List[Any]] = None, ) -> ServerlessRunResult: - """Run code on serverless compute using Jobs API runs/submit.""" + """Run code on serverless compute using Jobs API runs/submit. + + Args: + code: Source to execute. + language: "python" or "sql". + timeout: Max wait time in seconds. + environments: Optional list of environments to install dependencies. + Each entry may be a dict (documented shape) or a typed + ``JobEnvironment``. Dict shape: + {"environment_key": "my_env", + "spec": {"client": "4", "dependencies": ["pandas", "mlflow"]}} + ``client`` must be ``"4"`` (or higher) for dependencies to install; + ``"1"`` is the default but does NOT install ``dependencies``. + """ w = get_workspace_client() # Create temp notebook @@ -326,6 +340,39 @@ def run_code_on_serverless( overwrite=True, ) + # Normalize environments (accept dicts or typed JobEnvironment). + # The SDK serializes each list item via .as_dict(), so raw dicts fail there; + # typed objects also lack .get(), so we need to canonicalize before reading + # environment_key for the task binding. 
+ if environments: + normalized = [] + for e in environments: + if isinstance(e, JobEnvironment): + normalized.append(e) + elif isinstance(e, dict): + spec = e.get("spec", {}) + if isinstance(spec, dict): + spec = Environment(**spec) + elif not isinstance(spec, Environment): + raise TypeError( + f"environments[].spec must be a dict or Environment, got {type(spec).__name__}" + ) + normalized.append( + JobEnvironment( + environment_key=e.get("environment_key", "default"), + spec=spec, + ) + ) + else: + raise TypeError( + f"environments[] entries must be dict or JobEnvironment, got {type(e).__name__}" + ) + job_envs = normalized + env_key = job_envs[0].environment_key or "default" + else: + job_envs = [JobEnvironment(environment_key="default", spec=Environment(client="1"))] + env_key = "default" + try: # Submit run run = w.jobs.submit( @@ -337,15 +384,10 @@ def run_code_on_serverless( notebook_path=notebook_path, source=Source.WORKSPACE, ), - environment_key="default", - ) - ], - environments=[ - JobEnvironment( - environment_key="default", - spec=Environment(client="1"), + environment_key=env_key, ) ], + environments=job_envs, ).result(timeout=timedelta(seconds=timeout)) # Get run output @@ -499,6 +541,22 @@ def cmd_execute_code(args): timeout = args.timeout destroy_context = args.destroy_context + # Parse --environments (JSON string or @path/to/file.json) for serverless + environments = None + env_arg = _none_if_empty(getattr(args, "environments", None)) + if env_arg: + try: + if env_arg.startswith("@"): + with open(env_arg[1:], "r", encoding="utf-8") as fh: + environments = json.load(fh) + else: + environments = json.loads(env_arg) + except (OSError, json.JSONDecodeError) as e: + return {"success": False, "error": f"Invalid --environments: {e}"} + if not isinstance(environments, list): + return {"success": False, + "error": "--environments must be a JSON array of environment objects"} + if not code and not file_path: return {"success": False, "error": "Either --code or --file must be provided."} @@ -522,13 +580,21 @@ def cmd_execute_code(args): # Serverless execution if compute_type == "serverless": default_timeout = timeout if timeout else 1800 - result = run_code_on_serverless( - code=code, - language=language, - timeout=default_timeout, - ) + try: + result = run_code_on_serverless( + code=code, + language=language, + timeout=default_timeout, + environments=environments, + ) + except TypeError as e: + return {"success": False, "error": str(e)} return result.to_dict() + if environments: + return {"success": False, + "error": "--environments is only supported with --compute-type serverless"} + # Cluster execution default_timeout = timeout if timeout else 120 try: @@ -639,6 +705,15 @@ def main(): help="Language (default: python)") exec_parser.add_argument("--timeout", type=int, help="Timeout in seconds") exec_parser.add_argument("--destroy-context", action="store_true", help="Destroy context after execution") + exec_parser.add_argument( + "--environments", + help=( + "Serverless only. JSON array of environments (or @path/to/file.json). " + 'Example: \'[{"environment_key":"ml_env","spec":{"client":"4",' + '"dependencies":["mlflow","scikit-learn"]}}]\'. ' + 'IMPORTANT: "client":"4" installs dependencies; "1" does not.' 
+ ), + ) exec_parser.set_defaults(func=cmd_execute_code) # list-compute From 2c68f9b26265584aaea30ddcafa051b8436e19f8 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 23 Apr 2026 23:57:13 +0200 Subject: [PATCH 48/66] Fix run_code_on_serverless environments normalization + add tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit job_extra_params={"environments": [...]} was broken both ways: passing dicts (the documented shape) crashed in the SDK's jobs.submit because it serializes each list element via .as_dict(); passing typed JobEnvironment crashed earlier trying to read environment_key with .get(). Neither path worked. Normalize extra["environments"] to List[JobEnvironment] once at the top of the submit path: dicts get wrapped (nested spec dict β†’ typed Environment), typed objects pass through, anything else raises TypeError before hitting the SDK. env_key for the task binding is read off the canonical typed object. Adds TestServerlessJobExtraParams integration-test class covering the four cases: dict input, typed input, no environments (default path), malformed entry. Previously there was zero coverage of job_extra_params, which is how the bug landed. All four pass live (β‰ˆ110 s for the class). Co-authored-by: Isaac --- .../compute/serverless.py | 42 ++++++++--- .../integration/compute/test_serverless.py | 72 +++++++++++++++++++ 2 files changed, 106 insertions(+), 8 deletions(-) diff --git a/databricks-tools-core/databricks_tools_core/compute/serverless.py b/databricks-tools-core/databricks_tools_core/compute/serverless.py index 65fdff2b..f1c3588e 100644 --- a/databricks-tools-core/databricks_tools_core/compute/serverless.py +++ b/databricks-tools-core/databricks_tools_core/compute/serverless.py @@ -295,14 +295,40 @@ def run_code_on_serverless( try: # --- Step 2: Submit serverless run --- try: - # Build submit kwargs, allowing job_extra_params to override defaults - extra = job_extra_params or {} - - # Determine environment_key for the task - env_key = "Default" - if "environments" in extra and extra["environments"]: - # Use the first environment's key from extra params - env_key = extra["environments"][0].get("environment_key", "Default") + # Build submit kwargs, allowing job_extra_params to override defaults. + # Callers may pass environments as dicts (documented shape) or typed + # JobEnvironment objects. 
Normalize to typed before use, because: + # - the SDK's jobs.submit serializes each element via .as_dict() + # - we read environment_key off the first element here + extra = dict(job_extra_params or {}) + + if extra.get("environments"): + normalized = [] + for e in extra["environments"]: + if isinstance(e, JobEnvironment): + normalized.append(e) + elif isinstance(e, dict): + spec = e.get("spec", {}) + if isinstance(spec, dict): + spec = Environment(**spec) + elif not isinstance(spec, Environment): + raise TypeError( + f"environments[].spec must be a dict or Environment, got {type(spec).__name__}" + ) + normalized.append( + JobEnvironment( + environment_key=e.get("environment_key", "Default"), + spec=spec, + ) + ) + else: + raise TypeError( + f"environments[] entries must be dict or JobEnvironment, got {type(e).__name__}" + ) + extra["environments"] = normalized + env_key = normalized[0].environment_key or "Default" + else: + env_key = "Default" submit_kwargs = { "run_name": run_name, diff --git a/databricks-tools-core/tests/integration/compute/test_serverless.py b/databricks-tools-core/tests/integration/compute/test_serverless.py index 7f68bb42..934ba8ca 100644 --- a/databricks-tools-core/tests/integration/compute/test_serverless.py +++ b/databricks-tools-core/tests/integration/compute/test_serverless.py @@ -224,3 +224,75 @@ def test_persistent_to_dict_includes_workspace_path(self): assert result.success, f"Execution failed: {result.error}" d = result.to_dict() assert d["workspace_path"] == ws_path + + +@pytest.mark.integration +class TestServerlessJobExtraParams: + """Tests for job_extra_params, especially the environments list normalization. + + Regression coverage for the bug where passing environments as dicts (the + documented shape in the docstring) crashed with + "'dict' object has no attribute 'as_dict'", and passing typed JobEnvironment + crashed with "'JobEnvironment' object has no attribute 'get'". 
+ """ + + def test_environments_as_dicts(self): + """Documented shape: environments as plain dicts with 'spec' nested dict.""" + result = run_code_on_serverless( + code='dbutils.notebook.exit("dict env ok")', + run_name="test-env-dict", + job_extra_params={ + "environments": [ + { + "environment_key": "dict_env", + "spec": {"client": "1"}, + } + ] + }, + ) + logger.info(f"dict-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "dict env ok" in result.output + + def test_environments_as_typed_objects(self): + """Typed shape: environments as JobEnvironment + Environment instances.""" + from databricks.sdk.service.compute import Environment + from databricks.sdk.service.jobs import JobEnvironment + + result = run_code_on_serverless( + code='dbutils.notebook.exit("typed env ok")', + run_name="test-env-typed", + job_extra_params={ + "environments": [ + JobEnvironment( + environment_key="typed_env", + spec=Environment(client="1"), + ) + ] + }, + ) + logger.info(f"typed-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "typed env ok" in result.output + + def test_no_job_extra_params_uses_default_env(self): + """Regression: omitting job_extra_params still submits with the default env.""" + result = run_code_on_serverless( + code='dbutils.notebook.exit("default env ok")', + run_name="test-env-default", + ) + logger.info(f"default-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "default env ok" in result.output + + def test_malformed_environment_entry_raises_type_error(self): + """Non-dict, non-typed entries should fail fast with a TypeError (no submit).""" + result = run_code_on_serverless( + code="print('never runs')", + run_name="test-env-malformed", + job_extra_params={"environments": ["not-a-dict-or-typed"]}, + ) + # The function catches the TypeError and returns a failure result. + assert not result.success + assert result.error is not None + assert "JobEnvironment" in result.error or "type" in result.error.lower() From 6d0d48562fec8aa4b8e169dafc984f44df706165 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Thu, 23 Apr 2026 23:57:43 +0200 Subject: [PATCH 49/66] Explain why AI/BI dashboards must use bare table names The --dataset-catalog / --dataset-schema guidance tells you what to do but not why. Clarify that bare table names exist so the serialized dashboard can be re-installed on a different catalog.schema without rewriting queries. 
Co-authored-by: Isaac --- databricks-skills/databricks-aibi-dashboards/SKILL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 0efb71ef..61678e66 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -17,7 +17,7 @@ A dashboard should be showing something relevant for a human, typically some KPI | List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | | Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | | Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | -| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` (always set `--dataset-catalog` and `--dataset-schema` β€” queries MUST use bare table names only) | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` (always set `--dataset-catalog` and `--dataset-schema` β€” queries must use bare table names only to support install on different catalog.schema) | | Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | | Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | | Delete | `databricks lakeview trash DASHBOARD_ID` | From 84ad0c85d0575ae31b9a4553f71faba8d9ce515b Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Fri, 24 Apr 2026 09:13:40 +0200 Subject: [PATCH 50/66] Replace legacy get_table_stats_and_schema MCP with discover-schema CLI Three skills previously documented the dead get_table_stats_and_schema MCP function or had related gaps: - databricks-metric-views: swap the MCP call for `databricks experimental aitools tools discover-schema` and note that deeper distribution probes go through the `query` subcommand. - databricks-genie: same replacement in Step 1 "Understand the Data", plus delete the bogus `databricks sql exec` calls (no such subcommand exists) in favor of `query`. - databricks-aibi-dashboards: expand Step 2 exploration guidance so the design decisions (widget vs. table, KPI vs. trend chart, trend granularity, filter options) are explicitly tied to what to probe (cardinality, top values, numeric distribution, trend viability). Keeps the skill conceptual rather than prescribing SQL the agent can already write. 
Co-authored-by: Isaac --- .../databricks-aibi-dashboards/SKILL.md | 23 +++++++++---------- databricks-skills/databricks-genie/SKILL.md | 13 ++++------- .../databricks-metric-views/SKILL.md | 13 ++++------- 3 files changed, 20 insertions(+), 29 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 61678e66..81f50855 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -51,21 +51,20 @@ databricks warehouses list ### Step 2: Discover Table Schemas and existing data pattern -```bash -# Get table schemas for designing queries -databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SHOW TABLES IN catalog.schema" 2>&1 -# Use CATALOG.SCHEMA.TABLE format for discover-schema (this is for exploration only) -databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2 +A good dashboard comes from knowing the data first. Spend time here β€” the exploration drives design decisions in Step 4 (which widgets, which filters, which groupings). -# Example: -databricks experimental aitools tools discover-schema samples.nyctaxi.trips main.default.customers +Use `discover-schema` as the default β€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. -# Explore data patterns if needed to confirm the data tells the intended story (to understand what/how to visualize): -databricks experimental aitools tools query --warehouse WAREHOUSE_ID "" -``` +`databricks experimental aitools tools discover-schema catalog.schema.orders catalog.schema.customers` + +Sample rows alone don't tell you what to build. you can write aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` to probe typically: + +- **Cardinality** of candidate grouping columns β†’ decides chart color-group vs. table (≀8 distinct values for charts, see Cardinality & Readability below). +- **Top categorical values** β†’ populates filter options and chart legends meaningfully. +- **Numeric distribution** (min/max/avg/percentiles) β†’ decides KPI with delta vs. trend chart (flat metrics shouldn't be line charts, see Data Variance Considerations below). +- **Trend viability** at daily/weekly/monthly grain β†’ picks the right trend granularity. +- **Story confirmation** β€” run the aggregations you plan to put in the dashboard and check they're not flat, empty, or uninteresting. Fix the query or adjust the story before moving on. -> **Note**: The `discover-schema` command needs the full `catalog.schema.table` path (it's a separate exploration tool). -> > **Dashboard queries are different** β€” inside the dashboard JSON, the `FROM` clause must reference ONLY the table name, with no catalog or schema prefix: > - βœ… Correct: `FROM trips` > - ❌ Wrong: `FROM nyctaxi.trips` diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index 49628a9a..eb621081 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -20,14 +20,11 @@ Before creating a Genie Space, explore the available tables to: - **Understand the story** β€” what business questions can this data answer? What insights can users discover? 
- **Design meaningful sample questions** β€” questions should reflect real use cases and lead to actionable insights in the data -```bash -# Discover table schemas, columns, and sample values -databricks experimental aitools tools discover-schema catalog.schema.gold_sales catalog.schema.gold_customers +Use `discover-schema` as the default β€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. -# Run SQL queries to explore the data and understand relationships -databricks sql exec "SELECT * FROM catalog.schema.gold_sales LIMIT 10" -databricks sql exec "DESCRIBE TABLE catalog.schema.gold_sales" -``` +`databricks experimental aitools tools discover-schema catalog.schema.gold_sales catalog.schema.gold_customers` + +For Genie, knowing column distribution shapes the sample questions and text instructions. Probe cardinality, ranges, and top categorical values with aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` so your sample questions reflect what's actually in the data. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. ### Step 2: Create the Space @@ -70,7 +67,7 @@ If answers are inaccurate or incomplete, improve the space β€” see "Improving a # Export space configuration (extract serialized_space from get-space output) databricks genie get-space SPACE_ID --include-serialized-space -o json | jq '.serialized_space' > genie_space.json -# Import: Create a new space with the exported serialized_space +# Import: Create a new space with the exported serialized_space (then don't forge to tag) databricks genie create-space --json "{ \"warehouse_id\": \"WAREHOUSE_ID\", \"title\": \"Sales Analytics\", diff --git a/databricks-skills/databricks-metric-views/SKILL.md b/databricks-skills/databricks-metric-views/SKILL.md index b395a077..6f5e7eb2 100644 --- a/databricks-skills/databricks-metric-views/SKILL.md +++ b/databricks-skills/databricks-metric-views/SKILL.md @@ -27,16 +27,11 @@ Use this skill when: ### Inspect Source Table Schema -Before creating a metric view, call `get_table_stats_and_schema` to understand available columns for dimensions and measures: +Before authoring a metric view, inspect the source tables. Use `discover-schema` as the default β€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. -``` -get_table_stats_and_schema( - catalog="catalog", - schema="schema", - table_names=["orders"], - table_stat_level="SIMPLE" # Use "DETAILED" for cardinality, min/max, histograms -) -``` +`databricks experimental aitools tools discover-schema catalog.schema.orders catalog.schema.customers` + +For dimensions and measures, probe distribution beyond sampling β€” cardinality of candidate dimensions, min/max/percentiles for measures, top categorical values. Write aggregate SQL through `databricks experimental aitools tools query --warehouse "..."`. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. 
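+
+As a sketch, here is the kind of aggregate SQL you might pass to the `query` subcommand, using the `catalog.schema.orders` table from the `discover-schema` line above; `order_status` and `order_amount` are assumed column names, so substitute whatever `discover-schema` actually returns:
+
+```sql
+-- order_status / order_amount are illustrative columns; replace with real ones.
+SELECT COUNT(*)                     AS row_count,
+       COUNT(DISTINCT order_status) AS status_cardinality,
+       MIN(order_amount)            AS min_amount,
+       MAX(order_amount)            AS max_amount,
+       AVG(order_amount)            AS avg_amount
+FROM catalog.schema.orders;
+```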
### Create a Metric View From e6c2ba60e2c3c6251db34a221f6719da7f91c6b0 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Fri, 24 Apr 2026 13:36:34 +0200 Subject: [PATCH 51/66] Fix dbfs: prefix on volume fs commands and --overwrite on app import-dir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit databricks fs on CLI v0.296 requires the dbfs: scheme prefix for UC Volume paths. Without it the CLI treats the path as local filesystem and errors with `no such directory`. Fix every fs example pointed at /Volumes/... in the PDF, UC, and SDP skills; also tighten the UC examples to use -r and --overwrite consistently, and clarify that -r copies the source directory's contents (not the directory itself). databricks workspace import-dir on redeploys silently skips files that already exist, so updates never reach the workspace and the app keeps running the old version. Add --overwrite to every import-dir example in the app skill's 4-deployment.md and 6-cli-approach.md. Also flag the first-ever-deploy gotcha on the redeployment recipe (the workspace delete line errors when the target dir doesn't exist yet). Fix the PDF skill's volumes-create troubleshooting row β€” it passed a single dotted arg (`catalog.schema.volume`) where the CLI wants four positional args (`CATALOG SCHEMA NAME MANAGED`). All corrected forms live-verified against the workspace once. Co-authored-by: Isaac --- .../databricks-app-python/4-deployment.md | 8 ++++++-- .../databricks-app-python/6-cli-approach.md | 6 ++++-- .../SKILL.md | 2 +- .../databricks-unity-catalog/6-volumes.md | 14 ++++++++------ .../databricks-unity-catalog/SKILL.md | 14 ++++++++------ .../SKILL.md | 6 ++++-- 6 files changed, 31 insertions(+), 19 deletions(-) diff --git a/databricks-skills/databricks-app-python/4-deployment.md b/databricks-skills/databricks-app-python/4-deployment.md index 384c82ac..25294e23 100644 --- a/databricks-skills/databricks-app-python/4-deployment.md +++ b/databricks-skills/databricks-app-python/4-deployment.md @@ -37,13 +37,15 @@ env: ### Step 2: Create and Deploy +`--overwrite` on `workspace import-dir` is required for redeploys β€” without it the CLI **silently skips files that already exist**, so your updated code never makes it to the workspace and the app keeps running the old version. Harmless on the first deploy. + ```bash # Create the app databricks apps create # Upload source code databricks workspace mkdirs /Workspace/Users//apps/ -databricks workspace import-dir . /Workspace/Users//apps/ +databricks workspace import-dir . /Workspace/Users//apps/ --overwrite # Deploy databricks apps deploy \ @@ -57,9 +59,11 @@ databricks apps get ### Redeployment +Use this recipe after the initial deploy, when you want a clean upload (stale files removed). On a first-ever deploy the `workspace delete` line errors because the directory doesn't exist yet β€” either run Step 2 first, or prefix the delete with ` 2>/dev/null || true` if you want this recipe to double as a clean deploy. + ```bash databricks workspace delete /Workspace/Users//apps/ --recursive -databricks workspace import-dir . /Workspace/Users//apps/ +databricks workspace import-dir . 
/Workspace/Users//apps/ --overwrite databricks apps deploy \ --source-code-path /Workspace/Users//apps/ ``` diff --git a/databricks-skills/databricks-app-python/6-cli-approach.md b/databricks-skills/databricks-app-python/6-cli-approach.md index 6b6273c1..0636511e 100644 --- a/databricks-skills/databricks-app-python/6-cli-approach.md +++ b/databricks-skills/databricks-app-python/6-cli-approach.md @@ -49,9 +49,11 @@ my_app/ ### Step 2: Upload to Workspace +`--overwrite` is required for redeploys β€” without it the CLI **silently skips files that already exist**, so your updated code never makes it to the workspace and the app keeps running the old version. Harmless on the first deploy. + ```bash # Upload local folder to workspace -databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app +databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app --overwrite ``` ### Step 3: Create and Deploy App @@ -77,7 +79,7 @@ databricks apps logs my-dashboard ### Step 5: Iterate 1. Fix issues in local files -2. Re-upload with `databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app` +2. Re-upload with `databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app --overwrite` 3. Re-deploy with `databricks apps deploy my-dashboard --source-code-path ...` 4. Check `databricks apps logs my-dashboard` for errors 5. Repeat until app is healthy diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 0efeb017..97758497 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -39,7 +39,7 @@ description: "Creates, configures, and updates Databricks Lakeflow Spark Declara - When the user provides table schema and asks for code, respond directly with the code. Don't ask clarifying questions if the request is clear. ## Tools -- List files in volume: `databricks fs ls /Volumes/{catalog}/{schema}/{volume}/{path}` +- List files in volume: `databricks fs ls dbfs:/Volumes/{catalog}/{schema}/{volume}/{path}` (the `dbfs:` prefix is required even for UC Volume paths) - Query data: `databricks experimental aitools tools query --warehouse abc123 "SELECT 1 FROM catalog.schema.table"` - Discover schema: `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` - Pipelines CLI: `databricks pipelines create|get|delete|start-update|list-pipelines` or use `databricks pipelines --help` for more options diff --git a/databricks-skills/databricks-unity-catalog/6-volumes.md b/databricks-skills/databricks-unity-catalog/6-volumes.md index 179baa67..98166ec6 100644 --- a/databricks-skills/databricks-unity-catalog/6-volumes.md +++ b/databricks-skills/databricks-unity-catalog/6-volumes.md @@ -39,14 +39,16 @@ All volume operations use the path format: ## CLI Commands +`databricks fs` requires the `dbfs:` scheme prefix for UC Volume paths β€” without it the CLI treats the path as local filesystem and fails with `no such directory`. 
+ | Command | Description | |---------|-------------| -| `databricks fs ls /Volumes/catalog/schema/volume/path/` | List files in a volume | -| `databricks fs cp /tmp/data/* /Volumes/.../dest --recursive` | Upload files/folders to volume | -| `databricks fs cp /Volumes/.../file.csv /tmp/file.csv` | Download files from volume | -| `databricks fs mkdirs /Volumes/.../new_folder` | Create directory (like `mkdir -p`) | -| `databricks fs rm /Volumes/.../file.csv` | Delete file | -| `databricks fs rm /Volumes/.../folder --recursive` | Delete directory recursively | +| `databricks fs ls dbfs:/Volumes/catalog/schema/volume/path/` | List files in a volume | +| `databricks fs cp -r --overwrite /tmp/data dbfs:/Volumes/.../dest` | Upload a directory's contents to a volume | +| `databricks fs cp dbfs:/Volumes/.../file.csv /tmp/file.csv` | Download a file from a volume | +| `databricks fs mkdirs dbfs:/Volumes/.../new_folder` | Create directory (like `mkdir -p`) | +| `databricks fs rm dbfs:/Volumes/.../file.csv` | Delete file | +| `databricks fs rm -r dbfs:/Volumes/.../folder` | Delete directory recursively | --- diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index bbc77a6f..97f4cfc1 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -51,18 +51,20 @@ databricks volumes list my_catalog.my_schema ### Volume File Operations (CLI) +`databricks fs` requires the `dbfs:` scheme prefix even for UC Volume paths β€” without it the CLI treats the path as local filesystem and errors with `no such directory`. + ```bash # List files in a volume -databricks fs ls /Volumes/catalog/schema/volume/path/ +databricks fs ls dbfs:/Volumes/catalog/schema/volume/path/ -# Upload files to a volume -databricks fs cp /tmp/data/* /Volumes/catalog/schema/volume/dest/ --recursive +# Upload a directory's contents to a volume (-r copies contents, not the directory itself) +databricks fs cp -r --overwrite /tmp/data dbfs:/Volumes/catalog/schema/volume/dest -# Download files from a volume -databricks fs cp /Volumes/catalog/schema/volume/file.csv /tmp/file.csv +# Download a file from a volume +databricks fs cp dbfs:/Volumes/catalog/schema/volume/file.csv /tmp/file.csv # Create a directory in a volume -databricks fs mkdirs /Volumes/catalog/schema/volume/new_folder +databricks fs mkdirs dbfs:/Volumes/catalog/schema/volume/new_folder ``` ### Enable System Tables Access diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 5b10479d..57fe029f 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -39,8 +39,10 @@ Skips files where PDF exists and is newer than HTML. Use `--force` to reconvert ## Step 3: Upload to Volume +`databricks fs` requires the `dbfs:` scheme prefix even for UC Volume paths. `-r` copies the *contents* of the source directory into the target (the source directory name is not preserved), so files land directly under `raw_data/`. 
+ ```bash -databricks fs cp -r ./raw_data/pdf /Volumes/my_catalog/my_schema/raw_data/ +databricks fs cp -r --overwrite ./raw_data/pdf dbfs:/Volumes/my_catalog/my_schema/raw_data ``` ## Step 4: Generate Test Questions @@ -111,4 +113,4 @@ Subfolder structure is preserved: |-------|----------| | "plutoprint not installed" | `uv pip install plutoprint` | | PDF looks wrong | Check HTML/CSS syntax | -| "Volume does not exist" | `databricks volumes create catalog.schema.volume MANAGED` | +| "Volume does not exist" | `databricks volumes create CATALOG SCHEMA VOLUME_NAME MANAGED` (four separate positional args, not `catalog.schema.volume`) | From 8fd7f31e839872da71133d7dd92ee2a584de3e4b Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Fri, 24 Apr 2026 15:29:36 +0200 Subject: [PATCH 52/66] Replace Genie conversation.py script with pure CLI flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/conversation.py was a 171-line Python glue wrapper around client.genie.start_conversation_and_wait and client.genie.get_message with manual polling. The CLI now exposes all the primitives directly (start-conversation, create-message, get-message, get-message-attachment-query-result), and start-conversation has a built-in --no-wait / --timeout LRO flag. Document the three-command flow end-to-end and delete the script. No external Python callers (only SKILL.md pointed at it). Also in this commit: - Fix the Export/Import quoting inconsistency: genie_space.json on disk is now a parsed object (not a JSON-string blob). Export unwraps with `jq '.serialized_space | fromjson'`; import and update both stringify consistently with `jq -c '.' | jq -Rs '.'`. - Add two troubleshooting rows: slow answers / query timeouts (warehouse sizing) and wrong/empty answers (example_question_sqls + text_instructions). - Drop the redundant serialized_space "Structure" skeleton β€” its information is a strict subset of the Complete Example, now renamed "Example" with the top-level keys called out in the lead-in. All three primitives live-verified against a real Genie Space on the workspace (NordWind Fleet Analytics): start-conversation β†’ poll get-message β†’ get-message-attachment-query-result (columns + rows) β†’ create-message for follow-up. Co-authored-by: Isaac --- databricks-skills/databricks-genie/SKILL.md | 67 ++++--- .../databricks-genie/scripts/conversation.py | 171 ------------------ 2 files changed, 32 insertions(+), 206 deletions(-) delete mode 100644 databricks-skills/databricks-genie/scripts/conversation.py diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index eb621081..d1b669f2 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -57,23 +57,24 @@ databricks genie trash-space SPACE_ID ### Step 3: Test and Iterate -Use `scripts/conversation.py` (see Conversation API section below) to test questions and verify answers are accurate. - -If answers are inaccurate or incomplete, improve the space β€” see "Improving a Genie Space" below. +Use the Conversation API (section below) to ask questions and verify answers. If answers are inaccurate or incomplete, improve the space β€” see "Improving a Genie Space" below. ### Export & Import +**Convention:** `genie_space.json` always holds the **parsed** space object (not a JSON-string-encoded blob), so it's readable and editable. At each use site we stringify it with `jq -c '.' 
| jq -Rs '.'` β€” same pattern as Step 2 Create and "Improving a Genie Space" below. `jq -r '.serialized_space | fromjson'` on export strips the outer quoting so the file is already a parsed object. + ```bash -# Export space configuration (extract serialized_space from get-space output) -databricks genie get-space SPACE_ID --include-serialized-space -o json | jq '.serialized_space' > genie_space.json +# Export: extract serialized_space AND unwrap it to a parsed object on disk +databricks genie get-space SPACE_ID --include-serialized-space -o json \ + | jq '.serialized_space | fromjson' > genie_space.json -# Import: Create a new space with the exported serialized_space (then don't forge to tag) +# Import: same stringify pattern as Step 2 (Create) databricks genie create-space --json "{ \"warehouse_id\": \"WAREHOUSE_ID\", \"title\": \"Sales Analytics\", \"description\": \"Migrated space\", \"parent_path\": \"/Workspace/Users/you@company.com/genie_spaces\", - \"serialized_space\": $(cat genie_space.json) + \"serialized_space\": $(cat genie_space.json | jq -c '.' | jq -Rs '.') }" ``` @@ -92,24 +93,6 @@ databricks genie update-space SPACE_ID --json "{\"serialized_space\": $(cat geni The `serialized_space` field is a JSON string containing the full space configuration. -### Structure - -```json -{ - "version": 2, - "config": { - "sample_questions": [...] - }, - "data_sources": { - "tables": [{"identifier": "catalog.schema.table"}] - }, - "instructions": { - "example_question_sqls": [...], - "text_instructions": [...] - } -} -``` - ### Field Format Requirements **IMPORTANT:** All items in `sample_questions`, `example_question_sqls`, and `text_instructions` require a unique `id` field. @@ -132,9 +115,9 @@ The `serialized_space` field is a JSON string containing the full space configur Well-crafted instructions significantly improve answer accuracy. -### Complete Example +### Example -This example shows a properly formatted `serialized_space` with sample questions, SQL examples, and text instructions. Note that every item has a unique 32-char hex `id` and all text fields are arrays: +Top-level keys are `version`, `config`, `data_sources`, `instructions`. Every item in `sample_questions`, `example_question_sqls`, and `text_instructions` needs a unique 32-char hex `id` and all text fields are arrays: ```json { @@ -185,20 +168,32 @@ Use `DATABRICKS_CONFIG_PROFILE=profile_name` to target different workspaces. ## Conversation API -Use `scripts/conversation.py` to ask questions programmatically: +Ask questions via three CLI primitives: `start-conversation`, `create-message` (follow-ups), and `get-message` (state + SQL + text). `--no-wait` on `start-conversation` / `create-message` returns immediately with `{conversation_id, message_id}`; poll `get-message` until `.status` is `COMPLETED`, `FAILED`, or `CANCELLED`. Intermediate states you'll see: `SUBMITTED`, `FILTERING_CONTEXT`, `ASKING_AI`, `EXECUTING_QUERY`. ```bash -# Ask a question -python scripts/conversation.py ask SPACE_ID "What were total sales last month?" +# Start a new conversation (async β€” get IDs back immediately) +databricks genie start-conversation --no-wait SPACE_ID "What were total sales last month?" 
+# β†’ {"conversation_id": "...", "message_id": "..."} + +# Poll state +databricks genie get-message SPACE_ID CONV_ID MSG_ID | jq '{status, error}' -# Follow-up in same conversation (Genie remembers context) -python scripts/conversation.py ask SPACE_ID "Break down by region" --conversation-id CONV_ID +# When COMPLETED, pull the generated SQL and any text reply +databricks genie get-message SPACE_ID CONV_ID MSG_ID \ + | jq '.attachments[] | {sql: .query.query, description: .query.description, text: .text.content}' -# With timeout for complex queries -python scripts/conversation.py ask SPACE_ID "Complex query" --timeout 120 +# Fetch the query result rows (columns + data_array) +databricks genie get-message-attachment-query-result SPACE_ID CONV_ID MSG_ID ATTACHMENT_ID \ + | jq '{columns: .statement_response.manifest.schema.columns | map({name, type: .type_name}), + rows: .statement_response.result.data_array}' + +# Follow-up in the same conversation (Genie remembers context) +databricks genie create-message --no-wait SPACE_ID CONV_ID "Break that down by region" ``` -Start a new conversation for unrelated topics. Use `--conversation-id` only for follow-ups on the same topic. +Start a new conversation for unrelated topics. Use `create-message` (same `CONV_ID`) only for follow-ups on the same topic. + +On `FAILED`, `get-message` populates `.error.error` with the underlying error string (e.g. `[INSUFFICIENT_PERMISSIONS] ...`) and `.error.type` (e.g. `SQL_EXECUTION_EXCEPTION`). Attachments may still include `suggested_questions` even when the primary query failed. ## Troubleshooting @@ -209,6 +204,8 @@ Start a new conversation for unrelated topics. Use `--conversation-id` only for | No warehouse available | Create a SQL warehouse or provide `warehouse_id` | | Empty `serialized_space` on export | Requires CAN EDIT permission on the space | | Tables not found after migration | Remap catalog name in `serialized_space` before import | +| Slow answers / query timeouts | Size up the warehouse attached to the space; simplify or pre-aggregate tall source tables | +| Wrong or empty answers | Add `example_question_sqls` and `text_instructions` β€” see "Improving a Genie Space" | ## Related Skills diff --git a/databricks-skills/databricks-genie/scripts/conversation.py b/databricks-skills/databricks-genie/scripts/conversation.py deleted file mode 100644 index e1a670ff..00000000 --- a/databricks-skills/databricks-genie/scripts/conversation.py +++ /dev/null @@ -1,171 +0,0 @@ -#!/usr/bin/env python3 -""" -Genie Conversation API - CLI interface for asking questions to Genie Spaces. - -Usage: - python conversation.py ask SPACE_ID "What were total sales last month?" - python conversation.py ask SPACE_ID "Break that down by region" --conversation-id CONV_ID - python conversation.py ask SPACE_ID "Complex query" --timeout 120 - -Requires: databricks-sdk package -""" - -import argparse -import json -import sys -import time -from typing import Any, Dict, Optional - -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.dashboards import GenieMessage - - -def ask_genie( - space_id: str, - question: str, - conversation_id: Optional[str] = None, - timeout_seconds: int = 60, -) -> Dict[str, Any]: - """Ask a question to a Genie Space. 
- - Args: - space_id: The Genie Space ID - question: Natural language question to ask - conversation_id: Optional conversation ID for follow-up questions - timeout_seconds: Maximum time to wait for response (default: 60) - - Returns: - Dict with question, conversation_id, message_id, status, sql, columns, data, row_count - """ - client = WorkspaceClient() - - # Start or continue conversation - if conversation_id: - response = client.genie.start_conversation_and_wait( - space_id=space_id, - content=question, - conversation_id=conversation_id, - ) - else: - response = client.genie.start_conversation_and_wait( - space_id=space_id, - content=question, - ) - - # Extract conversation and message IDs - conv_id = response.conversation_id if hasattr(response, 'conversation_id') else None - msg_id = response.message_id if hasattr(response, 'message_id') else None - - # Poll for completion - start_time = time.time() - while True: - if time.time() - start_time > timeout_seconds: - return { - "question": question, - "conversation_id": conv_id, - "message_id": msg_id, - "status": "TIMEOUT", - "error": f"Query timed out after {timeout_seconds} seconds", - } - - # Get message details - message = client.genie.get_message( - space_id=space_id, - conversation_id=conv_id, - message_id=msg_id, - ) - - status = message.status.value if hasattr(message.status, 'value') else str(message.status) - - if status == "COMPLETED": - # Extract results - result = { - "question": question, - "conversation_id": conv_id, - "message_id": msg_id, - "status": "COMPLETED", - } - - # Get SQL and data from attachments - if message.attachments: - for attachment in message.attachments: - if hasattr(attachment, 'query') and attachment.query: - result["sql"] = attachment.query.query - if hasattr(attachment, 'text') and attachment.text: - result["text_response"] = attachment.text.content - - # Get query result if available - if hasattr(message, 'query_result') and message.query_result: - qr = message.query_result - if hasattr(qr, 'columns'): - result["columns"] = [c.name for c in qr.columns] - if hasattr(qr, 'data_array'): - result["data"] = qr.data_array - result["row_count"] = len(qr.data_array) - - return result - - elif status in ["FAILED", "CANCELLED"]: - error_msg = "" - if message.attachments: - for attachment in message.attachments: - if hasattr(attachment, 'text') and attachment.text: - error_msg = attachment.text.content - return { - "question": question, - "conversation_id": conv_id, - "message_id": msg_id, - "status": status, - "error": error_msg or f"Query {status.lower()}", - } - - # Still processing, wait and retry - time.sleep(2) - - -def _print_json(data: Any) -> None: - """Print data as formatted JSON.""" - print(json.dumps(data, indent=2, default=str)) - - -def main(): - """CLI entry point.""" - parser = argparse.ArgumentParser( - description="Ask questions to a Genie Space", - formatter_class=argparse.RawDescriptionHelpFormatter, - epilog=__doc__, - ) - subparsers = parser.add_subparsers(dest="command", required=True) - - # ask command - ask_parser = subparsers.add_parser("ask", help="Ask a question to a Genie Space") - ask_parser.add_argument("space_id", help="The Genie Space ID") - ask_parser.add_argument("question", help="Natural language question to ask") - ask_parser.add_argument( - "--conversation-id", "-c", - help="Conversation ID for follow-up questions", - ) - ask_parser.add_argument( - "--timeout", "-t", - type=int, - default=60, - help="Timeout in seconds (default: 60)", - ) - - args = parser.parse_args() 
- - if args.command == "ask": - result = ask_genie( - space_id=args.space_id, - question=args.question, - conversation_id=args.conversation_id, - timeout_seconds=args.timeout, - ) - _print_json(result) - else: - parser.print_help() - sys.exit(1) - - -if __name__ == "__main__": - main() From 292ea147e6ef69d2d003e26b165402d05ad52507 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Fri, 24 Apr 2026 23:02:03 +0200 Subject: [PATCH 53/66] Fix SDP pipeline create + update flow; simplify UC schema/volume create MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit SDP skill had three concrete bugs that bit an agent running a real pipeline update end-to-end: 1. references/2-cli-approach.md claimed "file" libraries could point to a directory. They can't β€” the API errors with "Paths must end with .py or .sql". The correct shape for a folder is {"glob": {"include": "/**"}}. Fixed the example and added a troubleshooting row for the exact error string. Live-verified with a pipelines update round-trip. 2. No documented flow for editing an existing pipeline. Added a dense "Updating a Pipeline" section covering re-upload + start-update. Key gotcha: pipelines consume raw FILE entries, so re-imports need --format RAW --overwrite. --format SOURCE --language SQL|PYTHON creates a workspace NOTEBOOK (deprecated for pipelines) and fails on an existing FILE path with "type mismatch (asked: NOTEBOOK, actual: FILE)". Live-verified both failure and success modes. Added troubleshooting row. 3. Contradictory streaming read guidance β€” SKILL.md said FROM stream(table), 4-dlt-migration.md showed FROM STREAM table. Both parse, but the function form is the canonical one. Reworked the troubleshooting row to spell out when each form applies and flag FROM STREAM table as legacy DLT compatibility. Bonus: pipelines list-pipeline-events returns a bare array, not {"events": [...]} β€” skill previously showed the raw command with no output shape hint. Replaced with a ready jq pattern that surfaces just ERROR/WARN entries; agent had written two failing Python one-liners trying to guess the shape. Also simplified databricks-unity-catalog SKILL.md to show the positional form for schemas create and volumes create (what the help text documents as canonical) instead of the --json form that was redundant with the positional CLI. Co-authored-by: Isaac --- .../SKILL.md | 32 +++++++++++++++---- .../references/2-cli-approach.md | 11 +++---- .../databricks-unity-catalog/SKILL.md | 10 +++--- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 97758497..35ce49c7 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -281,17 +281,33 @@ After running a pipeline (via DAB or CLI), you **MUST** validate both the execut ### Step 1: Check Pipeline Execution Status +`list-pipeline-events` returns a bare JSON array (not `{"events": [...]}`). For DAB runs, also check `databricks bundle run` output; `pipelines get`/`list-pipeline-events` still apply. 
+ ```bash -# Get pipeline status and details (pipeline_id is positional) databricks pipelines get -# Get recent events/logs -databricks pipelines list-pipeline-events +# Surface just failures +databricks pipelines list-pipeline-events \ + | jq '[.[] | select(.level=="ERROR" or .level=="WARN") | {level, event_type, message: (.message // "")[0:200]}] | .[0:10]' +``` + +### Updating a Pipeline (edit β†’ re-upload β†’ restart) + +Use `--format RAW --overwrite` β€” pipelines use raw `.sql`/`.py` FILE entries. `--format SOURCE --language SQL|PYTHON` uploads a workspace notebook instead and **notebooks are deprecated for pipelines**; mixing the two on the same path fails with `Cannot overwrite the asset ... due to type mismatch (asked: NOTEBOOK, actual: FILE)`. + +```bash +# Single file +databricks workspace import /Workspace/Users//pipeline/07_gold.sql \ + --file ./src/pipeline/07_gold.sql --format RAW --overwrite + +# Whole directory +databricks workspace import-dir ./src/pipeline /Workspace/Users//pipeline --overwrite + +# Restart. --full-refresh reprocesses everything (destructive on streaming state); omit for incremental. +databricks pipelines start-update --full-refresh ``` -**From DAB (`databricks bundle run`):** -- Check the command output for success/failure -- Use `databricks pipelines get ` to get detailed status and recent events +If pipeline is RUNNING, `start-update` queues the new update. Force-stop with `databricks pipelines stop ` first if needed. ### Step 2: Validate Output Data @@ -340,12 +356,14 @@ If validation reveals problems, trace upstream to find the root cause: | **Empty output tables** | Use `discover-schema` to check upstream tables. Verify source files exist and paths are correct. | | **Pipeline stuck INITIALIZING** | Normal for serverless, wait a few minutes | | **"Column not found"** | Check `schemaHints` match actual data | -| **Streaming reads fail** | For file ingestion in a streaming table, you must use the `STREAM` keyword with `read_files`: `FROM STREAM read_files(...)`. For table streams use `FROM stream(table)`. See [read_files β€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | +| **Streaming reads fail** | Use `FROM STREAM read_files(...)` only for file ingestion; use `FROM stream(table)` for table-to-table streams. `FROM STREAM table` (no parens) parses but is legacy DLT β€” prefer the function form. See [read_files β€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | | **Timeout during run** | Use `databricks pipelines get ` to check status | | **MV doesn't refresh** | Enable row tracking on source tables | | **SCD2: query column not found** | Lakeflow uses `__START_AT` and `__END_AT` (double underscore), not `START_AT`/`END_AT`. Use `WHERE __END_AT IS NULL` for current rows. See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). | | **AUTO CDC parse error at APPLY/SEQUENCE** | Put `APPLY AS DELETE WHEN` **before** `SEQUENCE BY`. Only list columns in `COLUMNS * EXCEPT (...)` that exist in the source (omit `_rescued_data` unless bronze uses rescue data). Omit `TRACK HISTORY ON *` if it causes "end of input" errors; default is equivalent. See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). 
| | **"Cannot create streaming table from batch query"** | In a streaming table query, use `FROM STREAM read_files(...)` so `read_files` leverages Auto Loader; `FROM read_files(...)` alone is batch. See [sql/2-ingestion.md](references/sql/2-ingestion.md) and [read_files β€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | +| **"Paths must end with .py or .sql"** on `pipelines create` | `{"file": {"path": ...}}` needs a single file. Use `{"glob": {"include": "/**"}}` for a directory, or enumerate files individually. | +| **`type mismatch (asked: NOTEBOOK, actual: FILE)`** on `workspace import` | Existing path is a FILE (raw `.sql`/`.py`). Re-upload with `--format RAW --overwrite`, not `--format SOURCE --language SQL` (creates a NOTEBOOK β€” deprecated for pipelines). | **For detailed errors**, use `databricks pipelines get ` which includes recent events, or `databricks pipelines list-pipeline-events ` for full event history. diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md index 7c2dbb31..cafe4046 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md @@ -20,27 +20,26 @@ databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/ ### Step 3: Create Pipeline ```bash -# Create pipeline with JSON config -# Use "file" - can point to a single .sql/.py file OR a directory (includes all files) +# libraries: "file" = single .sql/.py file; "glob" = directory of files. +# A "file" pointing at a folder fails: "Paths must end with .py or .sql". +# "notebook" is deprecated β€” use "file" or "glob". databricks pipelines create --json '{ "name": "my_orders_pipeline", "catalog": "my_catalog", "schema": "my_schema", "serverless": true, "libraries": [ - {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}} + {"glob": {"include": "/Workspace/Users/user@example.com/my_pipeline/**"}} ], "tags": {"aidevkit_project": "ai-dev-kit"}, "development": true }' -# Or specify individual files: +# Enumerate files instead of glob: # "libraries": [ # {"file": {"path": "/Workspace/.../bronze/ingest_orders.sql"}}, # {"file": {"path": "/Workspace/.../silver/clean_orders.sql"}} # ] -# -# Legacy (avoid): {"notebook": {"path": "..."}} - use "file" instead ``` Save the returned `pipeline_id` for subsequent operations. 
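For example, a minimal driver sketch for those subsequent operations. It assumes the `--json @file` input form and the `latest_updates[0].state` field in the `pipelines get -o json` response; both reflect recent CLI versions, so verify against `databricks pipelines get --help` and a real response before relying on them.

```bash
# Sketch: create the pipeline from a saved JSON config, capture its id,
# start an update, and poll until the update finishes.
PIPELINE_ID=$(databricks pipelines create --json @pipeline.json -o json | jq -r '.pipeline_id')
databricks pipelines start-update "$PIPELINE_ID"

while true; do
  STATE=$(databricks pipelines get "$PIPELINE_ID" -o json | jq -r '.latest_updates[0].state')
  echo "update state: $STATE"
  case "$STATE" in
    COMPLETED) break ;;                                   # update finished successfully
    FAILED|CANCELED) echo "update did not complete"; exit 1 ;;
  esac
  sleep 30                                                # poll interval; adjust as needed
done
```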
diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 97f4cfc1..177eb953 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -35,13 +35,13 @@ Use this skill when: ```bash # Create a catalog -databricks catalogs create --json '{"name": "my_catalog"}' +databricks catalogs create my_catalog -# Create a schema -databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' +# Create a schema (args: NAME CATALOG_NAME β€” positional, name first) +databricks schemas create my_schema my_catalog -# Create a volume -databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' +# Create a volume (args: CATALOG_NAME SCHEMA_NAME NAME VOLUME_TYPE β€” catalog first) +databricks volumes create my_catalog my_schema my_volume MANAGED # List catalogs, schemas, volumes databricks catalogs list From 18906e9118d33b64ac25ffd7c426d989cc882ec0 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Sat, 25 Apr 2026 00:11:45 +0200 Subject: [PATCH 54/66] Use convention for script paths across skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Script invocations in SKILL.md and references (python scripts/X) previously assumed the reader was running from the skill's install folder. Agents running from an arbitrary project cwd hit "No such file or directory" errors β€” the agent-bricks, execution-compute, and pdf-generation skills all trip the same way. Switch to the literal token for every script invocation and add a one-line convention note at the top of each affected SKILL.md and reference file: > = the directory containing this SKILL.md; resolve to > the absolute install path (e.g. ~/.claude/skills/). Rewrote: - python scripts/compute.py ... β†’ python /scripts/compute.py ... - python scripts/pdf_generator.py ... β†’ python /scripts/pdf_generator.py ... Also fixed a stale markdown link in the SDP skill whose display text said "examples/exploration_notebook.py" but whose path was "scripts/...". databricks-agent-bricks script references come in a separate commit. Co-authored-by: Isaac --- .../databricks-execution-compute/SKILL.md | 10 +++--- .../references/2-serverless-job.md | 8 +++-- .../references/3-interactive-cluster.md | 34 ++++++++++--------- .../SKILL.md | 2 +- .../SKILL.md | 8 +++-- 5 files changed, 35 insertions(+), 27 deletions(-) diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md index 30ebdd40..74cc43d9 100644 --- a/databricks-skills/databricks-execution-compute/SKILL.md +++ b/databricks-skills/databricks-execution-compute/SKILL.md @@ -15,6 +15,8 @@ description: >- Run code on Databricks. Three execution modesβ€”choose based on workload. +> **Path convention:** `` in examples below = the directory containing this SKILL.md. Resolve it to the absolute path in your install (e.g. `~/.claude/skills/databricks-execution-compute`). Commands like `python /scripts/compute.py ...` work from any cwd. 
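For instance, a minimal sketch assuming the default install path shown above (adjust `SKILL_DIR` to wherever the skill actually lives in your setup):

```bash
# Resolve the skill directory once, then invoke its scripts from any project cwd.
SKILL_DIR="$HOME/.claude/skills/databricks-execution-compute"
python "$SKILL_DIR/scripts/compute.py" list-compute --resource clusters
```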
+ ## Execution Mode Decision Matrix | Aspect | [Databricks Connect](references/1-databricks-connect.md) ⭐ | [Serverless Job](references/2-serverless-job.md) | [Interactive Cluster](references/3-interactive-cluster.md) | @@ -93,7 +95,7 @@ Always use `dbutils.notebook.exit()` in the notebook β€” `print()` is no **Convenience wrapper.** `scripts/compute.py execute-code` does upload + submit + wait + cleanup in one command and returns a single tidy JSON: -`python scripts/compute.py execute-code --file /local/path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0"]}}]' | jq '{success, state, output, error, run_id, run_page_url, execution_duration_ms}'` +`python /scripts/compute.py execute-code --file /local/path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0"]}}]' | jq '{success, state, output, error, run_id, run_page_url, execution_duration_ms}'` ### Interactive Cluster β†’ [reference](references/3-interactive-cluster.md) @@ -108,9 +110,9 @@ Interactive clusters are **slow to start (3-8 min)** and cost money while runnin | Command | Purpose | |---------|---------| -| `python scripts/compute.py execute-code` | Run code on serverless or an existing cluster | -| `python scripts/compute.py list-compute` | List clusters, node types, Spark versions | -| `python scripts/compute.py manage-cluster` | Create/start/terminate/delete clusters (see [3-interactive-cluster.md](references/3-interactive-cluster.md)) | +| `python /scripts/compute.py execute-code` | Run code on serverless or an existing cluster | +| `python /scripts/compute.py list-compute` | List clusters, node types, Spark versions | +| `python /scripts/compute.py manage-cluster` | Create/start/terminate/delete clusters (see [3-interactive-cluster.md](references/3-interactive-cluster.md)) | | `databricks warehouses create/list` | Manage SQL warehouses | ### SQL Warehouses diff --git a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md index 3b777174..d80c31e9 100644 --- a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md +++ b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md @@ -2,6 +2,8 @@ **Use when:** Running intensive Python code remotely (ML training, heavy processing) that doesn't need Spark, or when code shouldn't depend on the local machine staying connected. +> `` in examples = the directory containing the parent SKILL.md β€” substitute the absolute install path (e.g. `~/.claude/skills/databricks-execution-compute`). + ## When to Choose Serverless Job - ML model training (runs independently of local machine) @@ -89,15 +91,15 @@ Max output size is 5 MB. 
Larger results should be written to a Volume/object sto Minimal: -`python scripts/compute.py execute-code --file train.py --compute-type serverless` +`python /scripts/compute.py execute-code --file train.py --compute-type serverless` With dependencies: -`python scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0","xgboost==2.1.3"]}}]'` +`python /scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0","xgboost==2.1.3"]}}]'` Long dependency list from a file: -`python scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --environments @env.json` +`python /scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --environments @env.json` ## Common Issues diff --git a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md index 4af64ed7..7197334a 100644 --- a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md +++ b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md @@ -2,6 +2,8 @@ **Use when:** You have an existing running cluster and need to preserve state across multiple tool calls, or need Scala/R support. +> `` in examples = the directory containing the parent SKILL.md β€” substitute the absolute install path (e.g. `~/.claude/skills/databricks-execution-compute`). + ## When to Choose Interactive Cluster - Multiple sequential commands where variables must persist @@ -21,7 +23,7 @@ **Starting a cluster takes 3-8 minutes and costs money.** Always check first: ```bash -python scripts/compute.py list-compute --resource clusters +python /scripts/compute.py list-compute --resource clusters ``` If no cluster is running, ask the user: @@ -35,7 +37,7 @@ If no cluster is running, ask the user: ### First Command: Creates Context ```bash -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "import pandas as pd; df = pd.DataFrame({'a': [1, 2, 3]}); print(df)" \ --compute-type cluster \ --cluster-id "1234-567890-abcdef" @@ -55,7 +57,7 @@ Response includes `context_id` for reuse: ```bash # Variables from first command still available -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "print(df.shape)" \ --compute-type cluster \ --cluster-id "1234-567890-abcdef" \ @@ -66,11 +68,11 @@ python scripts/compute.py execute-code \ ```bash # Get best running cluster -python scripts/compute.py list-compute --auto-select +python /scripts/compute.py list-compute --auto-select # Returns: {"cluster_id": "1234-567890-abcdef"} # Then execute on it -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "spark.range(100).show()" \ --compute-type cluster \ --cluster-id "1234-567890-abcdef" @@ -80,13 +82,13 @@ python scripts/compute.py execute-code \ ```bash # Scala -python scripts/compute.py execute-code --code 'println("Hello")' --compute-type cluster --language scala --cluster-id ... +python /scripts/compute.py execute-code --code 'println("Hello")' --compute-type cluster --language scala --cluster-id ... 
# SQL -python scripts/compute.py execute-code --code "SELECT * FROM table LIMIT 10" --compute-type cluster --language sql --cluster-id ... +python /scripts/compute.py execute-code --code "SELECT * FROM table LIMIT 10" --compute-type cluster --language sql --cluster-id ... # R -python scripts/compute.py execute-code --code 'print("Hello")' --compute-type cluster --language r --cluster-id ... +python /scripts/compute.py execute-code --code 'print("Hello")' --compute-type cluster --language r --cluster-id ... ``` ## Installing Libraries @@ -94,7 +96,7 @@ python scripts/compute.py execute-code --code 'print("Hello")' --compute-type cl Install pip packages directly in the execution context: ```bash -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "%pip install faker" \ --compute-type cluster \ --cluster-id "..." \ @@ -103,7 +105,7 @@ python scripts/compute.py execute-code \ If needed, restart Python to pick up new packages: ```bash -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "dbutils.library.restartPython()" \ --compute-type cluster \ --cluster-id "..." \ @@ -116,7 +118,7 @@ python scripts/compute.py execute-code \ **Destroy when done:** ```bash -python scripts/compute.py execute-code \ +python /scripts/compute.py execute-code \ --code "print('Done!')" \ --compute-type cluster \ --cluster-id "..." \ @@ -129,19 +131,19 @@ Two equivalent paths: the standalone script (convenience wrapper) or the raw `da ```bash # List all clusters -python scripts/compute.py list-compute --resource clusters +python /scripts/compute.py list-compute --resource clusters # Get specific cluster status -python scripts/compute.py list-compute --cluster-id "1234-567890-abcdef" +python /scripts/compute.py list-compute --cluster-id "1234-567890-abcdef" # Start a cluster (WITH USER APPROVAL ONLY - costs money, 3-8min startup) -python scripts/compute.py manage-cluster --action start --cluster-id "1234-567890-abcdef" +python /scripts/compute.py manage-cluster --action start --cluster-id "1234-567890-abcdef" # Terminate a cluster (reversible) -python scripts/compute.py manage-cluster --action terminate --cluster-id "1234-567890-abcdef" +python /scripts/compute.py manage-cluster --action terminate --cluster-id "1234-567890-abcdef" # Create a new cluster -python scripts/compute.py manage-cluster --action create --name "my-cluster" --num-workers 2 +python /scripts/compute.py manage-cluster --action create --name "my-cluster" --num-workers 2 ``` ### Filter running interactive clusters only (raw CLI) diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index 35ce49c7..d4286902 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -256,7 +256,7 @@ For detailed syntax, see [sql/1-syntax-basics.md](references/sql/1-syntax-basics ### Modern Defaults - **Always use raw `.sql`/`.py` files for the transformations files** - NO notebooks in your pipeline. Pipeline code must be plain files. -- **Databricks notebook source for explorations** - Use `# Databricks notebook source` format with `# COMMAND ----------` separators for ad-hoc queries. See [examples/exploration_notebook.py](scripts/exploration_notebook.py). +- **Databricks notebook source for explorations** - Use `# Databricks notebook source` format with `# COMMAND ----------` separators for ad-hoc queries. 
See [scripts/exploration_notebook.py](scripts/exploration_notebook.py). - **Serverless compute** - Do not use classic clusters unless explicitly required (R, RDD APIs, JAR libraries) - **Unity Catalog** (required for serverless) - **CLUSTER BY** (Liquid Clustering), not PARTITION BY with ZORDER - see [sql/5-performance.md](references/sql/5-performance.md) or [python/5-performance.md](references/python/5-performance.md) diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 57fe029f..1a1a6368 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -10,10 +10,12 @@ Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. ## Workflow 1. Write HTML files to `./raw_data/html/` (write multiple files in parallel for speed) -2. Convert HTML β†’ PDF using `scripts/pdf_generator.py` (parallel conversion) +2. Convert HTML β†’ PDF using `/scripts/pdf_generator.py` (parallel conversion) 3. Upload PDFs to Unity Catalog volume using `databricks fs cp` 4. Generate `doc_questions.json` with test questions for each document +> **Path convention:** `` below = the directory containing this SKILL.md. Resolve to the absolute install path (e.g. `~/.claude/skills/databricks-unstructured-pdf-generation`). `./raw_data/...` paths are relative to your own project cwd. + ## Dependencies ```bash @@ -32,7 +34,7 @@ Write HTML documents to `./raw_data/html/filename.html`. Use subdirectories to o ```bash # Convert entire folder (parallel, 4 workers) -python scripts/pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf +python /scripts/pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf ``` Skips files where PDF exists and is newer than HTML. Use `--force` to reconvert all. @@ -86,7 +88,7 @@ When generating documents for Knowledge Assistant testing or demos: ## CLI Reference ``` -python scripts/pdf_generator.py convert [OPTIONS] +python /scripts/pdf_generator.py convert [OPTIONS] --input, -i Input HTML file or folder (required) --output, -o Output folder for PDFs (required) From 37f931e6ef5895786f2024d295e6f8e031f60d7e Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Sat, 25 Apr 2026 00:11:57 +0200 Subject: [PATCH 55/66] Fix Agent Bricks KA create-knowledge-source invocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The skill told readers to call create-knowledge-source with four positional args (PARENT DISPLAY_NAME DESCRIPTION SOURCE_TYPE) alongside --json. The CLI rejects that combination: Error: when --json flag is specified, provide only PARENT as positional arguments. Provide 'display_name', 'description', 'source_type' in your JSON input. Only two forms actually work (verified live on the workspace): 1. PARENT + --json '{display_name, description, source_type, files|index|...}' 2. positional-only (no --json) β€” but then there's nowhere to pass files.path / index.index_name, so this form only works for source types that need no extra body, which today is none. Updated SKILL.md and 1-knowledge-assistants.md to show the single working shape: PARENT positional + everything else in --json. Added the display_name / description fields inside each example body. 
Co-authored-by: Isaac --- .../databricks-agent-bricks/1-knowledge-assistants.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index f7d0a942..0b349c5b 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -4,23 +4,29 @@ For commands, see [SKILL.md](SKILL.md). ## Source Types +Both shapes go inside the `--json` body alongside `display_name` and `description` β€” see SKILL.md for the full invocation. + ### Files (Volume) ```json { + "display_name": "...", + "description": "...", "source_type": "files", "files": {"path": "/Volumes/catalog/schema/volume/folder/"} } ``` -Supported formats: PDF, TXT, MD, DOCX +Supported formats: PDF, TXT, MD, DOCX. ### Vector Search Index -Use existing index instead of auto-indexing: +Use an existing index instead of auto-indexing: ```json { + "display_name": "...", + "description": "...", "source_type": "index", "index": { "index_name": "catalog.schema.my_index", From 30eb32ea0cccd0a97fa7f0c85de1f178648f8f73 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Sat, 25 Apr 2026 00:12:34 +0200 Subject: [PATCH 56/66] Fix MAS manager: instructions path, drop broken queue, add --wait MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three concrete bugs in scripts/mas_manager.py triggered by a real agent session: 1. get_mas (L481) and update_mas (L531) read instructions from mas_data.get("instructions") β€” wrong nested level, always empty. The GET response nests it on tile: mas_data.tile.instructions. Consequence: update_mas(tile_id, name="...") without an explicit instructions= arg wiped the existing instructions on every call. Verified the correct path live: "instructions_len: 232" vs 0 before. 2. add_examples_queued spun up an in-process daemon thread that polled get_endpoint_status every 30s. When the CLI process exited, the thread died and examples were never added β€” silent data loss. Removed add_examples_queued, TileExampleQueue, get_tile_example_queue, the _tile_example_queue singleton, and the now-unused threading / Tuple imports. 3. Replaced the broken queue with a wait_for_online flag on add_examples (CLI: --wait). Blocks and polls every 30s for up to 15 min (covers the ~10 min NOT_READY -> ONLINE wait after create_mas or a big update_mas, with headroom). No background queue β€” the caller process must stay alive for the wait. Also live-verified that the MAS PATCH endpoint is NOT partial: missing `name` returns 400 Missing required field, missing `agents` returns 400 "At least one BaseAgent must be provided". update_mas already handles this internally (fetches existing + merges), so the full-replace reality stays an internal detail of the HTTP layer β€” callers see a partial-update-shaped API. Skill doc updates: - SKILL.md: reorder list_mas to the top of the check/manage block with a one-liner describing the return shape. - SKILL.md: flag the ~10min NOT_READY wait on add_examples with --wait. - SKILL.md: fix status legend from "(2-5 min)" to "up to ~10 min". - 2-supervisor-agents.md: replace the dual add_examples / add_examples_wait block with a single add_examples [--wait] example. - SKILL.md also includes the KA create-knowledge-source fix from the previous commit's companion page (PARENT + everything-in-json). 
Co-authored-by: Isaac --- .../2-supervisor-agents.md | 16 +- .../databricks-agent-bricks/SKILL.md | 37 ++-- .../scripts/mas_manager.py | 207 ++++++------------ 3 files changed, 93 insertions(+), 167 deletions(-) diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index bbb296d0..accc610d 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -1,6 +1,6 @@ # Supervisor Agents - Details -For commands, see [SKILL.md](SKILL.md). +For commands, see [SKILL.md](SKILL.md). `` in examples = the directory containing SKILL.md. ## Unity Catalog Functions @@ -61,20 +61,20 @@ The `description` field drives routing. Be specific: ## Adding Examples -Examples help evaluation and routing optimization. MAS must be ONLINE. +Examples help evaluation and routing optimization. **The MAS endpoint must be ONLINE.** Right after `create_mas` (or a big `update_mas`), the endpoint is `NOT_READY` and takes **up to ~10 minutes** to come ONLINE. Pass `--wait` to block until then: ```bash -python scripts/mas_manager.py add_examples TILE_ID '[ +# Fails fast if endpoint isn't ONLINE yet +python /scripts/mas_manager.py add_examples TILE_ID '[ {"question": "I need my invoice for March", "guideline": "Route to billing_agent"}, {"question": "API returns 500 error", "guideline": "Route to tech_agent"} ]' -python scripts/mas_manager.py list_examples TILE_ID -``` +# --wait blocks until endpoint is ONLINE (default timeout 15 min) then adds. +# The process must stay alive for the whole wait β€” there is no background queue. +python /scripts/mas_manager.py add_examples TILE_ID '[...]' --wait -**In automated jobs** (waits for ONLINE): -```bash -python scripts/mas_manager.py add_examples_wait TILE_ID '[...]' +python /scripts/mas_manager.py list_examples TILE_ID ``` ## Troubleshooting diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index dc89fa48..9ef56920 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -25,10 +25,17 @@ databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catal # Create KA databricks knowledge-assistants create-knowledge-assistant "Name" "Description" -# Add knowledge source (4 positional args: PARENT DISPLAY_NAME DESCRIPTION SOURCE_TYPE) +# Add knowledge source. With --json, pass ONLY the PARENT as a positional arg +# and put display_name / description / source_type / the source body (files|index|file_table) +# inside the JSON. Mixing positional DISPLAY_NAME/DESCRIPTION/SOURCE_TYPE with --json errors. databricks knowledge-assistants create-knowledge-source \ - "knowledge-assistants/{ka_id}" "Docs" "Documentation files" "files" \ - --json '{"files": {"path": "/Volumes/catalog/schema/volume/"}}' + "knowledge-assistants/{ka_id}" \ + --json '{ + "display_name": "Docs", + "description": "Documentation files", + "source_type": "files", + "files": {"path": "/Volumes/catalog/schema/volume/"} + }' # Sync and check status databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" @@ -47,11 +54,11 @@ databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants ## Supervisor Agent -**No CLI** - use `scripts/mas_manager.py` (run from skill folder): +**No CLI** β€” use `mas_manager.py` from this skill's `scripts/` folder. 
All `/...` paths below are relative to the directory containing this SKILL.md (resolve to the absolute path in your install location). ```bash # Create MAS -python scripts/mas_manager.py create_mas "My Supervisor" '{ +python /scripts/mas_manager.py create_mas "My Supervisor" '{ "description": "Routes queries to specialized agents", "instructions": "Route data questions to analyst, document questions to docs_agent.", "agents": [ @@ -60,14 +67,18 @@ python scripts/mas_manager.py create_mas "My Supervisor" '{ ] }' -# Check status and manage -python scripts/mas_manager.py get_mas TILE_ID -python scripts/mas_manager.py list_mas -python scripts/mas_manager.py update_mas TILE_ID '{"agents": [...]}' -python scripts/mas_manager.py delete_mas TILE_ID +# Check status and manage. list_mas enumerates every MAS you can access +# and returns {tile_id, name, endpoint_status, agents_count} β€” use it to +# find a tile_id / see which MAS are ONLINE before operations. +python /scripts/mas_manager.py list_mas +python /scripts/mas_manager.py get_mas TILE_ID +python /scripts/mas_manager.py update_mas TILE_ID '{"agents": [...]}' +python /scripts/mas_manager.py delete_mas TILE_ID -# Add examples (requires ONLINE) -python scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' +# Add examples β€” requires endpoint_status=ONLINE. After create_mas the MAS is +# NOT_READY and takes up to ~10 min to reach ONLINE. Without --wait, this +# fails fast if not ONLINE yet. With --wait, it blocks until ONLINE then adds. +python /scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' [--wait] # Find IDs databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' @@ -84,7 +95,7 @@ databricks genie list-spaces --output json | jq '.[].space_id' | `uc_function_name` | UC function (`catalog.schema.func`) | | `connection_name` | MCP server (UC HTTP Connection) | -**Status:** `NOT_READY` (2-5 min) β†’ `ONLINE` β†’ `OFFLINE` +**Status:** `NOT_READY` (up to ~10 min after create/big update) β†’ `ONLINE` β†’ `OFFLINE` --- diff --git a/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py b/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py index 27b6b424..bf1d92ee 100644 --- a/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py +++ b/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py @@ -9,10 +9,12 @@ python mas_manager.py update_mas TILE_ID '{"name": ..., "agents": [...], ...}' python mas_manager.py delete_mas TILE_ID python mas_manager.py list_mas - python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' - python mas_manager.py add_examples_queued TILE_ID '[{"question": "...", "guideline": "..."}]' + python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' [--wait] python mas_manager.py list_examples TILE_ID + --wait on add_examples blocks until the MAS endpoint reaches ONLINE state + (up to ~10 min after a create/update) before adding the examples. 
+ Requires: databricks-sdk, requests pip install databricks-sdk requests """ @@ -21,12 +23,11 @@ import logging import re import sys -import threading import time from concurrent.futures import ThreadPoolExecutor, as_completed from dataclasses import dataclass from enum import Enum -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional import requests from databricks.sdk import WorkspaceClient @@ -287,98 +288,6 @@ def _delete(self, path: str) -> Dict[str, Any]: return response.json() -# ============================================================================ -# Example Queue (for adding examples when MAS becomes ONLINE) -# ============================================================================ - - -class TileExampleQueue: - """Background queue for adding examples to tiles that aren't ready yet.""" - - def __init__(self, poll_interval: float = 30.0, max_attempts: int = 120): - self.queue: Dict[str, Tuple[MASManager, List[Dict[str, Any]], float, int]] = {} - self.lock = threading.Lock() - self.running = False - self.thread: Optional[threading.Thread] = None - self.poll_interval = poll_interval - self.max_attempts = max_attempts - - def enqueue(self, tile_id: str, manager: MASManager, questions: List[Dict[str, Any]]) -> None: - """Add a tile and its questions to the processing queue.""" - with self.lock: - self.queue[tile_id] = (manager, questions, time.time(), 0) - logger.info(f"Enqueued {len(questions)} examples for MAS {tile_id}") - - if not self.running: - self.start() - - def start(self) -> None: - """Start the background processing thread.""" - if not self.running: - self.running = True - self.thread = threading.Thread(target=self._process_loop, daemon=True) - self.thread.start() - - def stop(self) -> None: - """Stop the background processing thread.""" - self.running = False - if self.thread: - self.thread.join(timeout=5) - - def _process_loop(self) -> None: - """Background loop that checks tile status and adds examples when ready.""" - while self.running: - try: - with self.lock: - items_to_process = list(self.queue.items()) - - for tile_id, (manager, questions, enqueue_time, attempt_count) in items_to_process: - try: - if attempt_count >= self.max_attempts: - logger.error(f"MAS {tile_id} exceeded max attempts. 
Removing from queue.") - with self.lock: - self.queue.pop(tile_id, None) - continue - - with self.lock: - if tile_id in self.queue: - self.queue[tile_id] = (manager, questions, enqueue_time, attempt_count + 1) - - status = manager.get_endpoint_status(tile_id) - - if status == EndpointStatus.ONLINE.value: - logger.info(f"MAS {tile_id} is ONLINE, adding {len(questions)} examples...") - created = manager.add_examples_batch(tile_id, questions) - logger.info(f"Added {len(created)} examples to MAS {tile_id}") - with self.lock: - self.queue.pop(tile_id, None) - - except Exception as e: - logger.error(f"Error processing MAS {tile_id}: {e}") - with self.lock: - self.queue.pop(tile_id, None) - - except Exception as e: - logger.error(f"Error in queue processor: {e}") - - time.sleep(self.poll_interval) - - -# Global singleton queue instance -_tile_example_queue: Optional[TileExampleQueue] = None -_queue_lock = threading.Lock() - - -def get_tile_example_queue() -> TileExampleQueue: - """Get or create the global tile example queue instance.""" - global _tile_example_queue - if _tile_example_queue is None: - with _queue_lock: - if _tile_example_queue is None: - _tile_example_queue = TileExampleQueue() - return _tile_example_queue - - # ============================================================================ # CLI Functions # ============================================================================ @@ -478,7 +387,8 @@ def get_mas(tile_id: str) -> Dict[str, Any]: "description": tile_data.get("description", ""), "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), "agents": mas_data.get("agents", []), - "instructions": mas_data.get("instructions", ""), + # instructions live on the tile, not on the mas_data root + "instructions": tile_data.get("instructions", ""), } @@ -528,7 +438,8 @@ def update_mas( final_name = name or tile_data.get("name", "") final_description = description or tile_data.get("description", "") - final_instructions = instructions or mas_data.get("instructions", "") + # instructions live on the tile in GET responses, not on the mas_data root + final_instructions = instructions or tile_data.get("instructions", "") if agents: agent_list = _build_agent_list(agents) @@ -587,58 +498,70 @@ def list_mas() -> List[Dict[str, Any]]: return results -def add_examples(tile_id: str, examples: List[Dict[str, Any]]) -> Dict[str, Any]: - """Add example questions to a Supervisor Agent.""" +def add_examples( + tile_id: str, + examples: List[Dict[str, Any]], + wait_for_online: bool = False, + wait_timeout_seconds: int = 900, + poll_interval_seconds: float = 30.0, +) -> Dict[str, Any]: + """Add example questions to a Supervisor Agent. + + Examples can only be added once the MAS endpoint is ONLINE. Right after + create_mas or a big update_mas the endpoint is NOT_READY and takes up to + ~10 min to come ONLINE. + + If wait_for_online=True, this call BLOCKS synchronously until the endpoint + reaches ONLINE (polling every `poll_interval_seconds`, up to + `wait_timeout_seconds`). Default timeout 15 min covers the ~10 min + provisioning with some headroom. The caller's process MUST stay alive for + the whole duration β€” there is no background queue. + + If wait_for_online=False (default) and the endpoint is not ONLINE, returns + an error immediately without adding anything. Retry later once ONLINE. 
+ """ manager = _get_manager() status = get_mas(tile_id) if "error" in status: return status - if status.get("endpoint_status") != "ONLINE": - return { - "error": f"MAS is not ONLINE (status: {status.get('endpoint_status')}). " - "Use add_examples_queued to queue examples for when it's ready.", - "tile_id": tile_id, - } + current = status.get("endpoint_status") + + if current != "ONLINE": + if not wait_for_online: + return { + "error": f"MAS is not ONLINE (status: {current}). " + "Retry once it's ONLINE, or pass wait_for_online=True " + "(--wait on the CLI) to block until it comes up.", + "tile_id": tile_id, + "endpoint_status": current, + } + + # Block-and-poll until ONLINE or timeout. + deadline = time.monotonic() + wait_timeout_seconds + while time.monotonic() < deadline: + current = manager.get_endpoint_status(tile_id) + if current == "ONLINE": + break + time.sleep(poll_interval_seconds) + else: + return { + "error": f"Timed out after {wait_timeout_seconds}s waiting for MAS " + f"to reach ONLINE (last status: {current}).", + "tile_id": tile_id, + "endpoint_status": current, + } created = manager.add_examples_batch(tile_id, examples) return { "tile_id": tile_id, "added_count": len(created), "total_requested": len(examples), + "endpoint_status": current, } -def add_examples_queued(tile_id: str, examples: List[Dict[str, Any]]) -> Dict[str, Any]: - """Queue example questions to be added when MAS becomes ONLINE.""" - manager = _get_manager() - - status = get_mas(tile_id) - if "error" in status: - return status - - if status.get("endpoint_status") == "ONLINE": - created = manager.add_examples_batch(tile_id, examples) - return { - "tile_id": tile_id, - "status": "added", - "added_count": len(created), - "total_requested": len(examples), - } - else: - queue = get_tile_example_queue() - queue.start() - queue.enqueue(tile_id, manager, examples) - return { - "tile_id": tile_id, - "status": "queued", - "queued_count": len(examples), - "endpoint_status": status.get("endpoint_status"), - "message": "Examples will be added automatically when endpoint becomes ONLINE", - } - - def list_examples(tile_id: str) -> Dict[str, Any]: """List all examples for a Supervisor Agent.""" manager = _get_manager() @@ -719,20 +642,12 @@ def main(): elif command == "add_examples": if len(sys.argv) < 4: - print("Usage: python mas_manager.py add_examples TILE_ID '[{\"question\": \"...\", \"guideline\": \"...\"}]'") - sys.exit(1) - tile_id = sys.argv[2] - examples = json.loads(sys.argv[3]) - result = add_examples(tile_id, examples) - _print_json(result) - - elif command == "add_examples_queued": - if len(sys.argv) < 4: - print("Usage: python mas_manager.py add_examples_queued TILE_ID '[{\"question\": \"...\", \"guideline\": \"...\"}]'") + print("Usage: python mas_manager.py add_examples TILE_ID '[{\"question\": \"...\", \"guideline\": \"...\"}]' [--wait]") sys.exit(1) tile_id = sys.argv[2] examples = json.loads(sys.argv[3]) - result = add_examples_queued(tile_id, examples) + wait = "--wait" in sys.argv[4:] + result = add_examples(tile_id, examples, wait_for_online=wait) _print_json(result) elif command == "list_examples": From 407c203bba57df650b06bc2bbad59a2e34f80fa9 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Sat, 25 Apr 2026 00:22:33 +0200 Subject: [PATCH 57/66] Remove databricks-lakebase-provisioned skill MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lakebase Autoscaling is the canonical path for all new Lakebase work (autoscaling, branching, scale-to-zero, 
point-in-time restore). The Provisioned skill covers the predecessor fixed-capacity tier; keeping both causes agents to spend time deciding between them or picking the older one. Delete the Provisioned skill and point everything at autoscale. Files deleted: - databricks-skills/databricks-lakebase-provisioned/SKILL.md - databricks-skills/databricks-lakebase-provisioned/connection-patterns.md - databricks-skills/databricks-lakebase-provisioned/reverse-etl.md Cross-references updated: - install_skills.sh: drop from DATABRICKS_SKILLS list, description map, and reference-files map. - README.md: replace the Provisioned bullet with a Lakebase Autoscale bullet under the same Development & Deployment section. - databricks-python-sdk/SKILL.md, databricks-app-python/SKILL.md: redirect the "Related Skills" link to databricks-lakebase-autoscale. - databricks-lakebase-autoscale/SKILL.md: drop the now-meaningless "Provisioned vs Autoscaling" comparison table and the predecessor link. Keep the one prose mention in computes.md explaining CU RAM sizing context β€” that's justification, not a link. Co-authored-by: Isaac --- databricks-skills/README.md | 2 +- .../databricks-app-python/SKILL.md | 2 +- .../databricks-lakebase-autoscale/SKILL.md | 15 +- .../databricks-lakebase-provisioned/SKILL.md | 312 ------------------ .../connection-patterns.md | 279 ---------------- .../reverse-etl.md | 171 ---------- .../databricks-python-sdk/SKILL.md | 2 +- databricks-skills/install_skills.sh | 4 +- 8 files changed, 5 insertions(+), 782 deletions(-) delete mode 100644 databricks-skills/databricks-lakebase-provisioned/SKILL.md delete mode 100644 databricks-skills/databricks-lakebase-provisioned/connection-patterns.md delete mode 100644 databricks-skills/databricks-lakebase-provisioned/reverse-etl.md diff --git a/databricks-skills/README.md b/databricks-skills/README.md index 95e9a3f2..a5e50868 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -105,7 +105,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-app-python** - Python web apps (Dash, Streamlit, Flask) with foundation model integration - **databricks-python-sdk** - Python SDK, Connect, CLI, REST API - **databricks-config** - Profile authentication setup -- **databricks-lakebase-provisioned** - Managed PostgreSQL for OLTP workloads +- **databricks-lakebase-autoscale** - Lakebase Autoscaling managed PostgreSQL with branching, scale-to-zero, reverse ETL ### πŸ“š Reference - **databricks-docs** - Documentation index via llms.txt diff --git a/databricks-skills/databricks-app-python/SKILL.md b/databricks-skills/databricks-app-python/SKILL.md index 7b34b74b..c5a595a5 100644 --- a/databricks-skills/databricks-app-python/SKILL.md +++ b/databricks-skills/databricks-app-python/SKILL.md @@ -207,5 +207,5 @@ class EntityIn(BaseModel): - **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps with FastAPI + React - **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps via DABs - **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - backend SDK integration -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - adding persistent PostgreSQL state +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - adding persistent PostgreSQL state (autoscaling managed PG with branching) - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving ML models for app integration diff --git 
a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index fdcbb28f..797b8bd4 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -184,23 +184,10 @@ Enable CDF on the source for TRIGGERED/CONTINUOUS: `ALTER TABLE ... SET TBLPROPE ## Not Yet Supported -HA readable secondaries (use read replicas), Databricks Apps UI integration (manual connect works), Feature Store, stateful AI agents (LangChain memory), Postgresβ†’Delta sync, custom billing tags / serverless budget policies, direct Provisionedβ†’Autoscaling migration (use `pg_dump`/`pg_restore` or reverse ETL). - -## Provisioned vs Autoscaling - -| Aspect | Provisioned | Autoscaling | -|--------|-------------|-------------| -| CLI group | `databricks database` | `databricks postgres` | -| Top-level | Instance | Project | -| Capacity | CU_1/2/4/8 (16 GB/CU) | 0.5-112 CU (2 GB/CU) | -| Branching | No | Yes | -| Scale-to-zero | No | Yes | -| Operations | Synchronous | LRO | -| Read replicas | Readable secondaries | Dedicated read-only endpoints | +HA readable secondaries (use read replicas), Databricks Apps UI integration (manual connect works), Feature Store, stateful AI agents (LangChain memory), Postgresβ†’Delta sync, custom billing tags / serverless budget policies. ## Related Skills -- [databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md) β€” fixed-capacity predecessor - [databricks-app-apx](../databricks-app-apx/SKILL.md), [databricks-app-python](../databricks-app-python/SKILL.md) β€” apps using Lakebase - [databricks-bundles](../databricks-bundles/SKILL.md) β€” bundle deploys with Lakebase resources - [databricks-jobs](../databricks-jobs/SKILL.md) β€” scheduling reverse ETL pipelines diff --git a/databricks-skills/databricks-lakebase-provisioned/SKILL.md b/databricks-skills/databricks-lakebase-provisioned/SKILL.md deleted file mode 100644 index 1d7e0589..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/SKILL.md +++ /dev/null @@ -1,312 +0,0 @@ ---- -name: databricks-lakebase-provisioned -description: "Patterns and best practices for Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. Use when creating Lakebase instances, connecting applications or Databricks Apps to PostgreSQL, implementing reverse ETL via synced tables, storing agent or chat memory, or configuring OAuth authentication for Lakebase." ---- - -# Lakebase Provisioned - -Patterns and best practices for using Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. - -## When to Use - -Use this skill when: -- Building applications that need a PostgreSQL database for transactional workloads -- Adding persistent state to Databricks Apps -- Implementing reverse ETL from Delta Lake to an operational database -- Storing chat/agent memory for LangChain applications - -## Overview - -Lakebase Provisioned is Databricks' managed PostgreSQL database service for OLTP (Online Transaction Processing) workloads. It provides a fully managed PostgreSQL-compatible database that integrates with Unity Catalog and supports OAuth token-based authentication. 
- -| Feature | Description | -|---------|-------------| -| **Managed PostgreSQL** | Fully managed instances with automatic provisioning | -| **OAuth Authentication** | Token-based auth via Databricks SDK (1-hour expiry) | -| **Unity Catalog** | Register databases for governance | -| **Reverse ETL** | Sync data from Delta tables to PostgreSQL | -| **Apps Integration** | First-class support in Databricks Apps | - -**Available Regions (AWS):** us-east-1, us-east-2, us-west-2, eu-central-1, eu-west-1, ap-south-1, ap-southeast-1, ap-southeast-2 - -## Quick Start - -Create and connect to a Lakebase Provisioned instance: - -```python -from databricks.sdk import WorkspaceClient -import uuid - -# Initialize client -w = WorkspaceClient() - -# Create a database instance -instance = w.database.create_database_instance( - name="my-lakebase-instance", - capacity="CU_1", # CU_1, CU_2, CU_4, CU_8 - stopped=False -) -print(f"Instance created: {instance.name}") -print(f"DNS endpoint: {instance.read_write_dns}") -``` - -## Common Patterns - -### Generate OAuth Token - -```python -from databricks.sdk import WorkspaceClient -import uuid - -w = WorkspaceClient() - -# Generate OAuth token for database connection -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) -token = cred.token # Use this as password in connection string -``` - -### Connect from Notebook - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -# Get instance details -w = WorkspaceClient() -instance = w.database.get_database_instance(name="my-lakebase-instance") - -# Generate token -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) - -# Connect using psycopg3 -conn_string = f"host={instance.read_write_dns} dbname=postgres user={w.current_user.me().user_name} password={cred.token} sslmode=require" -with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) -``` - -### SQLAlchemy with Token Refresh (Production) - -For long-running applications, tokens must be refreshed (expire after 1 hour): - -```python -import asyncio -import os -import uuid -from sqlalchemy import event -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession -from sqlalchemy.orm import sessionmaker -from databricks.sdk import WorkspaceClient - -# Token refresh state -_current_token = None -_token_refresh_task = None -TOKEN_REFRESH_INTERVAL = 50 * 60 # 50 minutes (before 1-hour expiry) - -def _generate_token(instance_name: str) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - return cred.token - -async def _token_refresh_loop(instance_name: str): - """Background task to refresh token every 50 minutes.""" - global _current_token - while True: - await asyncio.sleep(TOKEN_REFRESH_INTERVAL) - _current_token = await asyncio.to_thread(_generate_token, instance_name) - -def init_database(instance_name: str, database_name: str, username: str) -> AsyncEngine: - """Initialize database with OAuth token injection.""" - global _current_token - - w = WorkspaceClient() - instance = w.database.get_database_instance(name=instance_name) - - # Generate initial token - _current_token = _generate_token(instance_name) - - # Build URL (password injected via do_connect) - url = 
f"postgresql+psycopg://{username}@{instance.read_write_dns}:5432/{database_name}" - - engine = create_async_engine( - url, - pool_size=5, - max_overflow=10, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on each connection - @event.listens_for(engine.sync_engine, "do_connect") - def provide_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = _current_token - - return engine -``` - -### Databricks Apps Integration - -For Databricks Apps, use environment variables for configuration: - -```python -# Environment variables set by Databricks Apps: -# - LAKEBASE_INSTANCE_NAME: Instance name -# - LAKEBASE_DATABASE_NAME: Database name -# - LAKEBASE_USERNAME: Username (optional, defaults to service principal) - -import os - -def is_lakebase_configured() -> bool: - """Check if Lakebase is configured for this app.""" - return bool( - os.environ.get("LAKEBASE_PG_URL") or - (os.environ.get("LAKEBASE_INSTANCE_NAME") and - os.environ.get("LAKEBASE_DATABASE_NAME")) - ) -``` - -Add Lakebase as an app resource via CLI: - -```bash -databricks apps add-resource $APP_NAME \ - --resource-type database \ - --resource-name lakebase \ - --database-instance my-lakebase-instance -``` - -### Register with Unity Catalog - -```python -from databricks.sdk import WorkspaceClient - -w = WorkspaceClient() - -# Register database in Unity Catalog -w.database.register_database_instance( - name="my-lakebase-instance", - catalog="my_catalog", - schema="my_schema" -) -``` - -### MLflow Model Resources - -Declare Lakebase as a model resource for automatic credential provisioning: - -```python -from mlflow.models.resources import DatabricksLakebase - -resources = [ - DatabricksLakebase(database_instance_name="my-lakebase-instance"), -] - -# When logging model -mlflow.langchain.log_model( - model, - artifact_path="model", - resources=resources, - pip_requirements=["databricks-langchain[memory]"] -) -``` - -## Reference Files - -- [connection-patterns.md](connection-patterns.md) - Detailed connection patterns for different use cases -- [reverse-etl.md](reverse-etl.md) - Syncing data from Delta Lake to Lakebase - -## CLI Commands - -### Instance Management - -```bash -# Create instance (NAME is positional) -databricks database create-database-instance my-lakebase-instance --capacity CU_1 - -# Get instance details -databricks database get-database-instance my-lakebase-instance - -# List instances -databricks database list-database-instances - -# Stop instance (saves cost) -databricks database stop-database-instance my-lakebase-instance - -# Start instance -databricks database start-database-instance my-lakebase-instance - -# Delete instance -databricks database delete-database-instance my-lakebase-instance -``` - -### OAuth Credentials - -```bash -# Generate credentials for connection -databricks database generate-database-credential \ - --request-id $(uuidgen) \ - --json '{"instance_names": ["my-lakebase-instance"]}' -``` - -### Reverse ETL (Synced Tables) - -Synced tables are managed via Unity Catalog SQL commands: - -```sql --- Create synced table from Delta to Lakebase -CREATE TABLE lakebase_catalog.schema.target_table -SYNC FROM catalog.schema.source_delta_table -SCHEDULE TRIGGERED; - --- List synced tables -SHOW TABLES IN lakebase_catalog.schema; - --- Drop synced table -DROP TABLE lakebase_catalog.schema.target_table; -``` - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Token expired during long query** | Implement token refresh loop (see SQLAlchemy with 
Token Refresh section); tokens expire after 1 hour | -| **DNS resolution fails on macOS** | Use `dig` command to resolve hostname, pass `hostaddr` to psycopg | -| **Connection refused** | Ensure instance is not stopped; check `instance.state` | -| **Permission denied** | User must be granted access to the Lakebase instance | -| **SSL required error** | Always use `sslmode=require` in connection string | - -## SDK Version Requirements - -- **Databricks SDK for Python**: >= 0.61.0 (0.81.0+ recommended for full API support) -- **psycopg**: 3.x (supports `hostaddr` parameter for DNS workaround) -- **SQLAlchemy**: 2.x with `postgresql+psycopg` driver - -```python -%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy -``` - -## Notes - -- **Capacity values** use compute unit sizing: `CU_1`, `CU_2`, `CU_4`, `CU_8`. -- **Lakebase Autoscaling** is a newer offering with automatic scaling but limited regional availability. This skill focuses on **Lakebase Provisioned** which is more widely available. -- For memory/state in LangChain agents, use `databricks-langchain[memory]` which includes Lakebase support. -- Tokens are short-lived (1 hour) - production apps MUST implement token refresh. - -## Related Skills - -- **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence -- **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend -- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for instance management and token generation -- **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps with Lakebase resources -- **[databricks-jobs](../databricks-jobs/SKILL.md)** - scheduling reverse ETL sync jobs diff --git a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md b/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md deleted file mode 100644 index e6843548..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md +++ /dev/null @@ -1,279 +0,0 @@ -# Lakebase Connection Patterns - -## Overview - -This document covers different connection patterns for Lakebase Provisioned, from simple scripts to production applications with token refresh. - -## Connection Methods - -### 1. Direct psycopg Connection (Simple Scripts) - -For one-off scripts or notebooks: - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -def get_connection(instance_name: str, database_name: str = "postgres"): - """Get a database connection with fresh OAuth token.""" - w = WorkspaceClient() - - # Get instance details - instance = w.database.get_database_instance(name=instance_name) - - # Generate OAuth token (valid for 1 hour) - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - - # Build connection string - conn_string = ( - f"host={instance.read_write_dns} " - f"dbname={database_name} " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" - ) - - return psycopg.connect(conn_string) - -# Usage -with get_connection("my-instance") as conn: - with conn.cursor() as cur: - cur.execute("SELECT NOW()") - print(cur.fetchone()) -``` - -### 2. 
Connection Pool with Token Refresh (Production) - -For long-running applications that need connection pooling: - -```python -import asyncio -import uuid -from contextlib import asynccontextmanager -from typing import AsyncGenerator, Optional - -from sqlalchemy import event -from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker -from databricks.sdk import WorkspaceClient - -class LakebaseConnectionManager: - """Manages Lakebase connections with automatic token refresh.""" - - def __init__( - self, - instance_name: str, - database_name: str, - pool_size: int = 5, - max_overflow: int = 10, - token_refresh_seconds: int = 3000 # 50 minutes - ): - self.instance_name = instance_name - self.database_name = database_name - self.pool_size = pool_size - self.max_overflow = max_overflow - self.token_refresh_seconds = token_refresh_seconds - - self._current_token: Optional[str] = None - self._refresh_task: Optional[asyncio.Task] = None - self._engine = None - self._session_maker = None - - def _generate_token(self) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[self.instance_name] - ) - return cred.token - - async def _refresh_loop(self): - """Background task to refresh token periodically.""" - while True: - await asyncio.sleep(self.token_refresh_seconds) - try: - self._current_token = await asyncio.to_thread(self._generate_token) - except Exception as e: - print(f"Token refresh failed: {e}") - - def initialize(self): - """Initialize database engine and start token refresh.""" - w = WorkspaceClient() - - # Get instance info - instance = w.database.get_database_instance(name=self.instance_name) - username = w.current_user.me().user_name - - # Generate initial token - self._current_token = self._generate_token() - - # Create engine (password injected via event) - url = ( - f"postgresql+psycopg://{username}@" - f"{instance.read_write_dns}:5432/{self.database_name}" - ) - - self._engine = create_async_engine( - url, - pool_size=self.pool_size, - max_overflow=self.max_overflow, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on connect - @event.listens_for(self._engine.sync_engine, "do_connect") - def inject_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = self._current_token - - self._session_maker = async_sessionmaker( - self._engine, - class_=AsyncSession, - expire_on_commit=False - ) - - def start_refresh(self): - """Start background token refresh task.""" - if not self._refresh_task: - self._refresh_task = asyncio.create_task(self._refresh_loop()) - - async def stop_refresh(self): - """Stop token refresh task.""" - if self._refresh_task: - self._refresh_task.cancel() - try: - await self._refresh_task - except asyncio.CancelledError: - pass - self._refresh_task = None - - @asynccontextmanager - async def session(self) -> AsyncGenerator[AsyncSession, None]: - """Get a database session.""" - async with self._session_maker() as session: - yield session - - async def close(self): - """Close all connections.""" - await self.stop_refresh() - if self._engine: - await self._engine.dispose() - -# Usage in FastAPI -from fastapi import FastAPI - -app = FastAPI() -db_manager = LakebaseConnectionManager("my-instance", "my_database") - -@app.on_event("startup") -async def startup(): - db_manager.initialize() - db_manager.start_refresh() - -@app.on_event("shutdown") -async def shutdown(): - await db_manager.close() 
- -@app.get("/data") -async def get_data(): - async with db_manager.session() as session: - result = await session.execute("SELECT * FROM my_table") - return result.fetchall() -``` - -### 3. Static URL Mode (Local Development) - -For local development, use a static connection URL: - -```python -import os -from sqlalchemy.ext.asyncio import create_async_engine - -# Set environment variable with full connection URL -# LAKEBASE_PG_URL=postgresql://user:password@host:5432/database - -def get_database_url() -> str: - """Get database URL from environment.""" - url = os.environ.get("LAKEBASE_PG_URL") - if url and url.startswith("postgresql://"): - # Convert to psycopg3 async driver - url = url.replace("postgresql://", "postgresql+psycopg://", 1) - return url - -engine = create_async_engine( - get_database_url(), - pool_size=5, - connect_args={"sslmode": "require"} -) -``` - -### 4. DNS Resolution Workaround (macOS) - -Python's `socket.getaddrinfo()` fails with long hostnames on macOS. Use `dig` as fallback: - -```python -import subprocess -import socket - -def resolve_hostname(hostname: str) -> str: - """Resolve hostname using dig command (macOS workaround).""" - try: - # Try Python's resolver first - return socket.gethostbyname(hostname) - except socket.gaierror: - pass - - # Fallback to dig command - try: - result = subprocess.run( - ["dig", "+short", hostname], - capture_output=True, - text=True, - timeout=5 - ) - ips = result.stdout.strip().split('\n') - for ip in ips: - if ip and not ip.startswith(';'): - return ip - except Exception: - pass - - raise RuntimeError(f"Could not resolve hostname: {hostname}") - -# Use with psycopg -conn_params = { - "host": hostname, # For TLS SNI - "hostaddr": resolve_hostname(hostname), # Actual IP - "dbname": database_name, - "user": username, - "password": token, - "sslmode": "require" -} -conn = psycopg.connect(**conn_params) -``` - -## Environment Variables - -| Variable | Description | Required | -|----------|-------------|----------| -| `LAKEBASE_PG_URL` | Static PostgreSQL URL (local dev) | Either this OR instance/database | -| `LAKEBASE_INSTANCE_NAME` | Lakebase instance name | With DATABASE_NAME | -| `LAKEBASE_DATABASE_NAME` | Database name | With INSTANCE_NAME | -| `LAKEBASE_USERNAME` | Override username | No | -| `LAKEBASE_HOST` | Override host | No | -| `DB_POOL_SIZE` | Connection pool size | No (default: 5) | -| `DB_MAX_OVERFLOW` | Max pool overflow | No (default: 10) | -| `DB_POOL_RECYCLE_INTERVAL` | Pool recycle seconds | No (default: 3600) | - -## Best Practices - -1. **Always use SSL**: Set `sslmode=require` in all connections -2. **Implement token refresh**: Tokens expire after 1 hour; refresh at 50 minutes -3. **Use connection pooling**: Avoid creating new connections per request -4. **Handle DNS issues on macOS**: Use the `hostaddr` workaround if needed -5. **Close connections properly**: Use context managers or explicit cleanup -6. **Log token refresh events**: Helps debug authentication issues diff --git a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md b/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md deleted file mode 100644 index 5b5caef4..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md +++ /dev/null @@ -1,171 +0,0 @@ -# Reverse ETL with Lakebase Provisioned - -## Overview - -Reverse ETL allows you to sync data from Unity Catalog Delta tables into Lakebase Provisioned as PostgreSQL tables. This enables OLTP access patterns on data processed in the Lakehouse. 
- -## Sync Modes - -| Mode | Description | Best For | Notes | -|------|-------------|----------|-------| -| **Snapshot** | One-time full copy | Initial setup, small tables | 10x more efficient if modifying >10% of data | -| **Triggered** | Scheduled updates on demand | Dashboards updated hourly/daily | Requires CDF on source table | -| **Continuous** | Real-time streaming (seconds of latency) | Live applications | Highest cost, minimum 15s intervals, requires CDF | - -**Note:** Triggered and Continuous modes require Change Data Feed (CDF) enabled on the source table: - -```sql -ALTER TABLE your_catalog.your_schema.your_table -SET TBLPROPERTIES (delta.enableChangeDataFeed = true) -``` - -## Creating Synced Tables - -### Using Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, - SyncedTableSpec, - SyncedTableSchedulingPolicy, -) - -w = WorkspaceClient() - -# Create a synced table from Unity Catalog to Lakebase Provisioned -synced_table = w.database.create_synced_database_table( - SyncedDatabaseTable( - name="lakebase_catalog.schema.synced_table", - database_instance_name="my-lakebase-instance", - spec=SyncedTableSpec( - source_table_full_name="analytics.gold.user_profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -print(f"Created synced table: {synced_table.name}") -``` - -**Key parameters:** - -| Parameter | Description | -|-----------|-------------| -| `name` | Fully qualified target table name (catalog.schema.table) | -| `database_instance_name` | Lakebase Provisioned instance name | -| `source_table_full_name` | Fully qualified source Delta table (catalog.schema.table) | -| `primary_key_columns` | List of primary key columns from the source table | -| `scheduling_policy` | `SNAPSHOT`, `TRIGGERED`, or `CONTINUOUS` | - -### Using CLI - -```bash -databricks database create-synced-database-table \ - --json '{ - "name": "lakebase_catalog.schema.synced_table", - "database_instance_name": "my-lakebase-instance", - "spec": { - "source_table_full_name": "analytics.gold.user_profiles", - "primary_key_columns": ["user_id"], - "scheduling_policy": "TRIGGERED" - } - }' -``` - -**Note:** There is no SQL syntax for creating synced tables. Use the Python SDK, CLI, or Catalog Explorer UI. - -## Checking Synced Table Status - -```python -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") -print(f"State: {status.data_synchronization_status.detailed_state}") -print(f"Message: {status.data_synchronization_status.message}") -``` - -## Deleting a Synced Table - -Delete from both Unity Catalog and Postgres: - -1. **Unity Catalog:** Delete via Catalog Explorer or SDK -2. **Postgres:** Drop the table to free storage - -```python -# Delete the synced table via SDK -w.database.delete_synced_database_table(name="lakebase_catalog.schema.synced_table") -``` - -```sql --- Drop the Postgres table to free storage -DROP TABLE your_database.your_schema.your_table; -``` - -## Use Cases - -### 1. 
Product Catalog for Web App - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ecommerce_catalog.public.products", - database_instance_name="ecommerce-db", - spec=SyncedTableSpec( - source_table_full_name="gold.products.catalog", - primary_key_columns=["product_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -# Application queries PostgreSQL directly with low-latency point lookups -``` - -### 2. User Profiles for Authentication - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="auth_catalog.public.user_profiles", - database_instance_name="auth-db", - spec=SyncedTableSpec( - source_table_full_name="gold.users.profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -``` - -### 3. Feature Store for Real-time ML - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ml_catalog.public.user_features", - database_instance_name="feature-store-db", - spec=SyncedTableSpec( - source_table_full_name="ml.features.user_features", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -# ML model queries features with low latency -``` - -## Best Practices - -1. **Enable CDF** on source tables before creating Triggered or Continuous synced tables -2. **Choose appropriate sync mode**: Snapshot for small tables or one-time loads, Triggered for hourly/daily refreshes, Continuous for real-time -3. **Monitor sync status**: Check for failures and latency via Catalog Explorer or `get_synced_database_table()` -4. **Index target tables**: Create appropriate indexes in PostgreSQL for your query patterns -5. **Handle schema changes**: Only additive changes (e.g., adding columns) are supported for Triggered/Continuous modes -6. 
**Account for connection limits**: Each synced table uses up to 16 connections - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Sync fails with CDF error** | Enable Change Data Feed on source table before using Triggered or Continuous mode | -| **Schema mismatch** | Only additive schema changes are supported; for breaking changes, delete and recreate the synced table | -| **Sync takes too long** | Switch to Triggered mode for scheduled updates; use Snapshot for initial bulk loads | -| **Target table locked** | Avoid DDL on target during sync operations | diff --git a/databricks-skills/databricks-python-sdk/SKILL.md b/databricks-skills/databricks-python-sdk/SKILL.md index 4d03b5ce..bbe2dada 100644 --- a/databricks-skills/databricks-python-sdk/SKILL.md +++ b/databricks-skills/databricks-python-sdk/SKILL.md @@ -622,4 +622,4 @@ If I'm unsure about a method, I should: - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - catalog governance - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving endpoint management - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - vector index operations -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - managed PostgreSQL via SDK +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - managed PostgreSQL with autoscaling + branching diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 3613b00c..b91dc3c5 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -47,7 +47,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -83,7 +83,6 @@ get_skill_description() { "databricks-execution-compute") echo "Execute code and manage compute on Databricks - serverless, clusters, and SQL warehouses" ;; 
"databricks-unity-catalog") echo "System tables for lineage, audit, billing" ;; "databricks-lakebase-autoscale") echo "Lakebase Autoscale - managed PostgreSQL with autoscaling" ;; - "databricks-lakebase-provisioned") echo "Lakebase Provisioned - data connections and reverse ETL" ;; "databricks-metric-views") echo "Unity Catalog Metric Views - governed business metrics in YAML" ;; "databricks-model-serving") echo "Model Serving - deploy MLflow models and AI agents" ;; "databricks-mlflow-evaluation") echo "MLflow evaluation and trace analysis" ;; @@ -124,7 +123,6 @@ get_skill_extra_files() { "databricks-python-sdk") echo "doc-index.md examples/1-authentication.py examples/2-clusters-and-jobs.py examples/3-sql-and-warehouses.py examples/4-unity-catalog.py examples/5-serving-and-vector-search.py" ;; "databricks-unity-catalog") echo "5-system-tables.md" ;; "databricks-lakebase-autoscale") echo "projects.md branches.md computes.md connection-patterns.md reverse-etl.md" ;; - "databricks-lakebase-provisioned") echo "connection-patterns.md reverse-etl.md" ;; "databricks-metric-views") echo "yaml-reference.md patterns.md" ;; "databricks-model-serving") echo "1-classical-ml.md 2-custom-pyfunc.md 3-genai-agents.md 4-tools-integration.md 5-development-testing.md 6-logging-registration.md 7-deployment.md 8-querying-endpoints.md 9-package-requirements.md" ;; "databricks-mlflow-evaluation") echo "references/CRITICAL-interfaces.md references/GOTCHAS.md references/patterns-context-optimization.md references/patterns-datasets.md references/patterns-evaluation.md references/patterns-scorers.md references/patterns-trace-analysis.md references/user-journeys.md" ;; From 385a6dec4a7893e9a01593811dff9d8132b783f7 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Mon, 27 Apr 2026 14:43:01 +0200 Subject: [PATCH 58/66] Polish skill clarity: parallel SQL probes, Genie ID rules, SDP first run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent doc improvements surfaced while exercising the skills end-to-end: databricks-aibi-dashboards SKILL.md Add a dense Statement Execution API fan-out snippet to Step 2 so multiple discovery probes (cardinality, top values, distribution, trend viability) run in parallel instead of serializing through `tools query`. Submit with `wait_timeout:"0s"` returns a `statement_id` immediately; `databricks api get /api/2.0/sql/statements/$SID` polls for state ∈ PENDING|RUNNING|SUCCEEDED|FAILED|CANCELED|CLOSED. Live-verified against TPC-H samples β€” 5 probes in 17s wall time. databricks-genie SKILL.md Same parallel-probe snippet in Step 1 (gated on "if you don't already know the data" so it's skippable). Then add two missing serialized_space rules to Field Format Requirements that previously caused 3+ retry rounds on space creation: - IDs must be unique across all three lists combined (text_instructions / example_question_sqls / sample_questions); the API rejects cross-list duplicates with "Duplicate instruction ID '...': first seen in ..., duplicated in ...". - data_sources.tables must be sorted by identifier; example_question_sqls and text_instructions must be sorted by id. (sample_questions is silently re-sorted server-side, so it doesn't enforce.) Plus a simple ID scheme that satisfies both rules in one go: per-list prefix + monotonic counter (1…0001 for sample_questions, 2…0001 for example_question_sqls, 3…0001 for text_instructions). Authoring order = sort order, no collisions. Live-verified. 
databricks-spark-declarative-pipelines SKILL.md Step 1 now spells out the first-run flow (start-update on a freshly created pipeline, latest_updates is null until then) and gives a null-safe jq snippet so polling doesn't crash on never-run pipelines: (.latest_updates // [{}])[0]. The Updating section drops its duplicate start-update line and points back to Step 1 for the canonical command. All three changes verified live on the workspace. Co-authored-by: Isaac --- .../databricks-aibi-dashboards/SKILL.md | 9 ++++++++ databricks-skills/databricks-genie/SKILL.md | 15 +++++++++++-- .../SKILL.md | 21 ++++++++++++------- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 81f50855..49745539 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -65,6 +65,15 @@ Sample rows alone don't tell you what to build. you can write aggregate SQL thro - **Trend viability** at daily/weekly/monthly grain β†’ picks the right trend granularity. - **Story confirmation** β€” run the aggregations you plan to put in the dashboard and check they're not flat, empty, or uninteresting. Fix the query or adjust the story before moving on. +Fan out independent probes (state ∈ `PENDING|RUNNING|SUCCEEDED|FAILED|CANCELED|CLOSED`): + +```bash +submit() { databricks api post /api/2.0/sql/statements --json "$(jq -nc --arg w "$1" --arg s "$2" '{warehouse_id:$w,statement:$s,wait_timeout:"0s",on_wait_timeout:"CONTINUE"}')" | jq -r .statement_id; } +SIDS=(); for q in "$@"; do SIDS+=( "$(submit "$WH" "$q")" ); done +for s in "${SIDS[@]}"; do databricks api get "/api/2.0/sql/statements/$s" | jq '{state:.status.state, rows:.result.data_array}'; done +# cancel: databricks api post "/api/2.0/sql/statements/$SID/cancel" +``` + > **Dashboard queries are different** β€” inside the dashboard JSON, the `FROM` clause must reference ONLY the table name, with no catalog or schema prefix: > - βœ… Correct: `FROM trips` > - ❌ Wrong: `FROM nyctaxi.trips` diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index d1b669f2..381d7c11 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -24,7 +24,16 @@ Use `discover-schema` as the default β€” one call returns columns, types, sample `databricks experimental aitools tools discover-schema catalog.schema.gold_sales catalog.schema.gold_customers` -For Genie, knowing column distribution shapes the sample questions and text instructions. Probe cardinality, ranges, and top categorical values with aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` so your sample questions reflect what's actually in the data. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. +For Genie, knowing column distribution shapes the sample questions and text instructions. If you don't already know the data, probe cardinality, ranges, and top categorical values with aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` so your sample questions reflect what's actually in the data. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. 
+ +Fan out independent probes (state ∈ `PENDING|RUNNING|SUCCEEDED|FAILED|CANCELED|CLOSED`): + +```bash +submit() { databricks api post /api/2.0/sql/statements --json "$(jq -nc --arg w "$1" --arg s "$2" '{warehouse_id:$w,statement:$s,wait_timeout:"0s",on_wait_timeout:"CONTINUE"}')" | jq -r .statement_id; } +SIDS=(); for q in "$@"; do SIDS+=( "$(submit "$WH" "$q")" ); done +for s in "${SIDS[@]}"; do databricks api get "/api/2.0/sql/statements/$s" | jq '{state:.status.state, rows:.result.data_array}'; done +# cancel: databricks api post "/api/2.0/sql/statements/$SID/cancel" +``` ### Step 2: Create the Space @@ -103,8 +112,10 @@ The `serialized_space` field is a JSON string containing the full space configur | `instructions.example_question_sqls[]` | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | | `instructions.text_instructions[]` | `{"id": "32hexchars", "content": ["..."]}` | -- **ID format:** 32-character lowercase hex UUID without hyphens. +- **ID format:** 32-character lowercase hex, unique across **all three lists combined** (a duplicate between e.g. `text_instructions` and `example_question_sqls` is rejected). - **Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. +- **Sort order matters:** `data_sources.tables` must be sorted by `identifier`; `example_question_sqls` and `text_instructions` must be sorted by `id`. (`sample_questions` is silently re-sorted server-side.) +- **Simple ID scheme that satisfies both rules:** prefix per list + monotonic counter, total 32 hex chars β€” `1…0001`, `1…0002` for `sample_questions`; `2…0001`, `2…0002` for `example_question_sqls`; `3…0001` for `text_instructions`. Authoring order = sort order, no collisions. ### Text Instructions diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index d4286902..5e065d52 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -281,16 +281,26 @@ After running a pipeline (via DAB or CLI), you **MUST** validate both the execut ### Step 1: Check Pipeline Execution Status -`list-pipeline-events` returns a bare JSON array (not `{"events": [...]}`). For DAB runs, also check `databricks bundle run` output; `pipelines get`/`list-pipeline-events` still apply. +A freshly created pipeline has `state: IDLE` and `latest_updates: null` until you trigger the first run with `start-update`. `list-pipeline-events` returns a bare JSON array (not `{"events": [...]}`). For DAB runs, also check `databricks bundle run` output. ```bash -databricks pipelines get +# Kick off (or re-run) a pipeline. --full-refresh reprocesses everything +# from scratch (destructive on streaming state); omit for incremental. +databricks pipelines start-update +databricks pipelines start-update --full-refresh + +# Poll status. The (.latest_updates // [{}]) guard handles the null case +# on a never-run pipeline so jq doesn't crash. 
+databricks pipelines get \ + | jq '{state, latest: (.latest_updates // [{}])[0] | {state, update_id, creation_time}}' -# Surface just failures +# Surface just failures from the event log databricks pipelines list-pipeline-events \ | jq '[.[] | select(.level=="ERROR" or .level=="WARN") | {level, event_type, message: (.message // "")[0:200]}] | .[0:10]' ``` +If a pipeline is already RUNNING, `start-update` queues the new update; force-stop with `databricks pipelines stop ` first if needed. + ### Updating a Pipeline (edit β†’ re-upload β†’ restart) Use `--format RAW --overwrite` β€” pipelines use raw `.sql`/`.py` FILE entries. `--format SOURCE --language SQL|PYTHON` uploads a workspace notebook instead and **notebooks are deprecated for pipelines**; mixing the two on the same path fails with `Cannot overwrite the asset ... due to type mismatch (asked: NOTEBOOK, actual: FILE)`. @@ -302,12 +312,9 @@ databricks workspace import /Workspace/Users//pipeline/07_gold.sql \ # Whole directory databricks workspace import-dir ./src/pipeline /Workspace/Users//pipeline --overwrite - -# Restart. --full-refresh reprocesses everything (destructive on streaming state); omit for incremental. -databricks pipelines start-update --full-refresh ``` -If pipeline is RUNNING, `start-update` queues the new update. Force-stop with `databricks pipelines stop ` first if needed. +After re-uploading, trigger a new run with `databricks pipelines start-update ` (see Step 1 above for the full polling pattern). ### Step 2: Validate Output Data From c84ee7ebd62e03185a38f9b8ee1d28fd3d9dab21 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Thu, 30 Apr 2026 14:07:33 -0700 Subject: [PATCH 59/66] Add MCP deprecation prompt to Windows installer (#506) Brings install.ps1 into parity with install.sh on the experimental branch: defaults InstallMcp to false, adds --mcp flag, adds Invoke-PromptMcpInstall with a radio select offering "Do not install" as the default. 
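A few illustrative invocations (assuming PowerShell 7's `pwsh` is on PATH; the exact command lines below are examples, not taken from the patch):

```bash
# Default: skills only; the new prompt pre-selects "Do not install" for the MCP server
pwsh -NoProfile -File ./install.ps1

# Opt back in to the deprecated MCP server, or point it at an existing install path
pwsh -NoProfile -File ./install.ps1 --mcp
pwsh -NoProfile -File ./install.ps1 --mcp-path C:/path/to/mcp-server
```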
Co-authored-by: Isaac --- install.ps1 | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/install.ps1 b/install.ps1 index a4648fa6..0011501c 100644 --- a/install.ps1 +++ b/install.ps1 @@ -64,7 +64,7 @@ $MinSdkVersion = "0.85.0" $script:Profile_ = "DEFAULT" $script:Scope = "project" $script:ScopeExplicit = $false # Track if --global was explicitly passed -$script:InstallMcp = $true +$script:InstallMcp = $false $script:InstallSkills = $true $script:Force = $false $script:Silent = $false @@ -212,8 +212,9 @@ while ($i -lt $args.Count) { { $_ -in "-p", "--profile" } { $script:Profile_ = $args[$i + 1]; $script:ProfileProvided = $true; $i += 2 } { $_ -in "-g", "--global", "-Global" } { $script:Scope = "global"; $script:ScopeExplicit = $true; $i++ } { $_ -in "--skills-only", "-SkillsOnly" } { $script:InstallMcp = $false; $i++ } + { $_ -in "--mcp", "-Mcp" } { $script:InstallMcp = $true; $i++ } { $_ -in "--mcp-only", "-McpOnly" } { $script:InstallSkills = $false; $i++ } - { $_ -in "--mcp-path", "-McpPath" } { $script:UserMcpPath = $args[$i + 1]; $i += 2 } + { $_ -in "--mcp-path", "-McpPath" } { $script:UserMcpPath = $args[$i + 1]; $script:InstallMcp = $true; $i += 2 } { $_ -in "--silent", "-Silent" } { $script:Silent = $true; $i++ } { $_ -in "--tools", "-Tools" } { $script:UserTools = $args[$i + 1]; $i += 2 } { $_ -in "--skills-profile", "-SkillsProfile" } { $script:SkillsProfile = $args[$i + 1]; $i += 2 } @@ -232,6 +233,7 @@ while ($i -lt $args.Count) { Write-Host " --skills-only Skip MCP server setup" Write-Host " --mcp-only Skip skills installation" Write-Host " --mcp-path PATH Path to MCP server installation" + Write-Host " --mcp Install deprecated MCP server (default: no)" Write-Host " --silent Silent mode (no output except errors)" Write-Host " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode" Write-Host " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" @@ -689,6 +691,28 @@ function Invoke-PromptMcpPath { $script:McpEntry = Join-Path $script:RepoDir "databricks-mcp-server\run_server.py" } +# ─── MCP install prompt ────────────────────────────────────── +function Invoke-PromptMcpInstall { + if ($script:InstallMcp) { return } + if ($script:Silent -or -not (Test-Interactive)) { return } + + Write-Host "" + Write-Host " Deprecated MCP Server" -ForegroundColor White + Write-Host " Skills now work via CLI for better performance. MCP server is optional for backwards compatibility." 
-ForegroundColor DarkGray + + $items = @( + @{ Label = "Do not install"; Value = "no"; Selected = $true; Hint = "Recommended - skills work without MCP" } + @{ Label = "Install MCP server"; Value = "yes"; Selected = $false; Hint = "Legacy - requires Python venv setup" } + ) + + $selected = Select-Radio -Items $items + + if ($selected -eq "yes") { + $script:InstallMcp = $true + Invoke-PromptMcpPath + } +} + # ─── Check prerequisites ───────────────────────────────────── function Test-Dependencies { # Git @@ -1942,9 +1966,9 @@ function Invoke-Main { } } - # MCP path + # MCP server + Invoke-PromptMcpInstall if ($script:InstallMcp) { - Invoke-PromptMcpPath Write-Ok "MCP path: $($script:InstallDir)" } From c91dd717de6df0760b7121dbe455ed45e73dfba1 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Thu, 30 Apr 2026 16:44:42 -0700 Subject: [PATCH 60/66] Fix missing experimental parameter in the windows install (#507) --- install.ps1 | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/install.ps1 b/install.ps1 index 0011501c..b755a7f6 100644 --- a/install.ps1 +++ b/install.ps1 @@ -76,6 +76,7 @@ $script:ProfileProvided = $false $script:SkillsProfile = "" $script:UserSkills = "" $script:ListSkills = $false +$script:Channel = if ($env:AIDEVKIT_CHANNEL) { $env:AIDEVKIT_CHANNEL } else { "stable" } # stable or experimental # Databricks skills (bundled in repo) $script:Skills = @( @@ -220,6 +221,7 @@ while ($i -lt $args.Count) { { $_ -in "--skills-profile", "-SkillsProfile" } { $script:SkillsProfile = $args[$i + 1]; $i += 2 } { $_ -in "--skills", "-Skills" } { $script:UserSkills = $args[$i + 1]; $i += 2 } { $_ -in "--list-skills", "-ListSkills" } { $script:ListSkills = $true; $i++ } + { $_ -in "--experimental", "-Experimental" } { $script:Channel = "experimental"; $i++ } { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $i++ } { $_ -in "-h", "--help", "-Help" } { Write-Host "Databricks AI Dev Kit Installer (Windows)" @@ -239,12 +241,14 @@ while ($i -lt $args.Count) { Write-Host " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" Write-Host " --skills LIST Comma-separated skill names to install (overrides profile)" Write-Host " --list-skills List available skills and profiles, then exit" + Write-Host " --experimental Install from experimental branch (early access features)" Write-Host " -f, --force Force reinstall" Write-Host " -h, --help Show this help" Write-Host "" Write-Host "Environment Variables:" Write-Host " AIDEVKIT_BRANCH Branch or tag to install (default: latest release)" Write-Host " AIDEVKIT_HOME Installation directory (default: ~/.ai-dev-kit)" + Write-Host " AIDEVKIT_CHANNEL 'stable' (default) or 'experimental'" Write-Host "" Write-Host "Examples:" Write-Host " # Basic installation" @@ -1743,7 +1747,19 @@ function Show-Summary { Write-Msg "Location: $($script:InstallDir)" Write-Msg "Scope: $($script:Scope)" Write-Msg "Tools: $(($script:Tools -split ' ') -join ', ')" + if ($script:Channel -eq "experimental") { + Write-Msg "Channel: experimental πŸ§ͺ" + } Write-Host "" + if ($script:Channel -eq "experimental") { + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host " πŸ§ͺ You're using the experimental channel" -ForegroundColor White + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host "" + Write-Msg "Thank 
you for testing early features! Your feedback helps us improve." + Write-Msg "Report issues: https://github.com/databricks-solutions/ai-dev-kit/issues" + Write-Host "" + } Write-Msg "Next steps:" $step = 1 if ($script:Tools -match 'cursor') { @@ -1917,6 +1933,71 @@ function Invoke-PromptAuth { } } +# ─── Release channel prompt ─────────────────────────────────── +function Invoke-PromptChannel { + # Skip if already set via --experimental flag or env var + if ($script:Channel -eq "experimental") { return } + + # Skip in silent mode or non-interactive + if ($script:Silent) { return } + if (-not (Test-Interactive)) { return } + + Write-Host "" + Write-Host " Select release channel" -ForegroundColor White + + $items = @( + @{ Label = "Stable"; Value = "stable"; Selected = $true; Hint = "Latest stable release (recommended)" } + @{ Label = "Experimental"; Value = "experimental"; Selected = $false; Hint = "Early access to new features -- help us test!" } + ) + + $script:Channel = Select-Radio -Items $items + + # If experimental was selected, re-download and re-exec from experimental branch + if ($script:Channel -eq "experimental") { + Write-Host "" + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host " πŸ§ͺ Experimental Channel" -ForegroundColor White + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host "" + Write-Host " You're about to install the " -NoNewline + Write-Host "experimental" -ForegroundColor White -NoNewline + Write-Host " version of AI Dev Kit." + Write-Host " This includes early access features that may change or break." + Write-Host "" + Write-Host " We'd love your feedback!" -ForegroundColor White + Write-Host " Report issues: https://github.com/databricks-solutions/ai-dev-kit/issues" -ForegroundColor Blue + Write-Host " Discussions: https://github.com/databricks-solutions/ai-dev-kit/discussions" -ForegroundColor Blue + Write-Host "" + Write-Host " Downloading installer from experimental branch..." 
-ForegroundColor DarkGray + Write-Host "" + + # Build argument list preserving current flags + $newArgs = @("--experimental") + if ($script:Force) { $newArgs += "--force" } + if ($script:UserTools) { $newArgs += "--tools"; $newArgs += $script:UserTools } + if ($script:UserMcpPath) { $newArgs += "--mcp-path"; $newArgs += $script:UserMcpPath } + if ($script:SkillsProfile) { $newArgs += "--skills-profile"; $newArgs += $script:SkillsProfile } + if ($script:UserSkills) { $newArgs += "--skills"; $newArgs += $script:UserSkills } + if ($script:ScopeExplicit -and $script:Scope -eq "global") { $newArgs += "--global" } + if ($script:Profile_ -ne "DEFAULT") { $newArgs += "--profile"; $newArgs += $script:Profile_ } + if ($script:InstallMcp) { $newArgs += "--mcp" } + if (-not $script:InstallSkills) { $newArgs += "--mcp-only" } + + # Download experimental installer to a temp file and execute + $expUrl = "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.ps1" + $tempScript = Join-Path $env:TEMP "ai-dev-kit-install-experimental.ps1" + try { + Invoke-WebRequest -Uri $expUrl -OutFile $tempScript -UseBasicParsing -ErrorAction Stop + } catch { + Write-Err "Failed to download experimental installer from ${expUrl}: $($_.Exception.Message)" + } + + # Execute the experimental installer with preserved args, then exit + & $tempScript @newArgs + exit $LASTEXITCODE + } +} + # ─── Main ───────────────────────────────────────────────────── function Invoke-Main { if (-not $script:Silent) { @@ -1925,6 +2006,9 @@ function Invoke-Main { Write-Host "--------------------------------" } + # ── Step 1: Release channel selection (may re-exec from experimental branch) ── + Invoke-PromptChannel + # Check dependencies Write-Step "Checking prerequisites" Test-Dependencies @@ -1977,6 +2061,9 @@ function Invoke-Main { Write-Host "" Write-Host " Summary" -ForegroundColor White Write-Host " ------------------------------------" + if ($script:Channel -eq "experimental") { + Write-Host " Channel: " -NoNewline; Write-Host "experimental πŸ§ͺ" -ForegroundColor Yellow + } Write-Host " Tools: " -NoNewline; Write-Host "$(($script:Tools -split ' ') -join ', ')" -ForegroundColor Green Write-Host " Profile: " -NoNewline; Write-Host $script:Profile_ -ForegroundColor Green Write-Host " Scope: " -NoNewline; Write-Host $script:Scope -ForegroundColor Green From 1e1cc118b9609a56bd118608ff8e32878641cae8 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Thu, 30 Apr 2026 21:31:14 -0700 Subject: [PATCH 61/66] Fix experimental channel to install skills from experimental branch (#508) The --experimental flag previously only changed the channel display in the UI; it did not affect which branch the repo (and thus skills) was cloned from. Skills like databricks-dbsql ended up coming from the latest release tag instead of experimental. Now, when --experimental is selected (or AIDEVKIT_CHANNEL=experimental), $Branch / $BRANCH defaults to "experimental" unless the user explicitly passed -b/--branch or set DEVKIT_BRANCH/AIDEVKIT_BRANCH. Also adds the missing -b/--branch flag to install.ps1 (install.sh already had it) so it can be used as an explicit override on Windows. 
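Resolution order after this change, shown as illustrative install.sh invocations (the commands are examples for clarity, not taken from the patch):

```bash
# Stable default: BRANCH resolves to the latest release tag via the GitHub releases API
./install.sh

# Experimental channel with no explicit branch: BRANCH now falls back to "experimental"
./install.sh --experimental

# An explicitly requested branch always wins, whether passed as a flag or an env var
./install.sh --experimental --branch my-feature
DEVKIT_BRANCH=my-feature ./install.sh --experimental
```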
--- install.ps1 | 16 +++++++++++++++- install.sh | 11 ++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/install.ps1 b/install.ps1 index b755a7f6..780b7255 100644 --- a/install.ps1 +++ b/install.ps1 @@ -36,8 +36,10 @@ $Owner = "databricks-solutions" $Repo = "ai-dev-kit" # Determine branch/tag to use +$script:BranchExplicit = $false if ($env:AIDEVKIT_BRANCH) { $Branch = $env:AIDEVKIT_BRANCH + $script:BranchExplicit = $true } else { try { $latestReleaseUri = "https://api.github.com/repos/$Owner/$Repo/releases/latest" @@ -49,7 +51,7 @@ if ($env:AIDEVKIT_BRANCH) { } $RepoUrl = "https://github.com/$Owner/$Repo.git" -$RawUrl = "https://raw.githubusercontent.com/$Owner/$Repo/$Branch" +# $RawUrl is set after argument parsing so --branch / --experimental can affect it $InstallDir = if ($env:AIDEVKIT_HOME) { $env:AIDEVKIT_HOME } else { Join-Path $env:USERPROFILE ".ai-dev-kit" } $RepoDir = Join-Path $InstallDir "repo" $VenvDir = Join-Path $InstallDir ".venv" @@ -222,6 +224,7 @@ while ($i -lt $args.Count) { { $_ -in "--skills", "-Skills" } { $script:UserSkills = $args[$i + 1]; $i += 2 } { $_ -in "--list-skills", "-ListSkills" } { $script:ListSkills = $true; $i++ } { $_ -in "--experimental", "-Experimental" } { $script:Channel = "experimental"; $i++ } + { $_ -in "-b", "--branch", "-Branch" } { $Branch = $args[$i + 1]; $script:BranchExplicit = $true; $i += 2 } { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $i++ } { $_ -in "-h", "--help", "-Help" } { Write-Host "Databricks AI Dev Kit Installer (Windows)" @@ -242,6 +245,7 @@ while ($i -lt $args.Count) { Write-Host " --skills LIST Comma-separated skill names to install (overrides profile)" Write-Host " --list-skills List available skills and profiles, then exit" Write-Host " --experimental Install from experimental branch (early access features)" + Write-Host " -b, --branch NAME Git branch/tag to install (default: latest release)" Write-Host " -f, --force Force reinstall" Write-Host " -h, --help Show this help" Write-Host "" @@ -266,6 +270,15 @@ while ($i -lt $args.Count) { } } +# If experimental channel is selected and branch wasn't explicitly overridden, +# install skills from the experimental branch instead of the latest release. 
+if ($script:Channel -eq "experimental" -and -not $script:BranchExplicit) { + $Branch = "experimental" +} + +# Set raw URL after branch resolution +$RawUrl = "https://raw.githubusercontent.com/$Owner/$Repo/$Branch" + # ─── Interactive helpers ────────────────────────────────────── function Test-Interactive { @@ -1982,6 +1995,7 @@ function Invoke-PromptChannel { if ($script:Profile_ -ne "DEFAULT") { $newArgs += "--profile"; $newArgs += $script:Profile_ } if ($script:InstallMcp) { $newArgs += "--mcp" } if (-not $script:InstallSkills) { $newArgs += "--mcp-only" } + if ($script:BranchExplicit) { $newArgs += "--branch"; $newArgs += $Branch } # Download experimental installer to a temp file and execute $expUrl = "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.ps1" diff --git a/install.sh b/install.sh index 98e1aa01..9fbb6806 100644 --- a/install.sh +++ b/install.sh @@ -65,8 +65,10 @@ CHANNEL="${DEVKIT_CHANNEL:-stable}" # stable or experimental OWNER="databricks-solutions" REPO="ai-dev-kit" +BRANCH_EXPLICIT=false if [ -n "${DEVKIT_BRANCH:-}" ]; then BRANCH="$DEVKIT_BRANCH" + BRANCH_EXPLICIT=true else BRANCH="$( curl -s "https://api.github.com/repos/${OWNER}/${REPO}/releases/latest" \ @@ -136,7 +138,7 @@ while [ $# -gt 0 ]; do case $1 in -p|--profile) PROFILE="$2"; shift 2 ;; -g|--global) SCOPE="global"; SCOPE_EXPLICIT=true; shift ;; - -b|--branch) BRANCH="$2"; shift 2 ;; + -b|--branch) BRANCH="$2"; BRANCH_EXPLICIT=true; shift 2 ;; --skills-only) INSTALL_MCP=false; shift ;; --mcp-only) INSTALL_SKILLS=false; shift ;; --mcp-path) USER_MCP_PATH="$2"; MCP_INSTALL_PATH="$2"; INSTALL_MCP=true; shift 2 ;; @@ -259,6 +261,12 @@ if [ "${LIST_SKILLS:-false}" = true ]; then exit 0 fi +# If experimental channel is selected and branch wasn't explicitly overridden, +# install skills from the experimental branch instead of the latest release. 
+if [ "$CHANNEL" = "experimental" ] && [ "$BRANCH_EXPLICIT" != true ]; then + BRANCH="experimental" +fi + # Set configuration URLs after parsing branch argument REPO_URL="https://github.com/databricks-solutions/ai-dev-kit.git" RAW_URL="https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/${BRANCH}" @@ -1966,6 +1974,7 @@ prompt_channel() { [ "$PROFILE" != "DEFAULT" ] && args="$args --profile $PROFILE" [ "$INSTALL_MCP" = false ] && args="$args --skills-only" [ "$INSTALL_SKILLS" = false ] && args="$args --mcp-only" + [ "$BRANCH_EXPLICIT" = true ] && args="$args --branch $BRANCH" # Download and execute the experimental installer exec bash <(curl -fsSL "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.sh") $args From eb48a4c49945a3d415434461d64588a54e6b82a7 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Wed, 6 May 2026 00:41:10 -0700 Subject: [PATCH 62/66] =?UTF-8?q?Update=20lakebase-autoscale=20skill=20wit?= =?UTF-8?q?h=20revised=20CLI=20commands=20and=20new=20fea=E2=80=A6=20(#497?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update lakebase-autoscale skill with revised CLI commands and new features - Fix autoscaling spread constraint: 8 CU β†’ 16 CU across SKILL.md and computes.md - Fix scale-to-zero wake-up latency: few hundred ms β†’ ~100ms - Update token refresh guidance: 50 min β†’ 30-40 min - Move synced-table CLI from `databricks database` to `databricks postgres` group (v0.294.0+) - Update SDK module from `databricks.sdk.service.database` to `databricks.sdk.service.postgres` - Correct reverse-ETL throughput figures: snapshot 2k rows/s/CU, incremental 150 rows/s/CU - Add High Availability section (secondaries vs read replicas, HA constraints) - Add Data API section (PostgREST-compatible HTTP CRUD, Autoscaling-only) - Add Lakehouse Sync Beta section (Postgres β†’ UC Delta, AWS only) - Add `databricks apps init --features lakebase` command and `list-endpoints` command Co-authored-by: Isaac * Move lakebase-autoscale reference docs into references/ subdirectory Reorganizes branches.md, computes.md, connection-patterns.md, projects.md, and reverse-etl.md into a references/ subfolder. Updates all links in SKILL.md (references/foo.md) and back-links in each reference file (../SKILL.md). Also corrects token refresh guidance to 45 min per official Databricks docs (docs.databricks.com/aws/en/oltp/projects/external-apps-connect). Co-authored-by: Isaac * Add ground truth test cases 007-009 to lakebase-autoscale skill Ports three hard-difficulty interactive test cases from ai-dev-kit-lakebase_updates: - 007: Full project setup (create project, autoscaling, branch protection, dev branch, connectivity, database) - 008: Schema DDL (4-table support schema with FKs, CHECK constraints, indexes) - 009: Extended DDL (support_cases, case_products, case_notes with uv/pip install) Fixes token refresh guidance in 007 response from ~50 min to ~45 min. Co-authored-by: Isaac * Align lakebase-autoscale ground truth to CLI-first approach and fix ep-primary Follow-up to the CLI-first rewrite in this PR. Three fixes that were blocking accurate eval scoring: 1. ground_truth.yaml β€” replaced all SDK expected_facts/patterns with CLI equivalents for tasks 001, 002, 004, 005, 007 (management-plane tasks). Connection and DDL facts in tasks 003, 008, 009 made approach-agnostic. Proxy eval confirmed: 0.216 β†’ 0.609 (+0.393). 
   Tasks 002/003/005 all hit 0.825 after the fix; task 006 (cli_reference) is
   residual work for a follow-on PR.

2. SKILL.md -- added psycopg3 connection snippet inline to the Credentials
   section. Moving it to references/connection-patterns.md caused a regression
   where the surrogate LLM defaulted to import psycopg2. Reference files are
   not loaded at eval time (evaluator reads only SKILL.md); guidance must be
   inline to be effective.

3. references/computes.md line 7 + ground_truth.yaml tasks 007/008/009 --
   ep-primary -> primary. The wrong endpoint name was the direct cause of the
   task 009 floor (score 0.000 during agent-eval; agent used the wrong path
   and the connection failed).

Also adds psycopg[binary] to .test/pyproject.toml so pre-validation passes for
tasks 003/007/008/009 (which import psycopg in their reference responses), and
fixes agent executor to skip empty env var values so Claude Code falls back to
keychain auth correctly.

Co-authored-by: Isaac

* Fix task 006 ground truth: remove --project-id flag, expand CLI reference

Task 006 (cli_reference) was scoring 0.000 on all three judge dimensions
(correctness, completeness, guideline adherence). Root cause: the reference
response used '--project-id my-app' which is invalid syntax -- the project ID
is a positional argument to create-project, not a flag.

Changes:
- Fix create-project syntax: positional argument, not --project-id flag
- Add update-endpoint example with correct positional field-mask syntax
- Add generate-database-credential example (credentials are a core CLI op)
- Replace 'no_expiry: true' example with 'ttl: 604800s' for consistency with
  the rest of the ground truth
- Expand expected_facts to assert positional arg and field-mask patterns
- Add expected_patterns for endpoint and credential commands
- Update guideline: 5 subcommands (was 4), explicitly forbid --project-id flag

Co-authored-by: Isaac

---------

Co-authored-by: cankoklu-db
---
 .test/pyproject.toml                          |   2 +
 .../ground_truth.yaml                         | 843 +++++++++++++++---
 .test/src/skill_test/agent/executor.py        |   4 +-
 .test/uv.lock                                 | 401 +++++----
 .../databricks-lakebase-autoscale/SKILL.md    |  73 +-
 .../{ => references}/branches.md              |   2 +-
 .../{ => references}/computes.md              |  28 +-
 .../{ => references}/connection-patterns.md   |  46 +-
 .../{ => references}/projects.md              |   2 +-
 .../{ => references}/reverse-etl.md           |  32 +-
 10 files changed, 1080 insertions(+), 353 deletions(-)
 rename databricks-skills/databricks-lakebase-autoscale/{ => references}/branches.md (98%)
 rename databricks-skills/databricks-lakebase-autoscale/{ => references}/computes.md (77%)
 rename databricks-skills/databricks-lakebase-autoscale/{ => references}/connection-patterns.md (86%)
 rename databricks-skills/databricks-lakebase-autoscale/{ => references}/projects.md (98%)
 rename databricks-skills/databricks-lakebase-autoscale/{ => references}/reverse-etl.md (81%)

diff --git a/.test/pyproject.toml b/.test/pyproject.toml
index 73461d8e..af4e838a 100644
--- a/.test/pyproject.toml
+++ b/.test/pyproject.toml
@@ -13,6 +13,8 @@ dependencies = [
     "mlflow[databricks]>=3.10.1",
     "protobuf>=5.26",
     "python-dotenv>=1.0.0",
+    "claude-agent-sdk>=0.1.39",
+    "gepa>=0.1.0",
 ]
 
 [project.optional-dependencies]

diff --git a/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml
index 9832a09c..f1342dac 100644
--- a/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml
+++ b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml
@@ -6,50 +6,37 @@ metadata:
 test_cases:
 - 
id: lakebase_auto_create_project_001 inputs: - prompt: "How do I create a new Lakebase Autoscale project using the Python SDK?" + prompt: "How do I create a new Lakebase Autoscale project?" outputs: response: | - ```python - from databricks.sdk import WorkspaceClient - from databricks.sdk.service.postgres import Project, ProjectSpec - - w = WorkspaceClient() - - # Create a project (long-running operation) - operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" - ) - result = operation.wait() - print(f"Created project: {result.name}") + ```bash + databricks postgres create-project my-app \ + --json '{"spec": {"display_name": "My Application", "pg_version": "17"}}' ``` + + The CLI blocks until the long-running operation completes and returns the final resource. execution_success: true expectations: expected_facts: - - "Projects are created with w.postgres.create_project()" - - "create_project returns a long-running operation that requires .wait()" - - "ProjectSpec requires display_name and pg_version" + - "Projects are created with 'databricks postgres create-project'" + - "Project spec is passed as --json with display_name and pg_version" + - "The CLI blocks until the long-running operation completes" expected_patterns: - - pattern: "create_project" + - pattern: "databricks postgres create-project" min_count: 1 - description: "Uses create_project() method" - - pattern: "ProjectSpec" + description: "Uses CLI create-project command" + - pattern: "display_name" min_count: 1 - description: "Uses ProjectSpec for configuration" - - pattern: "\\.wait\\(\\)" + description: "Specifies project display name in spec" + - pattern: "pg_version" min_count: 1 - description: "Waits for the long-running operation to complete" - - pattern: "WorkspaceClient" + description: "Specifies Postgres version" + - pattern: "--json" min_count: 1 - description: "Uses WorkspaceClient for SDK access" + description: "Passes spec as JSON" guidelines: - - "Must show the long-running operation pattern with .wait()" - - "Must include ProjectSpec with display_name and pg_version" + - "Must use databricks postgres create-project with --json for the spec" + - "Must include display_name and pg_version in the spec" metadata: category: happy_path difficulty: easy @@ -60,44 +47,32 @@ test_cases: prompt: "How do I create a development branch in Lakebase Autoscale with an expiration?" outputs: response: | - ```python - from databricks.sdk.service.postgres import Branch, BranchSpec, Duration - - # Create a dev branch with 7-day expiration - branch = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" - ).wait() - print(f"Branch created: {branch.name}") + ```bash + databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' ``` Branches are copy-on-write snapshots of the production branch, ideal for development and testing. 
execution_success: true expectations: expected_facts: - - "Branches are created with w.postgres.create_branch()" - - "BranchSpec takes a source_branch and optional ttl for expiration" - - "Duration(seconds=604800) sets a 7-day TTL" + - "Branches are created with 'databricks postgres create-branch'" + - "Branch spec is passed as --json with source_branch and ttl in seconds" + - "TTL of 604800s sets a 7-day expiration" - "Branches use copy-on-write from the source branch" expected_patterns: - - pattern: "create_branch" - min_count: 1 - description: "Uses create_branch() method" - - pattern: "BranchSpec" - min_count: 1 - description: "Uses BranchSpec for branch configuration" - - pattern: "Duration" + - pattern: "databricks postgres create-branch" min_count: 1 - description: "Uses Duration for TTL specification" + description: "Uses CLI create-branch command" - pattern: "source_branch" min_count: 1 description: "Specifies the parent branch to fork from" + - pattern: "604800s" + min_count: 1 + description: "Sets 7-day TTL in seconds" + - pattern: "copy-on-write" + min_count: 1 + description: "Explains branch isolation model" guidelines: - "Must include TTL/expiration on the development branch" - "Must explain that branches are copy-on-write snapshots" @@ -119,13 +94,13 @@ test_cases: # Get endpoint details endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary" + name="projects/my-app/branches/production/endpoints/primary" ) host = endpoint.status.hosts.host # Generate OAuth token (scoped to endpoint) cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" + endpoint="projects/my-app/branches/production/endpoints/primary" ) # Connect using psycopg3 @@ -152,15 +127,15 @@ test_cases: - pattern: "psycopg\\.connect" min_count: 1 description: "Connects using psycopg3" - - pattern: "generate_database_credential" + - pattern: "generate.database.credential" min_count: 1 - description: "Generates OAuth token for authentication" + description: "Generates OAuth token for authentication (CLI or SDK)" - pattern: "sslmode=require" min_count: 1 description: "Requires SSL for the connection" - - pattern: "get_endpoint" + - pattern: "get.endpoint" min_count: 1 - description: "Retrieves endpoint details" + description: "Retrieves endpoint details (CLI or SDK)" guidelines: - "Must use psycopg (not psycopg2) for the connection" - "Must generate OAuth token and use it as password" @@ -182,7 +157,7 @@ test_cases: # Generate database credential (optionally scoped to an endpoint) cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" + endpoint="projects/my-app/branches/production/endpoints/primary" ) token = cred.token # Use as password in connection string # Token expires after 1 hour @@ -192,17 +167,17 @@ test_cases: execution_success: true expectations: expected_facts: - - "OAuth tokens are generated via w.postgres.generate_database_credential()" + - "OAuth tokens are generated with 'databricks postgres generate-database-credential'" - "Tokens expire after 1 hour" - "Tokens can be scoped to a specific endpoint" - "The token is used as the password in PostgreSQL connection strings" expected_patterns: - - pattern: "generate_database_credential" + - pattern: "generate.database.credential" min_count: 1 - description: "Uses generate_database_credential() method" - - pattern: "cred\\.token" + description: "Uses generate-database-credential (CLI or 
SDK)" + - pattern: "token" min_count: 1 - description: "Accesses the token from the credential response" + description: "References the token field from the credential response" guidelines: - "Must mention the 1-hour token expiration" - "Must show that the token is used as the connection password" @@ -216,50 +191,37 @@ test_cases: prompt: "How do I resize the compute capacity for a Lakebase Autoscale endpoint?" outputs: response: | - ```python - from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask - - # Update compute to autoscale between 2-8 CU - w.postgres.update_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - endpoint=Endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - autoscaling_limit_max_cu=8.0 - ) - ), - update_mask=FieldMask(field_mask=[ - "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) - ).wait() + ```bash + databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' ``` Compute Units (CU) scale automatically within the min/max range based on workload. execution_success: true expectations: expected_facts: - - "Compute is resized with w.postgres.update_endpoint()" + - "Compute is resized with 'databricks postgres update-endpoint'" - "Autoscaling is configured with min and max CU limits" - - "FieldMask specifies which fields to update" - - "The operation is long-running and requires .wait()" + - "The update mask is passed as a positional argument" + - "The CLI blocks until the update completes" expected_patterns: - - pattern: "update_endpoint" + - pattern: "databricks postgres update-endpoint" min_count: 1 - description: "Uses update_endpoint() to resize" + description: "Uses CLI update-endpoint command" - pattern: "autoscaling_limit_min_cu" min_count: 1 description: "Sets minimum autoscaling CU" - pattern: "autoscaling_limit_max_cu" min_count: 1 description: "Sets maximum autoscaling CU" - - pattern: "FieldMask" + - pattern: "spec\\.autoscaling" min_count: 1 - description: "Uses FieldMask for partial update" + description: "Positional mask specifies autoscaling fields" guidelines: - "Must include both min and max CU configuration" - - "Must use FieldMask to specify updated fields" + - "Must pass update mask as positional argument to update-endpoint" metadata: category: happy_path difficulty: medium @@ -271,33 +233,42 @@ test_cases: outputs: response: | ```bash - # Create a project - databricks postgres create-project \ - --project-id my-app \ + # Create a project (PROJECT_ID is a positional argument) + databricks postgres create-project my-app \ --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' - # List projects + # List / get projects databricks postgres list-projects - - # Get project details databricks postgres get-project projects/my-app - # Create a branch + # Create a branch (TTL in seconds; use no_expiry: true for permanent) databricks postgres create-branch projects/my-app development \ - --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' # List branches databricks postgres list-branches projects/my-app - # Delete a project + # Resize endpoint compute + databricks postgres 
update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' + + # Generate OAuth credential for connecting + databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary + + # Delete a project (irreversible) databricks postgres delete-project projects/my-app ``` execution_success: true expectations: expected_facts: - - "Lakebase Autoscale CLI commands are under databricks postgres" - - "Project creation uses --json for the spec configuration" - - "Resource names follow the pattern projects/{id}/branches/{id}/endpoints/{id}" + - "All Lakebase Autoscale CLI commands are under 'databricks postgres'" + - "The project ID is a positional argument to create-project, not a flag" + - "Spec is passed as --json for create and update commands" + - "Resource names follow the hierarchical path: projects/{id}/branches/{id}/endpoints/{id}" + - "update-endpoint takes the field mask as a positional argument before --json" expected_patterns: - pattern: "databricks postgres create-project" min_count: 1 @@ -308,10 +279,662 @@ test_cases: - pattern: "databricks postgres create-branch" min_count: 1 description: "Shows branch creation command" + - pattern: "databricks postgres.*endpoint" + min_count: 1 + description: "Shows endpoint management command" + - pattern: "generate-database-credential" + min_count: 1 + description: "Shows credential generation for connecting" guidelines: - - "Must show at least 4 different CLI subcommands" - - "Must include both project and branch management commands" + - "Must show at least 5 different CLI subcommands covering projects, branches, endpoints, and credentials" + - "Must not use --project-id flag; project ID is always a positional argument" + - "Must show the correct positional field-mask syntax for update-endpoint" metadata: category: happy_path difficulty: easy source: auto_generated + +- id: lakebase_auto_full_project_setup_007 + inputs: + prompt: | + Create a new Lakebase Autoscaling project for a retail customer support system. 
+ - Project name: `support-autoscale` + - Display name: `Support Autoscale` + - PG version 17 + - Set autoscaling compute: min 0 CU, max 2 CU with scale-to-zero enabled (5 minute suspend timeout) + - Create a `production` branch (default, protected) with a read-write endpoint + - Create a `dev` branch from production with a 7-day TTL expiration + - Generate an OAuth credential and verify connectivity to the production endpoint by running `SELECT version()` + - Create a database named `support_db` + outputs: + response: | + Here's how to set up a complete Lakebase Autoscaling project for a retail customer support system: + + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.postgres import ( + Project, ProjectSpec, Branch, BranchSpec, + Endpoint, EndpointSpec, FieldMask, Duration + ) + import psycopg + + w = WorkspaceClient() + + # Step 1: Create the project (production branch + default endpoint created automatically) + project = w.postgres.create_project( + project=Project( + spec=ProjectSpec( + display_name="Support Autoscale", + pg_version="17" + ) + ), + project_id="support-autoscale" + ).wait() + print(f"Created project: {project.name}") + + # Step 2: Configure autoscaling on the production endpoint (0-2 CU, scale-to-zero 5 min) + w.postgres.update_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary", + endpoint=Endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary", + spec=EndpointSpec( + autoscaling_limit_min_cu=0, + autoscaling_limit_max_cu=2.0, + suspend_timeout_seconds=300 + ) + ), + update_mask=FieldMask(field_mask=[ + "spec.autoscaling_limit_min_cu", + "spec.autoscaling_limit_max_cu", + "spec.suspend_timeout_seconds" + ]) + ).wait() + print("Configured autoscaling: 0-2 CU with 5-minute scale-to-zero") + + # Step 3: Protect the production branch + w.postgres.update_branch( + name="projects/support-autoscale/branches/production", + branch=Branch( + name="projects/support-autoscale/branches/production", + spec=BranchSpec(is_protected=True) + ), + update_mask=FieldMask(field_mask=["spec.is_protected"]) + ).wait() + print("Production branch is now protected") + + # Step 4: Create dev branch from production with 7-day TTL + dev_branch = w.postgres.create_branch( + parent="projects/support-autoscale", + branch=Branch( + spec=BranchSpec( + source_branch="projects/support-autoscale/branches/production", + ttl=Duration(seconds=604800) # 7 days + ) + ), + branch_id="dev" + ).wait() + print(f"Created dev branch: {dev_branch.name} (expires in 7 days)") + + # Step 5: Generate OAuth credential and verify connectivity + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + conn_string = ( + f"host={host} " + f"dbname=databricks_postgres " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + with psycopg.connect(conn_string) as conn: + with conn.cursor() as cur: + cur.execute("SELECT version()") + print(f"Connected! 
PostgreSQL version: {cur.fetchone()[0]}") + + # Step 6: Create the support_db database + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("CREATE DATABASE support_db") + print("Created database: support_db") + ``` + + **Key points:** + + - The `production` branch and a primary read-write endpoint named `primary` are created automatically when you create a project. + - `update_endpoint` with `FieldMask` configures autoscaling limits (0-2 CU) and the scale-to-zero suspend timeout (300 seconds = 5 minutes). + - `update_branch` with `is_protected=True` prevents accidental deletion or reset of the production branch. + - The `dev` branch uses copy-on-write from production, so it's storage-efficient. The 7-day TTL (`Duration(seconds=604800)`) auto-deletes it after expiration. + - OAuth tokens expire after 1 hour; for production apps, implement token refresh at ~45 minutes. + - `CREATE DATABASE` is standard PostgreSQL DDL executed over the psycopg connection. The connection must have `autocommit = True` since `CREATE DATABASE` cannot run inside a transaction. + - All SDK create/update/delete operations are long-running and require `.wait()` to block until completion. + - Always use `sslmode=require` in connection strings. + execution_success: true + expectations: + expected_facts: + - "Project is created with 'databricks postgres create-project'" + - "The CLI blocks until project creation completes" + - "Production branch and a read-write endpoint named 'primary' are created automatically with the project" + - "Autoscaling is configured via 'databricks postgres update-endpoint' with positional mask" + - "Scale-to-zero is set via suspend_timeout_seconds (300 = 5 minutes)" + - "Production branch is protected using 'databricks postgres update-branch'" + - "Dev branch is created with 'databricks postgres create-branch' using 7-day TTL (604800s)" + - "OAuth tokens are generated via generate_database_credential()" + - "Connectivity is verified by running SELECT version() over psycopg" + - "CREATE DATABASE requires autocommit=True on the psycopg connection" + - "All connection strings must include sslmode=require" + expected_patterns: + - pattern: "databricks postgres create-project" + min_count: 1 + description: "Uses CLI create-project command" + - pattern: "databricks postgres update-endpoint" + min_count: 1 + description: "Uses CLI update-endpoint to configure autoscaling" + - pattern: "autoscaling_limit_min_cu" + min_count: 1 + description: "Sets minimum autoscaling CU" + - pattern: "autoscaling_limit_max_cu" + min_count: 1 + description: "Sets maximum autoscaling CU" + - pattern: "is_protected" + min_count: 1 + description: "Protects the production branch" + - pattern: "databricks postgres create-branch" + min_count: 1 + description: "Uses CLI create-branch for the dev branch" + - pattern: "604800s" + min_count: 1 + description: "Sets 7-day TTL on dev branch" + - pattern: "CREATE DATABASE support_db" + min_count: 1 + description: "Creates the support_db database" + guidelines: + - "Must create the project with display_name='Support Autoscale' and pg_version='17'" + - "Must configure autoscaling with min 0 CU and max 2.0 CU" + - "Must enable scale-to-zero with a 5-minute (300 second) timeout" + - "Must protect the production branch using update-branch" + - "Must create a dev branch from production with 7-day TTL expiration" + - "Must create a database named support_db using standard Postgres DDL" + - "Must include sslmode=require in 
connection strings" + - "Must use positional mask argument in update-endpoint and update-branch" + metadata: + category: happy_path + difficulty: hard + source: interactive_add + +- id: lakebase_auto_schema_ddl_008 + inputs: + prompt: | + Connect to the `support_db` database on the `production` branch of the `support-autoscale` Lakebase project. + Create a `support` schema and build the following tables with appropriate constraints, indexes, and foreign keys: + + **products** - Product catalog for referencing in support cases + - product_id (PK, serial), sku (unique, varchar 50), name (varchar 200), category (varchar 100), price (numeric 10,2), is_active (boolean default true), created_at (timestamptz default now) + + **customers** - Customer records + - customer_id (PK, serial), email (unique, varchar 255), first_name (varchar 100), last_name (varchar 100), membership_tier (varchar 20, check in 'Free','Pro','Enterprise'), region (varchar 50), created_at (timestamptz default now) + + **orders** - Order history tied to customers + - order_id (PK, serial), customer_id (FK to customers), order_date (timestamptz default now), total_amount (numeric 12,2), status (varchar 20, check in 'pending','processing','shipped','delivered','cancelled','returned'), shipping_address (text), created_at (timestamptz default now) + + **order_items** - Line items tying orders to products + - order_item_id (PK, serial), order_id (FK to orders ON DELETE CASCADE), product_id (FK to products), quantity (integer, check > 0), unit_price (numeric 10,2), created_at (timestamptz default now) + + Add indexes on: customers(email), orders(customer_id, order_date), order_items(order_id), order_items(product_id), orders(status). + + Verify the schema was created correctly by querying information_schema. 
+ outputs: + response: | + Here's how to connect to the Lakebase Autoscale project and build the support schema: + + ```python + import psycopg + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Get production endpoint for the support-autoscale project + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + # Generate OAuth credential scoped to this endpoint + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + # Connect to support_db on the production branch + conn_string = ( + f"host={host} " + f"dbname=support_db " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("CREATE SCHEMA IF NOT EXISTS support") + + cur.execute(""" + CREATE TABLE support.products ( + product_id SERIAL PRIMARY KEY, + sku VARCHAR(50) UNIQUE NOT NULL, + name VARCHAR(200) NOT NULL, + category VARCHAR(100), + price NUMERIC(10, 2), + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.customers ( + customer_id SERIAL PRIMARY KEY, + email VARCHAR(255) UNIQUE NOT NULL, + first_name VARCHAR(100) NOT NULL, + last_name VARCHAR(100) NOT NULL, + membership_tier VARCHAR(20) CHECK ( + membership_tier IN ('Free', 'Pro', 'Enterprise') + ), + region VARCHAR(50), + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.orders ( + order_id SERIAL PRIMARY KEY, + customer_id INTEGER NOT NULL + REFERENCES support.customers(customer_id), + order_date TIMESTAMPTZ DEFAULT NOW(), + total_amount NUMERIC(12, 2), + status VARCHAR(20) CHECK ( + status IN ('pending', 'processing', 'shipped', + 'delivered', 'cancelled', 'returned') + ), + shipping_address TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.order_items ( + order_item_id SERIAL PRIMARY KEY, + order_id INTEGER NOT NULL + REFERENCES support.orders(order_id) ON DELETE CASCADE, + product_id INTEGER NOT NULL + REFERENCES support.products(product_id), + quantity INTEGER CHECK (quantity > 0), + unit_price NUMERIC(10, 2), + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute("CREATE INDEX idx_customers_email ON support.customers(email)") + cur.execute("CREATE INDEX idx_orders_customer_date ON support.orders(customer_id, order_date)") + cur.execute("CREATE INDEX idx_order_items_order ON support.order_items(order_id)") + cur.execute("CREATE INDEX idx_order_items_product ON support.order_items(product_id)") + cur.execute("CREATE INDEX idx_orders_status ON support.orders(status)") + + cur.execute(""" + SELECT table_name, column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_schema = 'support' + ORDER BY table_name, ordinal_position + """) + print("Schema columns:") + for row in cur.fetchall(): + print(f" {row[0]}.{row[1]}: {row[2]} (nullable: {row[3]})") + + cur.execute(""" + SELECT indexname, tablename + FROM pg_indexes + WHERE schemaname = 'support' + ORDER BY tablename, indexname + """) + print("\nIndexes:") + for row in cur.fetchall(): + print(f" {row[0]} on {row[1]}") + ``` + + **Key points:** + + - Connect to `support_db` (not the default `databricks_postgres`) by specifying `dbname=support_db`. 
+ - `autocommit = True` is required for DDL statements. + - Foreign keys: `orders.customer_id` references `customers`; `order_items` references `orders` (ON DELETE CASCADE) and `products`. + - CHECK constraints enforce valid values for `membership_tier`, `status`, and `quantity > 0`. + - The composite index on `orders(customer_id, order_date)` optimizes queries that filter by customer and sort by date. + - Verification uses `information_schema.columns` and `pg_indexes`. + - Always use `sslmode=require` in connection strings. + execution_success: true + expectations: + expected_facts: + - "Connection uses get-endpoint and generate-database-credential to get host and OAuth token" + - "Connects to support_db via dbname=support_db in the connection string" + - "autocommit must be True for DDL statements" + - "Creates a support schema with CREATE SCHEMA IF NOT EXISTS" + - "products table has SERIAL PK, UNIQUE sku, and BOOLEAN default" + - "customers table has CHECK constraint on membership_tier" + - "orders table has FK to customers(customer_id)" + - "order_items table has FK to orders with ON DELETE CASCADE" + - "order_items table has FK to products(product_id)" + - "CHECK constraint on quantity > 0" + - "Composite index on orders(customer_id, order_date) for query optimization" + - "Verification queries information_schema.columns and pg_indexes" + - "sslmode=require is used in the connection string" + expected_patterns: + - pattern: "get.endpoint" + min_count: 1 + description: "Gets endpoint details for connection host (CLI or SDK)" + - pattern: "generate.database.credential" + min_count: 1 + description: "Generates OAuth token for authentication (CLI or SDK)" + - pattern: "support_db" + min_count: 1 + description: "Connects to the support_db database" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + - pattern: "CREATE SCHEMA" + min_count: 1 + description: "Creates the support schema" + - pattern: "CREATE TABLE support\\.products" + min_count: 1 + description: "Creates the products table in support schema" + - pattern: "CREATE TABLE support\\.customers" + min_count: 1 + description: "Creates the customers table in support schema" + - pattern: "CREATE TABLE support\\.orders" + min_count: 1 + description: "Creates the orders table in support schema" + - pattern: "CREATE TABLE support\\.order_items" + min_count: 1 + description: "Creates the order_items table in support schema" + - pattern: "ON DELETE CASCADE" + min_count: 1 + description: "Cascade delete on order_items when order is deleted" + - pattern: "REFERENCES" + min_count: 3 + description: "Foreign key constraints (orders->customers, order_items->orders, order_items->products)" + - pattern: "CHECK" + min_count: 3 + description: "CHECK constraints on membership_tier, status, and quantity" + - pattern: "CREATE INDEX" + min_count: 5 + description: "Creates all 5 requested indexes" + - pattern: "information_schema" + min_count: 1 + description: "Verifies schema using information_schema" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + guidelines: + - "Must connect to support_db, not the default databricks_postgres" + - "Must use autocommit=True for DDL execution" + - "Must create all 4 tables in the support schema: products, customers, orders, order_items" + - "Must include all specified columns with correct data types and constraints" + - "Must define foreign keys between orders->customers and order_items->orders/products" + - 
"order_items.order_id FK must include ON DELETE CASCADE" + - "Must include CHECK constraints on membership_tier, status, and quantity" + - "Must create all 5 requested indexes including the composite index" + - "Must verify the schema using information_schema queries" + - "Must use Lakebase connection pattern: get-endpoint + generate-database-credential + sslmode=require" + metadata: + category: happy_path + difficulty: hard + source: interactive_add + +- id: lakebase_auto_support_cases_ddl_009 + inputs: + prompt: | + Use uv to install psycopg if not installed (use pip as a fallback). + Connect to `support_db` on `production` of `support-autoscale` and add support case tables to the `support` schema: + + **support_cases** - Customer support tickets tied to orders + - case_id (PK, serial), customer_id (FK to customers NOT NULL), order_id (FK to orders, nullable - not all cases relate to an order), subject (varchar 300 NOT NULL), description (text), category (varchar 50, check in 'order_issue','product_defect','shipping','billing','returns','general'), priority (varchar 20, check in 'low','medium','high','critical'), status (varchar 20 default 'open', check in 'open','in_progress','waiting_customer','resolved','closed'), assigned_agent (varchar 100), created_at (timestamptz default now), updated_at (timestamptz default now), resolved_at (timestamptz nullable) + + **case_products** - Junction table linking cases to one or more products involved + - case_product_id (PK, serial), case_id (FK to support_cases ON DELETE CASCADE), product_id (FK to products), is_primary (boolean default false), created_at (timestamptz default now) + - Add unique constraint on (case_id, product_id) + + **case_notes** - Activity log / notes on each support case + - note_id (PK, serial), case_id (FK to support_cases ON DELETE CASCADE), author (varchar 100 NOT NULL), note_type (varchar 20, check in 'internal','customer_reply','agent_reply','system'), content (text NOT NULL), created_at (timestamptz default now) + + Add indexes on: support_cases(customer_id), support_cases(order_id), support_cases(status, priority), support_cases(created_at), case_products(case_id), case_products(product_id), case_notes(case_id, created_at). 
+ outputs: + response: | + Here's how to install psycopg, connect to the Lakebase Autoscale project, and create the support case tables: + + ```python + import subprocess + import sys + + try: + import psycopg + except ImportError: + try: + subprocess.check_call(["uv", "pip", "install", "psycopg[binary]>=3.0"]) + except (subprocess.CalledProcessError, FileNotFoundError): + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "psycopg[binary]>=3.0"] + ) + import psycopg + + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + conn_string = ( + f"host={host} " + f"dbname=support_db " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute(""" + CREATE TABLE support.support_cases ( + case_id SERIAL PRIMARY KEY, + customer_id INTEGER NOT NULL + REFERENCES support.customers(customer_id), + order_id INTEGER + REFERENCES support.orders(order_id), + subject VARCHAR(300) NOT NULL, + description TEXT, + category VARCHAR(50) CHECK ( + category IN ('order_issue', 'product_defect', 'shipping', + 'billing', 'returns', 'general') + ), + priority VARCHAR(20) CHECK ( + priority IN ('low', 'medium', 'high', 'critical') + ), + status VARCHAR(20) DEFAULT 'open' CHECK ( + status IN ('open', 'in_progress', 'waiting_customer', + 'resolved', 'closed') + ), + assigned_agent VARCHAR(100), + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + resolved_at TIMESTAMPTZ + ) + """) + + cur.execute(""" + CREATE TABLE support.case_products ( + case_product_id SERIAL PRIMARY KEY, + case_id INTEGER NOT NULL + REFERENCES support.support_cases(case_id) ON DELETE CASCADE, + product_id INTEGER NOT NULL + REFERENCES support.products(product_id), + is_primary BOOLEAN DEFAULT FALSE, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (case_id, product_id) + ) + """) + + cur.execute(""" + CREATE TABLE support.case_notes ( + note_id SERIAL PRIMARY KEY, + case_id INTEGER NOT NULL + REFERENCES support.support_cases(case_id) ON DELETE CASCADE, + author VARCHAR(100) NOT NULL, + note_type VARCHAR(20) CHECK ( + note_type IN ('internal', 'customer_reply', + 'agent_reply', 'system') + ), + content TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute("CREATE INDEX idx_cases_customer ON support.support_cases(customer_id)") + cur.execute("CREATE INDEX idx_cases_order ON support.support_cases(order_id)") + cur.execute("CREATE INDEX idx_cases_status_priority ON support.support_cases(status, priority)") + cur.execute("CREATE INDEX idx_cases_created ON support.support_cases(created_at)") + cur.execute("CREATE INDEX idx_case_products_case ON support.case_products(case_id)") + cur.execute("CREATE INDEX idx_case_products_product ON support.case_products(product_id)") + cur.execute("CREATE INDEX idx_case_notes_case_created ON support.case_notes(case_id, created_at)") + + cur.execute(""" + SELECT table_name, column_name, data_type, column_default, is_nullable + FROM information_schema.columns + WHERE table_schema = 'support' + AND table_name IN ('support_cases', 'case_products', 'case_notes') + ORDER BY table_name, ordinal_position + 
""") + print("New table columns:") + for row in cur.fetchall(): + print(f" {row[0]}.{row[1]}: {row[2]} (default: {row[3]}, nullable: {row[4]})") + + cur.execute(""" + SELECT indexname, tablename + FROM pg_indexes + WHERE schemaname = 'support' + AND tablename IN ('support_cases', 'case_products', 'case_notes') + ORDER BY tablename, indexname + """) + print("\nIndexes:") + for row in cur.fetchall(): + print(f" {row[0]} on {row[1]}") + + cur.execute(""" + SELECT constraint_name, constraint_type + FROM information_schema.table_constraints + WHERE table_schema = 'support' + AND table_name = 'case_products' + AND constraint_type = 'UNIQUE' + """) + for row in cur.fetchall(): + print(f"\nUnique constraint: {row[0]} ({row[1]})") + ``` + + **Key points:** + + - **psycopg install**: Tries `uv pip install` first for speed, falls back to `pip` if uv is unavailable. + - **Nullable FK**: `order_id` on `support_cases` is nullable because not all cases relate to an order. + - **Default status**: `status DEFAULT 'open'` with CHECK means new cases start open and can only transition to valid states. + - **Junction table**: `case_products` uses `UNIQUE (case_id, product_id)` to prevent duplicate product associations. + - **ON DELETE CASCADE**: Deleting a support case automatically removes its `case_products` and `case_notes`. + - **Composite indexes**: `idx_cases_status_priority` optimizes dashboard queries; `idx_case_notes_case_created` optimizes chronological note retrieval. + - OAuth tokens expire after 1 hour. Always use `sslmode=require`. + execution_success: true + expectations: + expected_facts: + - "psycopg is installed via uv pip install with pip as fallback" + - "Connection uses get-endpoint and generate-database-credential to get host and OAuth token" + - "Connects to support_db, not the default databricks_postgres" + - "autocommit must be True for DDL statements" + - "support_cases has nullable FK order_id (not all cases relate to an order)" + - "support_cases has customer_id FK that is NOT NULL" + - "support_cases status defaults to 'open' with CHECK constraint" + - "case_products is a junction table with UNIQUE(case_id, product_id) constraint" + - "case_products and case_notes have ON DELETE CASCADE from support_cases" + - "case_notes has CHECK constraint on note_type" + - "Composite index on support_cases(status, priority) for dashboard queries" + - "Composite index on case_notes(case_id, created_at) for chronological note retrieval" + - "Verification uses information_schema and table_constraints" + expected_patterns: + - pattern: "uv.*pip.*install.*psycopg" + min_count: 1 + description: "Installs psycopg via uv" + - pattern: "pip.*install.*psycopg" + min_count: 1 + description: "Falls back to pip for psycopg install" + - pattern: "get.endpoint" + min_count: 1 + description: "Gets endpoint details for connection host (CLI or SDK)" + - pattern: "generate.database.credential" + min_count: 1 + description: "Generates OAuth token for authentication (CLI or SDK)" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + - pattern: "CREATE TABLE support\\.support_cases" + min_count: 1 + description: "Creates the support_cases table" + - pattern: "CREATE TABLE support\\.case_products" + min_count: 1 + description: "Creates the case_products junction table" + - pattern: "CREATE TABLE support\\.case_notes" + min_count: 1 + description: "Creates the case_notes table" + - pattern: "ON DELETE CASCADE" + min_count: 2 + description: "Cascade delete on case_products and 
case_notes when case is deleted" + - pattern: "UNIQUE.*case_id.*product_id" + min_count: 1 + description: "Unique constraint preventing duplicate case-product associations" + - pattern: "REFERENCES" + min_count: 5 + description: "FK constraints (support_cases->customers, ->orders, case_products->cases, ->products, case_notes->cases)" + - pattern: "CHECK" + min_count: 4 + description: "CHECK constraints on category, priority, status, and note_type" + - pattern: "CREATE INDEX" + min_count: 7 + description: "Creates all 7 requested indexes" + - pattern: "information_schema" + min_count: 1 + description: "Verifies schema using information_schema" + - pattern: "table_constraints" + min_count: 1 + description: "Verifies unique constraint via table_constraints" + - pattern: "DEFAULT 'open'" + min_count: 1 + description: "Status defaults to open for new cases" + guidelines: + - "Must install psycopg using uv with pip fallback" + - "Must connect to support_db using Autoscale pattern (w.postgres)" + - "Must create all 3 tables: support_cases, case_products, case_notes" + - "support_cases.order_id must be nullable (not all cases relate to an order)" + - "support_cases.customer_id must be NOT NULL with FK to customers" + - "support_cases.status must default to 'open' with CHECK constraint" + - "case_products must have UNIQUE(case_id, product_id) constraint" + - "case_products and case_notes must have ON DELETE CASCADE from support_cases" + - "Must create all 7 requested indexes including composite indexes" + - "Must verify schema using information_schema and table_constraints queries" + metadata: + category: happy_path + difficulty: hard + source: interactive_add diff --git a/.test/src/skill_test/agent/executor.py b/.test/src/skill_test/agent/executor.py index 4ef4af31..96b74dfe 100644 --- a/.test/src/skill_test/agent/executor.py +++ b/.test/src/skill_test/agent/executor.py @@ -292,7 +292,9 @@ def _get_agent_env() -> dict[str, str]: file_env = settings.get("env", {}) for k, v in file_env.items(): if isinstance(v, str): - env[k] = _resolve_env_refs(v) + resolved = _resolve_env_refs(v) + if resolved: # Skip empty values so Claude Code falls back to keychain/token-cache auth + env[k] = resolved logger.info("Loaded agent env from %s (%d vars)", p, len(file_env)) except (json.JSONDecodeError, OSError) as e: logger.warning("Failed to load %s: %s", p, e) diff --git a/.test/uv.lock b/.test/uv.lock index dbed6c0b..7a6113f8 100644 --- a/.test/uv.lock +++ b/.test/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -11,7 +11,7 @@ resolution-markers = [ [[package]] name = "aiohappyeyeballs" version = "2.6.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, @@ -20,7 +20,7 @@ wheels = [ [[package]] name = "aiohttp" version = "3.13.3" -source = { registry 
= "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "aiohappyeyeballs" }, { name = "aiosignal" }, @@ -140,7 +140,7 @@ wheels = [ [[package]] name = "aiosignal" version = "1.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "frozenlist" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, @@ -153,7 +153,7 @@ wheels = [ [[package]] name = "alembic" version = "1.18.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "mako" }, { name = "sqlalchemy" }, @@ -168,7 +168,7 @@ wheels = [ [[package]] name = "annotated-doc" version = "0.0.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, @@ -177,7 +177,7 @@ wheels = [ [[package]] name = "annotated-types" version = "0.7.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, @@ -186,7 +186,7 @@ wheels = [ [[package]] name = "anyio" version = "4.12.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "async-timeout" version = "5.0.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, @@ -209,7 +209,7 @@ wheels = [ [[package]] name = "attrs" version = 
"25.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, @@ -218,7 +218,7 @@ wheels = [ [[package]] name = "azure-core" version = "1.38.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "requests" }, { name = "typing-extensions" }, @@ -231,7 +231,7 @@ wheels = [ [[package]] name = "azure-storage-blob" version = "12.28.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "azure-core" }, { name = "cryptography" }, @@ -246,7 +246,7 @@ wheels = [ [[package]] name = "azure-storage-file-datalake" version = "12.23.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "azure-core" }, { name = "azure-storage-blob" }, @@ -261,7 +261,7 @@ wheels = [ [[package]] name = "backports-asyncio-runner" version = "1.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, @@ -270,7 +270,7 @@ wheels = [ [[package]] name = "blinker" version = "1.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 8458, upload-time = "2024-11-08T17:25:46.184Z" }, @@ -279,7 +279,7 @@ wheels = [ [[package]] name = "boto3" version = "1.42.41" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "botocore" }, { name = "jmespath" }, @@ -293,7 +293,7 @@ wheels = [ [[package]] name = "botocore" version = "1.42.41" 
-source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, @@ -307,7 +307,7 @@ wheels = [ [[package]] name = "cachetools" version = "6.2.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/39/91/d9ae9a66b01102a18cd16db0cf4cd54187ffe10f0865cc80071a4104fbb3/cachetools-6.2.6.tar.gz", hash = "sha256:16c33e1f276b9a9c0b49ab5782d901e3ad3de0dd6da9bf9bcd29ac5672f2f9e6", size = 32363, upload-time = "2026-01-27T20:32:59.956Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/90/45/f458fa2c388e79dd9d8b9b0c99f1d31b568f27388f2fdba7bb66bbc0c6ed/cachetools-6.2.6-py3-none-any.whl", hash = "sha256:8c9717235b3c651603fff0076db52d6acbfd1b338b8ed50256092f7ce9c85bda", size = 11668, upload-time = "2026-01-27T20:32:58.527Z" }, @@ -316,7 +316,7 @@ wheels = [ [[package]] name = "certifi" version = "2026.1.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, @@ -325,7 +325,7 @@ wheels = [ [[package]] name = "cffi" version = "2.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pycparser", marker = "implementation_name != 'PyPy'" }, ] @@ -407,7 +407,7 @@ wheels = [ [[package]] name = "charset-normalizer" version = "3.4.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1f/b8/6d51fc1d52cbd52cd4ccedd5b5b2f0f6a11bbf6765c782298b0f3e808541/charset_normalizer-3.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e824f1492727fa856dd6eda4f7cee25f8518a12f3c4a56a74e8095695089cf6d", size = 209709, upload-time = "2025-10-14T04:40:11.385Z" }, @@ -496,7 +496,7 @@ wheels = [ [[package]] name = "claude-agent-sdk" version = "0.1.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "mcp" }, @@ -513,7 +513,7 @@ wheels = [ [[package]] name = "click" version = "8.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] @@ -525,7 +525,7 @@ wheels = [ [[package]] name = "cloudpickle" version = "3.1.2" -source = 
{ registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, @@ -534,7 +534,7 @@ wheels = [ [[package]] name = "colorama" version = "0.4.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, @@ -543,12 +543,12 @@ wheels = [ [[package]] name = "contourpy" version = "1.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } wheels = [ @@ -613,14 +613,14 @@ wheels = [ [[package]] name = "contourpy" version = "1.3.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", ] dependencies = [ - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } wheels = [ @@ -700,7 +700,7 @@ wheels = [ [[package]] name = "cryptography" version = "46.0.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = 
"cffi", marker = "platform_python_implementation != 'PyPy'" }, { name = "typing-extensions", marker = "python_full_version < '3.11'" }, @@ -760,7 +760,7 @@ wheels = [ [[package]] name = "cycler" version = "0.12.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, @@ -769,7 +769,7 @@ wheels = [ [[package]] name = "databricks-agents" version = "1.9.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "boto3" }, { name = "botocore" }, @@ -779,8 +779,8 @@ dependencies = [ { name = "jinja2" }, { name = "litellm" }, { name = "mlflow-skinny" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "pandas" }, { name = "pydantic" }, { name = "tenacity" }, @@ -796,7 +796,7 @@ wheels = [ [[package]] name = "databricks-sdk" version = "0.84.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-auth" }, { name = "protobuf" }, @@ -817,7 +817,7 @@ openai = [ [[package]] name = "dataclasses-json" version = "0.6.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "marshmallow" }, { name = "typing-inspect" }, @@ -830,7 +830,7 @@ wheels = [ [[package]] name = "distro" version = "1.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, @@ -839,7 +839,7 @@ wheels = [ [[package]] name = "docker" version = "7.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pywin32", marker = "sys_platform == 
'win32'" }, { name = "requests" }, @@ -853,7 +853,7 @@ wheels = [ [[package]] name = "exceptiongroup" version = "1.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] @@ -865,7 +865,7 @@ wheels = [ [[package]] name = "fastapi" version = "0.128.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "annotated-doc" }, { name = "pydantic" }, @@ -880,7 +880,7 @@ wheels = [ [[package]] name = "fastuuid" version = "0.14.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c3/7d/d9daedf0f2ebcacd20d599928f8913e9d2aea1d56d2d355a93bfa2b611d7/fastuuid-0.14.0.tar.gz", hash = "sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26", size = 18232, upload-time = "2025-10-19T22:19:22.402Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/b2/731a6696e37cd20eed353f69a09f37a984a43c9713764ee3f7ad5f57f7f9/fastuuid-0.14.0-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:6e6243d40f6c793c3e2ee14c13769e341b90be5ef0c23c82fa6515a96145181a", size = 516760, upload-time = "2025-10-19T22:25:21.509Z" }, @@ -943,7 +943,7 @@ wheels = [ [[package]] name = "filelock" version = "3.20.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, @@ -952,7 +952,7 @@ wheels = [ [[package]] name = "flask" version = "3.1.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "blinker" }, { name = "click" }, @@ -969,7 +969,7 @@ wheels = [ [[package]] name = "flask-cors" version = "6.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "flask" }, { name = "werkzeug" }, @@ -982,7 +982,7 @@ wheels = [ [[package]] name = "fonttools" version = "4.61.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/5b/94/8a28707adb00bed1bf22dac16ccafe60faf2ade353dcb32c3617ee917307/fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24", size = 
2854799, upload-time = "2025-12-12T17:29:27.5Z" }, @@ -1039,7 +1039,7 @@ wheels = [ [[package]] name = "frozenlist" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/83/4a/557715d5047da48d54e659203b9335be7bfaafda2c3f627b7c47e0b3aaf3/frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011", size = 86230, upload-time = "2025-10-06T05:35:23.699Z" }, @@ -1160,7 +1160,7 @@ wheels = [ [[package]] name = "fsspec" version = "2026.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d5/7d/5df2650c57d47c57232af5ef4b4fdbff182070421e405e0d62c6cdbfaa87/fsspec-2026.1.0.tar.gz", hash = "sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b", size = 310496, upload-time = "2026-01-09T15:21:35.562Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/01/c9/97cc5aae1648dcb851958a3ddf73ccd7dbe5650d95203ecb4d7720b4cdbf/fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc", size = 201838, upload-time = "2026-01-09T15:21:34.041Z" }, @@ -1169,7 +1169,7 @@ wheels = [ [[package]] name = "gepa" version = "0.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f5/30/511e52916956508f56eca721260fcd524cfffd580e57782dd471be925f7e/gepa-0.1.0.tar.gz", hash = "sha256:f8b3d7918d4cdcf8593f39ef1cc757c4ba1a4e6793e3ffb622e6c0bc60a1efd9", size = 226064, upload-time = "2026-02-19T19:43:08.272Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1f/32/fe8afb3d2a6605a6bcbc8f119f0a2adae96e9e5d57ebed055490219956a8/gepa-0.1.0-py3-none-any.whl", hash = "sha256:4e3f8fe8ca20169e60518b2e9d416e8c4a579459848adffdcad12223fbf9643e", size = 191392, upload-time = "2026-02-19T19:43:07.065Z" }, @@ -1178,7 +1178,7 @@ wheels = [ [[package]] name = "gitdb" version = "4.0.12" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "smmap" }, ] @@ -1190,7 +1190,7 @@ wheels = [ [[package]] name = "gitpython" version = "3.1.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "gitdb" }, ] @@ -1202,7 +1202,7 @@ wheels = [ [[package]] name = "google-api-core" version = "2.29.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-auth" }, { name = "googleapis-common-protos" }, @@ -1218,7 +1218,7 @@ wheels = [ [[package]] name = "google-auth" version = "2.48.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cryptography" }, { name = "pyasn1-modules" }, @@ -1232,7 +1232,7 @@ wheels = [ 
[[package]] name = "google-cloud-core" version = "2.5.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-api-core" }, { name = "google-auth" }, @@ -1245,7 +1245,7 @@ wheels = [ [[package]] name = "google-cloud-storage" version = "3.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-api-core" }, { name = "google-auth" }, @@ -1262,7 +1262,7 @@ wheels = [ [[package]] name = "google-crc32c" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/95/ac/6f7bc93886a823ab545948c2dd48143027b2355ad1944c7cf852b338dc91/google_crc32c-1.8.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0470b8c3d73b5f4e3300165498e4cf25221c7eb37f1159e221d1825b6df8a7ff", size = 31296, upload-time = "2025-12-16T00:19:07.261Z" }, @@ -1297,7 +1297,7 @@ wheels = [ [[package]] name = "google-resumable-media" version = "2.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-crc32c" }, ] @@ -1309,7 +1309,7 @@ wheels = [ [[package]] name = "googleapis-common-protos" version = "1.72.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -1321,7 +1321,7 @@ wheels = [ [[package]] name = "graphene" version = "3.4.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "graphql-core" }, { name = "graphql-relay" }, @@ -1336,7 +1336,7 @@ wheels = [ [[package]] name = "graphql-core" version = "3.2.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ac/9b/037a640a2983b09aed4a823f9cf1729e6d780b0671f854efa4727a7affbe/graphql_core-3.2.7.tar.gz", hash = "sha256:27b6904bdd3b43f2a0556dad5d579bdfdeab1f38e8e8788e555bdcb586a6f62c", size = 513484, upload-time = "2025-11-01T22:30:40.436Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0a/14/933037032608787fb92e365883ad6a741c235e0ff992865ec5d904a38f1e/graphql_core-3.2.7-py3-none-any.whl", hash = "sha256:17fc8f3ca4a42913d8e24d9ac9f08deddf0a0b2483076575757f6c412ead2ec0", size = 207262, upload-time = "2025-11-01T22:30:38.912Z" }, @@ -1345,7 +1345,7 @@ wheels = [ [[package]] name = "graphql-relay" version = "3.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "graphql-core" }, ] @@ -1357,7 +1357,7 @@ wheels = [ [[package]] name = "greenlet" version = "3.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = 
"https://files.pythonhosted.org/packages/8a/99/1cd3411c56a410994669062bd73dd58270c00cc074cac15f385a1fd91f8a/greenlet-3.3.1.tar.gz", hash = "sha256:41848f3230b58c08bb43dee542e74a2a2e34d3c59dc3076cec9151aeeedcae98", size = 184690, upload-time = "2026-01-23T15:31:02.076Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/65/5b235b40581ad75ab97dcd8b4218022ae8e3ab77c13c919f1a1dfe9171fd/greenlet-3.3.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:04bee4775f40ecefcdaa9d115ab44736cd4b9c5fba733575bfe9379419582e13", size = 273723, upload-time = "2026-01-23T15:30:37.521Z" }, @@ -1417,7 +1417,7 @@ wheels = [ [[package]] name = "gunicorn" version = "23.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "packaging" }, ] @@ -1429,7 +1429,7 @@ wheels = [ [[package]] name = "h11" version = "0.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, @@ -1438,7 +1438,7 @@ wheels = [ [[package]] name = "hf-xet" version = "1.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" }, @@ -1467,7 +1467,7 @@ wheels = [ [[package]] name = "httpcore" version = "1.0.9" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "certifi" }, { name = "h11" }, @@ -1480,7 +1480,7 @@ wheels = [ [[package]] name = "httpx" version = "0.28.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "certifi" }, @@ -1495,7 +1495,7 @@ wheels = [ [[package]] name = "httpx-sse" version = "0.4.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = "2025-10-10T21:48:21.158Z" }, @@ -1504,7 +1504,7 @@ wheels = [ [[package]] name = "huey" version = "2.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fe/29/3428d52eb8e85025e264a291641a9f9d6407cc1e51d1b630f6ac5815999a/huey-2.6.0.tar.gz", hash = "sha256:8d11f8688999d65266af1425b831f6e3773e99415027177b8734b0ffd5e251f6", size = 221068, upload-time = "2026-01-06T03:01:02.055Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1a/34/fae9ac8f1c3a552fd3f7ff652b94c78d219dedc5fce0c0a4232457760a00/huey-2.6.0-py3-none-any.whl", hash = "sha256:1b9df9d370b49c6d5721ba8a01ac9a787cf86b3bdc584e4679de27b920395c3f", size = 76951, upload-time = "2026-01-06T03:01:00.808Z" }, @@ -1513,7 +1513,7 @@ wheels = [ [[package]] name = "huggingface-hub" version = "1.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, @@ -1534,7 +1534,7 @@ wheels = [ [[package]] name = "idna" version = "3.11" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, @@ -1543,7 +1543,7 @@ wheels = [ [[package]] name = "importlib-metadata" version = "8.7.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "zipp" }, ] @@ -1555,7 +1555,7 @@ wheels = [ [[package]] name = "iniconfig" version = "2.3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, @@ -1564,7 +1564,7 @@ wheels = [ [[package]] name = "isodate" version = "0.7.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = 
"sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, @@ -1573,7 +1573,7 @@ wheels = [ [[package]] name = "itsdangerous" version = "2.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, @@ -1582,7 +1582,7 @@ wheels = [ [[package]] name = "jinja2" version = "3.1.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -1594,7 +1594,7 @@ wheels = [ [[package]] name = "jiter" version = "0.13.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0d/5e/4ec91646aee381d01cdb9974e30882c9cd3b8c5d1079d6b5ff4af522439a/jiter-0.13.0.tar.gz", hash = "sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4", size = 164847, upload-time = "2026-02-02T12:37:56.441Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d0/5a/41da76c5ea07bec1b0472b6b2fdb1b651074d504b19374d7e130e0cdfb25/jiter-0.13.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2ffc63785fd6c7977defe49b9824ae6ce2b2e2b77ce539bdaf006c26da06342e", size = 311164, upload-time = "2026-02-02T12:35:17.688Z" }, @@ -1691,7 +1691,7 @@ wheels = [ [[package]] name = "jmespath" version = "1.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, @@ -1700,7 +1700,7 @@ wheels = [ [[package]] name = "joblib" version = "1.5.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = 
"2025-12-15T08:41:46.427Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, @@ -1709,7 +1709,7 @@ wheels = [ [[package]] name = "jsonpatch" version = "1.33" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jsonpointer" }, ] @@ -1721,7 +1721,7 @@ wheels = [ [[package]] name = "jsonpointer" version = "3.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6a/0a/eebeb1fa92507ea94016a2a790b93c2ae41a7e18778f85471dc54475ed25/jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef", size = 9114, upload-time = "2024-06-10T19:24:42.462Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595, upload-time = "2024-06-10T19:24:40.698Z" }, @@ -1730,7 +1730,7 @@ wheels = [ [[package]] name = "jsonschema" version = "4.26.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "attrs" }, { name = "jsonschema-specifications" }, @@ -1745,7 +1745,7 @@ wheels = [ [[package]] name = "jsonschema-specifications" version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "referencing" }, ] @@ -1757,7 +1757,7 @@ wheels = [ [[package]] name = "kiwisolver" version = "1.4.9" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c6/5d/8ce64e36d4e3aac5ca96996457dcf33e34e6051492399a3f1fec5657f30b/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b", size = 124159, upload-time = "2025-08-10T21:25:35.472Z" }, @@ -1865,7 +1865,7 @@ wheels = [ [[package]] name = "langchain-core" version = "1.2.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jsonpatch" }, { name = "langsmith" }, @@ -1884,7 +1884,7 @@ wheels = [ [[package]] name = "langchain-openai" version = "1.1.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "langchain-core" }, { name = "openai" }, @@ -1898,7 +1898,7 @@ wheels = [ [[package]] name = "langsmith" version = "0.6.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = 
"httpx" }, { name = "orjson", marker = "platform_python_implementation != 'PyPy'" }, @@ -1918,7 +1918,7 @@ wheels = [ [[package]] name = "litellm" version = "1.81.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "aiohttp" }, { name = "click" }, @@ -1941,7 +1941,7 @@ wheels = [ [[package]] name = "mako" version = "1.3.10" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -1953,7 +1953,7 @@ wheels = [ [[package]] name = "markupsafe" version = "3.0.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559", size = 11631, upload-time = "2025-09-27T18:36:05.558Z" }, @@ -2038,7 +2038,7 @@ wheels = [ [[package]] name = "marshmallow" version = "3.26.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "packaging" }, ] @@ -2050,15 +2050,15 @@ wheels = [ [[package]] name = "matplotlib" version = "3.10.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "contourpy", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "contourpy", version = "1.3.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "cycler" }, { name = "fonttools" }, { name = "kiwisolver" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pillow" }, { name = "pyparsing" }, @@ -2125,7 +2125,7 @@ wheels = [ [[package]] name = "mcp" version = "1.26.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "httpx" }, @@ -2150,7 +2150,7 @@ wheels = [ [[package]] name = "mlflow" version = "3.10.1" -source = { 
registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "alembic" }, { name = "cryptography" }, @@ -2163,14 +2163,14 @@ dependencies = [ { name = "matplotlib" }, { name = "mlflow-skinny" }, { name = "mlflow-tracing" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "pandas" }, { name = "pyarrow" }, - { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "skops" }, { name = "sqlalchemy" }, { name = "waitress", marker = "sys_platform == 'win32'" }, @@ -2192,7 +2192,7 @@ databricks = [ [[package]] name = "mlflow-skinny" version = "3.10.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cachetools" }, { name = "click" }, @@ -2222,7 +2222,7 @@ wheels = [ [[package]] name = "mlflow-tracing" version = "3.10.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cachetools" }, { name = "databricks-sdk" }, @@ -2241,7 +2241,7 @@ wheels = [ [[package]] name = "multidict" version = "6.7.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] @@ -2379,7 +2379,7 @@ wheels = [ [[package]] name = "mypy-extensions" version = "1.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = 
"2025-04-22T14:54:24.164Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, @@ -2388,7 +2388,7 @@ wheels = [ [[package]] name = "numpy" version = "2.2.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] @@ -2453,7 +2453,7 @@ wheels = [ [[package]] name = "numpy" version = "2.4.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", @@ -2537,7 +2537,7 @@ wheels = [ [[package]] name = "openai" version = "2.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "distro" }, @@ -2556,7 +2556,7 @@ wheels = [ [[package]] name = "opentelemetry-api" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "importlib-metadata" }, { name = "typing-extensions" }, @@ -2569,7 +2569,7 @@ wheels = [ [[package]] name = "opentelemetry-proto" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -2581,7 +2581,7 @@ wheels = [ [[package]] name = "opentelemetry-sdk" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-semantic-conventions" }, @@ -2595,7 +2595,7 @@ wheels = [ [[package]] name = "opentelemetry-semantic-conventions" version = "0.60b1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "opentelemetry-api" }, { name = "typing-extensions" }, @@ -2608,7 +2608,7 @@ wheels = [ [[package]] name = "orjson" version = "3.11.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/53/45/b268004f745ede84e5798b48ee12b05129d19235d0e15267aa57dcdb400b/orjson-3.11.7.tar.gz", hash = "sha256:9b1a67243945819ce55d24a30b59d6a168e86220452d2c96f4d1f093e71c0c49", size = 6144992, upload-time = "2026-02-02T15:38:49.29Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/de/1a/a373746fa6d0e116dd9e54371a7b54622c44d12296d5d0f3ad5e3ff33490/orjson-3.11.7-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:a02c833f38f36546ba65a452127633afce4cf0dd7296b753d3bb54e55e5c0174", size = 229140, upload-time = "2026-02-02T15:37:06.082Z" }, @@ -2689,7 +2689,7 @@ wheels = [ [[package]] name = "packaging" version = "25.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = 
"sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, @@ -2698,10 +2698,10 @@ wheels = [ [[package]] name = "pandas" version = "2.3.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -2760,7 +2760,7 @@ wheels = [ [[package]] name = "pillow" version = "12.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/41/f73d92b6b883a579e79600d391f2e21cb0df767b2714ecbd2952315dfeef/pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd", size = 5304089, upload-time = "2026-01-02T09:10:24.953Z" }, @@ -2858,7 +2858,7 @@ wheels = [ [[package]] name = "pluggy" version = "1.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, @@ -2867,7 +2867,7 @@ wheels = [ [[package]] name = "prettytable" version = "3.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "wcwidth" }, ] @@ -2879,7 +2879,7 @@ wheels = [ [[package]] name = "propcache" version = "0.4.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = 
"sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/0e/934b541323035566a9af292dba85a195f7b78179114f2c6ebb24551118a9/propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db", size = 79534, upload-time = "2025-10-08T19:46:02.083Z" }, @@ -2993,7 +2993,7 @@ wheels = [ [[package]] name = "proto-plus" version = "1.27.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -3005,7 +3005,7 @@ wheels = [ [[package]] name = "protobuf" version = "6.33.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ba/25/7c72c307aafc96fa87062aa6291d9f7c94836e43214d43722e86037aac02/protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c", size = 444465, upload-time = "2026-01-29T21:51:33.494Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b1/79/af92d0a8369732b027e6d6084251dd8e782c685c72da161bd4a2e00fbabb/protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b", size = 425769, upload-time = "2026-01-29T21:51:21.751Z" }, @@ -3020,7 +3020,7 @@ wheels = [ [[package]] name = "pyarrow" version = "22.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d9/9b/cb3f7e0a345353def531ca879053e9ef6b9f38ed91aebcf68b09ba54dec0/pyarrow-22.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:77718810bd3066158db1e95a63c160ad7ce08c6b0710bc656055033e39cdad88", size = 34223968, upload-time = "2025-10-24T10:03:31.21Z" }, @@ -3077,7 +3077,7 @@ wheels = [ [[package]] name = "pyasn1" version = "0.6.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fe/b6/6e630dff89739fcd427e3f72b3d905ce0acb85a45d4ec3e2678718a3487f/pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b", size = 146586, upload-time = "2026-01-16T18:04:18.534Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/44/b5/a96872e5184f354da9c84ae119971a0a4c221fe9b27a4d94bd43f2596727/pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf", size = 83371, upload-time = "2026-01-16T18:04:17.174Z" }, @@ -3086,7 +3086,7 @@ wheels = [ [[package]] name = "pyasn1-modules" version = "0.4.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pyasn1" }, ] @@ -3098,7 +3098,7 @@ wheels = [ [[package]] name = "pycparser" version = "3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = 
"https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, @@ -3107,7 +3107,7 @@ wheels = [ [[package]] name = "pydantic" version = "2.12.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, @@ -3122,7 +3122,7 @@ wheels = [ [[package]] name = "pydantic-core" version = "2.41.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions" }, ] @@ -3240,7 +3240,7 @@ wheels = [ [[package]] name = "pydantic-settings" version = "2.13.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, @@ -3254,7 +3254,7 @@ wheels = [ [[package]] name = "pygments" version = "2.19.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, @@ -3263,7 +3263,7 @@ wheels = [ [[package]] name = "pyjwt" version = "2.11.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5c/5a/b46fa56bf322901eee5b0454a34343cdbdae202cd421775a8ee4e42fd519/pyjwt-2.11.0.tar.gz", hash = "sha256:35f95c1f0fbe5d5ba6e43f00271c275f7a1a4db1dab27bf708073b75318ea623", size = 98019, upload-time = "2026-01-30T19:59:55.694Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/6f/01/c26ce75ba460d5cd503da9e13b21a33804d38c2165dec7b716d06b13010c/pyjwt-2.11.0-py3-none-any.whl", hash = "sha256:94a6bde30eb5c8e04fee991062b534071fd1439ef58d2adc9ccb823e7bcd0469", size = 28224, upload-time = "2026-01-30T19:59:54.539Z" }, @@ -3277,7 +3277,7 @@ crypto = [ [[package]] name = "pyparsing" version = "3.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f3/91/9c6ee907786a473bf81c5f53cf703ba0957b23ab84c264080fb5a450416f/pyparsing-3.3.2.tar.gz", hash = "sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc", size = 6851574, upload-time = "2026-01-21T03:57:59.36Z" } wheels = [ 
{ url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781, upload-time = "2026-01-21T03:57:55.912Z" }, @@ -3286,7 +3286,7 @@ wheels = [ [[package]] name = "pytest" version = "9.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, @@ -3304,7 +3304,7 @@ wheels = [ [[package]] name = "pytest-asyncio" version = "1.3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, @@ -3318,7 +3318,7 @@ wheels = [ [[package]] name = "python-dateutil" version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "six" }, ] @@ -3330,7 +3330,7 @@ wheels = [ [[package]] name = "python-dotenv" version = "1.2.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, @@ -3339,7 +3339,7 @@ wheels = [ [[package]] name = "python-multipart" version = "0.0.22" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, @@ -3348,7 +3348,7 @@ wheels = [ [[package]] name = "pytz" version = "2025.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = 
"sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, @@ -3357,7 +3357,7 @@ wheels = [ [[package]] name = "pywin32" version = "311" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } wheels = [ { url = "https://files.pythonhosted.org/packages/7b/40/44efbb0dfbd33aca6a6483191dae0716070ed99e2ecb0c53683f400a0b4f/pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3", size = 8760432, upload-time = "2025-07-14T20:13:05.9Z" }, { url = "https://files.pythonhosted.org/packages/5e/bf/360243b1e953bd254a82f12653974be395ba880e7ec23e3731d9f73921cc/pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b", size = 9590103, upload-time = "2025-07-14T20:13:07.698Z" }, @@ -3379,7 +3379,7 @@ wheels = [ [[package]] name = "pyyaml" version = "6.0.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" }, @@ -3443,7 +3443,7 @@ wheels = [ [[package]] name = "referencing" version = "0.37.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, @@ -3457,7 +3457,7 @@ wheels = [ [[package]] name = "regex" version = "2026.1.15" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0b/86/07d5056945f9ec4590b518171c4254a5925832eb727b56d3c38a7476f316/regex-2026.1.15.tar.gz", hash = "sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5", size = 414811, upload-time = "2026-01-14T23:18:02.775Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ea/d2/e6ee96b7dff201a83f650241c52db8e5bd080967cb93211f57aa448dc9d6/regex-2026.1.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4e3dd93c8f9abe8aa4b6c652016da9a3afa190df5ad822907efe6b206c09896e", size = 488166, upload-time = "2026-01-14T23:13:46.408Z" }, @@ -3578,7 +3578,7 @@ wheels = [ [[package]] name = "requests" version = "2.32.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, @@ -3593,7 +3593,7 @@ wheels = [ [[package]] name = "requests-toolbelt" version = "1.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "requests" }, ] @@ -3605,7 +3605,7 @@ wheels = [ [[package]] name = "rpds-py" version = "0.30.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = 
"https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" }, @@ -3727,7 +3727,7 @@ wheels = [ [[package]] name = "rsa" version = "4.9.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pyasn1" }, ] @@ -3739,7 +3739,7 @@ wheels = [ [[package]] name = "s3transfer" version = "0.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "botocore" }, ] @@ -3751,14 +3751,14 @@ wheels = [ [[package]] name = "scikit-learn" version = "1.7.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ { name = "joblib", marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, { name = "threadpoolctl", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/98/c2/a7855e41c9d285dfe86dc50b250978105dce513d6e459ea66a6aeb0e1e0c/scikit_learn-1.7.2.tar.gz", hash = "sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda", size = 7193136, upload-time = "2025-09-09T08:21:29.075Z" } @@ -3798,7 +3798,7 @@ wheels = [ [[package]] name = "scikit-learn" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", @@ -3806,8 +3806,8 @@ resolution-markers = [ ] dependencies = [ { name = "joblib", marker = "python_full_version >= '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "threadpoolctl", marker = "python_full_version >= '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } @@ -3853,12 +3853,12 @@ wheels = [ [[package]] name = "scipy" version = "1.15.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" } wheels = [ @@ -3912,14 +3912,14 @@ wheels = [ [[package]] name = "scipy" version = "1.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", ] dependencies = [ - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/56/3e/9cca699f3486ce6bc12ff46dc2031f1ec8eb9ccc9a320fdaf925f1417426/scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e", size = 30396830, upload-time = "2026-01-10T21:34:23.009Z" } wheels = [ @@ -3988,7 +3988,7 @@ wheels = [ [[package]] name = "shellingham" version = "1.5.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, @@ -3997,7 +3997,7 @@ wheels = [ [[package]] name = "six" version = "1.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = 
"sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, @@ -4008,6 +4008,8 @@ name = "skill-test" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "claude-agent-sdk" }, + { name = "gepa" }, { name = "mlflow", extra = ["databricks"] }, { name = "protobuf" }, { name = "python-dotenv" }, @@ -4022,6 +4024,7 @@ all = [ { name = "claude-agent-sdk" }, { name = "databricks-sdk" }, { name = "gepa" }, + { name = "litellm" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "tiktoken" }, @@ -4035,14 +4038,18 @@ dev = [ ] optimize = [ { name = "gepa" }, + { name = "litellm" }, { name = "tiktoken" }, ] [package.metadata] requires-dist = [ + { name = "claude-agent-sdk", specifier = ">=0.1.39" }, { name = "claude-agent-sdk", marker = "extra == 'agent'", specifier = ">=0.1.39" }, { name = "databricks-sdk", marker = "extra == 'databricks'", specifier = ">=0.20.0" }, + { name = "gepa", specifier = ">=0.1.0" }, { name = "gepa", marker = "extra == 'optimize'", specifier = ">=0.1.0" }, + { name = "litellm", marker = "extra == 'optimize'", specifier = "<=1.82.6" }, { name = "mlflow", extras = ["databricks"], specifier = ">=3.10.1" }, { name = "protobuf", specifier = ">=5.26" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, @@ -4057,16 +4064,16 @@ provides-extras = ["databricks", "dev", "optimize", "agent", "all"] [[package]] name = "skops" version = "0.13.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "prettytable" }, - { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/b5/0c/5ec987633e077dd0076178ea6ade2d6e57780b34afea0b497fb507d7a1ed/skops-0.13.0.tar.gz", hash = "sha256:66949fd3c95cbb5c80270fbe40293c0fe1e46cb4a921860e42584dd9c20ebeb1", size = 581312, upload-time = "2025-08-06T09:48:14.916Z" } wheels = [ @@ -4076,7 +4083,7 @@ wheels = [ [[package]] name = "smmap" version = "5.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, @@ -4085,7 +4092,7 @@ wheels = [ [[package]] name = "sniffio" version = "1.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, @@ -4094,7 +4101,7 @@ wheels = [ [[package]] name = "sqlalchemy" version = "2.0.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" }, { name = "typing-extensions" }, @@ -4150,7 +4157,7 @@ wheels = [ [[package]] name = "sqlparse" version = "0.5.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/90/76/437d71068094df0726366574cf3432a4ed754217b436eb7429415cf2d480/sqlparse-0.5.5.tar.gz", hash = "sha256:e20d4a9b0b8585fdf63b10d30066c7c94c5d7a7ec47c889a2d83a3caa93ff28e", size = 120815, upload-time = "2025-12-19T07:17:45.073Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/49/4b/359f28a903c13438ef59ebeee215fb25da53066db67b305c125f1c6d2a25/sqlparse-0.5.5-py3-none-any.whl", hash = "sha256:12a08b3bf3eec877c519589833aed092e2444e68240a3577e8e26148acc7b1ba", size = 46138, upload-time = "2025-12-19T07:17:46.573Z" }, @@ -4159,7 +4166,7 @@ wheels = [ [[package]] name = "sse-starlette" version = "3.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "starlette" }, @@ -4172,7 +4179,7 @@ wheels = [ [[package]] name = "starlette" version = "0.50.0" -source = { registry = 
"https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, @@ -4185,7 +4192,7 @@ wheels = [ [[package]] name = "tenacity" version = "9.1.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, @@ -4194,7 +4201,7 @@ wheels = [ [[package]] name = "threadpoolctl" version = "3.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, @@ -4203,7 +4210,7 @@ wheels = [ [[package]] name = "tiktoken" version = "0.12.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "regex" }, { name = "requests" }, @@ -4264,7 +4271,7 @@ wheels = [ [[package]] name = "tokenizers" version = "0.22.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "huggingface-hub" }, ] @@ -4294,7 +4301,7 @@ wheels = [ [[package]] name = "tomli" version = "2.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/82/30/31573e9457673ab10aa432461bee537ce6cef177667deca369efb79df071/tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c", size = 17477, upload-time = "2026-01-11T11:22:38.165Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/d9/3dc2289e1f3b32eb19b9785b6a006b28ee99acb37d1d47f78d4c10e28bf8/tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867", size = 153663, upload-time = "2026-01-11T11:21:45.27Z" }, @@ -4348,7 +4355,7 @@ wheels = [ [[package]] name = "tqdm" version = "4.67.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] @@ -4360,7 +4367,7 @@ wheels = [ [[package]] name = "typer-slim" version = "0.21.1" -source = { registry = "https://pypi.org/simple" } 
+source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "click" }, { name = "typing-extensions" }, @@ -4373,7 +4380,7 @@ wheels = [ [[package]] name = "typing-extensions" version = "4.15.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, @@ -4382,7 +4389,7 @@ wheels = [ [[package]] name = "typing-inspect" version = "0.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "mypy-extensions" }, { name = "typing-extensions" }, @@ -4395,7 +4402,7 @@ wheels = [ [[package]] name = "typing-inspection" version = "0.4.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions" }, ] @@ -4407,7 +4414,7 @@ wheels = [ [[package]] name = "tzdata" version = "2025.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, @@ -4416,7 +4423,7 @@ wheels = [ [[package]] name = "urllib3" version = "2.6.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, @@ -4425,7 +4432,7 @@ wheels = [ [[package]] name = "uuid-utils" version = "0.14.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/57/7c/3a926e847516e67bc6838634f2e54e24381105b4e80f9338dc35cca0086b/uuid_utils-0.14.0.tar.gz", hash = "sha256:fc5bac21e9933ea6c590433c11aa54aaca599f690c08069e364eb13a12f670b4", size 
= 22072, upload-time = "2026-01-20T20:37:15.729Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a7/42/42d003f4a99ddc901eef2fd41acb3694163835e037fb6dde79ad68a72342/uuid_utils-0.14.0-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f6695c0bed8b18a904321e115afe73b34444bc8451d0ce3244a1ec3b84deb0e5", size = 601786, upload-time = "2026-01-20T20:37:09.843Z" }, @@ -4454,7 +4461,7 @@ wheels = [ [[package]] name = "uvicorn" version = "0.40.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "click" }, { name = "h11" }, @@ -4468,7 +4475,7 @@ wheels = [ [[package]] name = "waitress" version = "3.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901, upload-time = "2024-11-16T20:02:35.195Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232, upload-time = "2024-11-16T20:02:33.858Z" }, @@ -4477,7 +4484,7 @@ wheels = [ [[package]] name = "wcwidth" version = "0.5.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c2/62/a7c072fbfefb2980a00f99ca994279cb9ecf310cb2e6b2a4d2a28fe192b3/wcwidth-0.5.3.tar.gz", hash = "sha256:53123b7af053c74e9fe2e92ac810301f6139e64379031f7124574212fb3b4091", size = 157587, upload-time = "2026-01-31T03:52:10.92Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/c1/d73f12f8cdb1891334a2ccf7389eed244d3941e74d80dd220badb937f3fb/wcwidth-0.5.3-py3-none-any.whl", hash = "sha256:d584eff31cd4753e1e5ff6c12e1edfdb324c995713f75d26c29807bb84bf649e", size = 92981, upload-time = "2026-01-31T03:52:09.14Z" }, @@ -4486,7 +4493,7 @@ wheels = [ [[package]] name = "werkzeug" version = "3.1.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -4498,7 +4505,7 @@ wheels = [ [[package]] name = "whenever" version = "0.7.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "tzdata", marker = "sys_platform == 'win32'" }, ] @@ -4565,7 +4572,7 @@ wheels = [ [[package]] name = "xxhash" version = "3.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/34/ee/f9f1d656ad168681bb0f6b092372c1e533c4416b8069b1896a175c46e484/xxhash-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87ff03d7e35c61435976554477a7f4cd1704c3596a89a8300d5ce7fc83874a71", size = 32845, 
upload-time = "2025-10-02T14:33:51.573Z" }, @@ -4683,7 +4690,7 @@ wheels = [ [[package]] name = "yarl" version = "1.22.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "idna" }, { name = "multidict" }, @@ -4809,7 +4816,7 @@ wheels = [ [[package]] name = "zipp" version = "3.23.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, @@ -4818,7 +4825,7 @@ wheels = [ [[package]] name = "zstandard" version = "0.25.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/56/7a/28efd1d371f1acd037ac64ed1c5e2b41514a6cc937dd6ab6a13ab9f0702f/zstandard-0.25.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e59fdc271772f6686e01e1b3b74537259800f57e24280be3f29c8a0deb1904dd", size = 795256, upload-time = "2025-09-14T22:15:56.415Z" }, diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index 797b8bd4..82fcbcdd 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -7,7 +7,7 @@ description: "Patterns and best practices for Lakebase Autoscaling (next-gen man Next-generation managed PostgreSQL on Databricks β€” autoscaling compute (0.5-112 CU, ~2 GB/CU), Git-like branching, scale-to-zero, and point-in-time restore (up to 35 days). -**Interface: Databricks CLI (`databricks postgres ...`).** Every admin command below uses the CLI. The only place SDK is preferred is inside an application that needs to refresh 1-hour OAuth tokens β€” see [connection-patterns.md](connection-patterns.md). +**Interface: Databricks CLI (`databricks postgres ...`).** Every admin command below uses the CLI. The only place SDK is preferred is inside an application that needs to refresh 1-hour OAuth tokens β€” see [connection-patterns.md](references/connection-patterns.md). ## Hierarchy @@ -46,7 +46,7 @@ databricks postgres update-project projects/my-app spec.display_name \ databricks postgres delete-project projects/my-app ``` -β†’ Defaults, limits table, LRO mechanics, SDK equivalents: [projects.md](projects.md). +β†’ Defaults, limits table, LRO mechanics, SDK equivalents: [projects.md](references/projects.md). 
--- @@ -74,13 +74,13 @@ databricks postgres reset-branch projects/my-app/branches/development databricks postgres delete-branch projects/my-app/branches/development ``` -β†’ Copy-on-write internals, TTL rules (max 30 days), reset constraints, SDK equivalents: [branches.md](branches.md). +β†’ Copy-on-write internals, TTL rules (max 30 days), reset constraints, SDK equivalents: [branches.md](references/branches.md). --- ## Endpoints (Compute) -A compute runs Postgres for one branch. One R/W endpoint per branch (plus optional read replicas). Autoscale range: 0.5-32 CU with max-min ≀ 8 CU. Large fixed sizes: 36-112 CU. +A compute runs Postgres for one branch. One R/W endpoint per branch (plus optional read replicas). Autoscale range: 0.5-32 CU with max-min ≀ 16 CU. Large fixed sizes: 36-112 CU. ```bash # Create an R/W endpoint β€” replace RW with ENDPOINT_TYPE_READ_ONLY for read replicas @@ -93,6 +93,9 @@ databricks postgres create-endpoint \ # Get host, state, CU range databricks postgres get-endpoint projects/my-app/branches/production/endpoints/primary +# List all endpoints on a branch +databricks postgres list-endpoints projects/my-app/branches/production + # Resize β€” mask is a comma-separated positional; JSON holds new values databricks postgres update-endpoint \ projects/my-app/branches/production/endpoints/primary \ @@ -103,9 +106,9 @@ databricks postgres update-endpoint \ databricks postgres delete-endpoint projects/my-app/branches/production/endpoints/my-compute ``` -**Scale-to-zero:** off on `production` by default, configurable elsewhere (min 60s, default 5min). Reactivation takes a few hundred ms; session context (temp tables, prepared statements, in-memory cache) is **reset** on wake. +**Scale-to-zero:** off on `production` by default, configurable elsewhere (min 60s, default 5min). Reactivation takes ~100ms; session context (temp tables, prepared statements, in-memory cache) is **reset** on wake. -β†’ CU sizing table, autoscaling math, scale-to-zero internals, SDK equivalents: [computes.md](computes.md). +β†’ CU sizing table, autoscaling math, scale-to-zero internals, SDK equivalents: [computes.md](references/computes.md). --- @@ -137,11 +140,26 @@ echo "postgresql://${USER/@/%40}:$TOKEN@$HOST:5432/databricks_postgres?sslmode=r PGPASSWORD="$TOKEN" psql "host=$HOST dbname=databricks_postgres user=$USER sslmode=require" ``` -Token TTL is ~1 hour. For app deployment, store **only the endpoint path** as config and generate the token at startup (and every ~50 min thereafter) β€” never bake the token into env files. +Token TTL is ~1 hour. For app deployment, store **only the endpoint path** as config and generate the token at startup (and every 45 min thereafter) β€” never bake the token into env files. Application code is the one place to use the SDK β€” tokens expire hourly and must be refreshed in-process. -β†’ Runtime connection patterns (minimal SDK snippet, SQLAlchemy pooling, async refresh loop, macOS DNS workaround, static-URL local dev): [connection-patterns.md](connection-patterns.md). 
+```python +# Application code β€” refresh token every 45 min: +import psycopg +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() +ep = "projects/my-app/branches/production/endpoints/primary" +host = w.postgres.get_endpoint(name=ep).status.hosts.host +token = w.postgres.generate_database_credential(endpoint=ep).token +conn = psycopg.connect( + f"host={host} dbname=databricks_postgres " + f"user={w.current_user.me().user_name} password={token} sslmode=require" +) +``` + +β†’ Runtime connection patterns (minimal SDK snippet, SQLAlchemy pooling, async refresh loop, macOS DNS workaround, static-URL local dev): [connection-patterns.md](references/connection-patterns.md). --- @@ -151,21 +169,23 @@ Syncs Unity Catalog Delta tables into Lakebase as Postgres tables via managed La ```bash # Create a synced table β€” swap scheduling_policy to SNAPSHOT | TRIGGERED | CONTINUOUS -databricks database create-synced-database-table \ - --json '{"name": "lakebase_catalog.schema.synced_table", - "spec": {"source_table_full_name": "analytics.gold.user_profiles", +databricks postgres create-synced-table lakebase_catalog.schema.synced_table \ + --json '{"spec": {"source_table_full_name": "analytics.gold.user_profiles", "primary_key_columns": ["user_id"], "scheduling_policy": "TRIGGERED", "new_pipeline_spec": {"storage_catalog": "lakebase_catalog", "storage_schema": "staging"}}}' # Status (detailed_state shows sync progress) -databricks database get-synced-database-table lakebase_catalog.schema.synced_table +databricks postgres get-synced-table synced_tables/lakebase_catalog.schema.synced_table + +# Delete (also drop the Postgres-side table separately) +databricks postgres delete-synced-table synced_tables/lakebase_catalog.schema.synced_table ``` Enable CDF on the source for TRIGGERED/CONTINUOUS: `ALTER TABLE ... SET TBLPROPERTIES (delta.enableChangeDataFeed = true)`. Each synced table uses up to 16 connections and counts against per-branch limits. -β†’ Mode comparison, type mapping (UC β†’ Postgres), capacity planning, schema evolution rules, SDK equivalents: [reverse-etl.md](reverse-etl.md). +β†’ Mode comparison, type mapping (UC β†’ Postgres), capacity planning, schema evolution rules, SDK equivalents: [reverse-etl.md](references/reverse-etl.md). --- @@ -173,18 +193,37 @@ Enable CDF on the source for TRIGGERED/CONTINUOUS: `ALTER TABLE ... 
SET TBLPROPE | Issue | Solution | |-------|----------| -| Token expired during long query | Refresh tokens every ~50 min (1h TTL) | -| Connection refused after scale-to-zero | Compute wakes on connect (~few hundred ms); add retry logic | +| Token expired during long query | Refresh tokens every 45 min (1h TTL) | +| Connection refused after scale-to-zero | Compute wakes on connect (~100ms); add retry logic | | DNS resolution fails on macOS | Pass `hostaddr` (resolved via `dig`) alongside `host` to psycopg | | Branch delete blocked | Delete child branches first; remove protection first | -| Autoscaling range rejected | max-min must be ≀ 8 CU (e.g., 8-16 valid; 0.5-32 invalid) | +| Autoscaling range rejected | max-min must be ≀ 16 CU (e.g., 4-20 valid; 0.5-32 invalid) | | SSL required error | Always `sslmode=require` | | Update mask required | CLI `update-*` commands take the mask as a positional arg | | Connection closed after 24h idle | 24h idle timeout, 3-day max lifetime β€” add retry | +## Databricks Apps Integration + +Scaffold an app connected to Lakebase at creation time: + +```bash +databricks apps init --name my-app \ + --features lakebase \ + --set "lakebase.postgres.branch=production" \ + --set "lakebase.postgres.database=databricks_postgres" +``` + +## High Availability + +HA adds 1–3 read secondaries across availability zones with automatic failover. Secondaries are accessible via a `-ro` suffix on the host and independently autoscale (but won't drop below the primary's current CU). HA is incompatible with scale-to-zero. See [computes.md](references/computes.md) for sizing constraints. + +## Lakehouse Sync (Beta β€” AWS only) + +Reverse direction: continuously streams Postgres changes from Lakebase into Unity Catalog Delta tables via CDC. Azure support TBD. Enable via the project UI. + ## Not Yet Supported -HA readable secondaries (use read replicas), Databricks Apps UI integration (manual connect works), Feature Store, stateful AI agents (LangChain memory), Postgresβ†’Delta sync, custom billing tags / serverless budget policies. +Custom billing tags / serverless budget policies. ## Related Skills diff --git a/databricks-skills/databricks-lakebase-autoscale/branches.md b/databricks-skills/databricks-lakebase-autoscale/references/branches.md similarity index 98% rename from databricks-skills/databricks-lakebase-autoscale/branches.md rename to databricks-skills/databricks-lakebase-autoscale/references/branches.md index ade6428a..d5092b4b 100644 --- a/databricks-skills/databricks-lakebase-autoscale/branches.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/branches.md @@ -1,6 +1,6 @@ # Lakebase Autoscaling β€” Branches (deep dive) -Deep dive for the Branches concept. Basic CLI is in [SKILL.md](SKILL.md). +Deep dive for the Branches concept. Basic CLI is in [SKILL.md](../SKILL.md). ## How Branching Works diff --git a/databricks-skills/databricks-lakebase-autoscale/computes.md b/databricks-skills/databricks-lakebase-autoscale/references/computes.md similarity index 77% rename from databricks-skills/databricks-lakebase-autoscale/computes.md rename to databricks-skills/databricks-lakebase-autoscale/references/computes.md index eb0bc62e..0ddea61c 100644 --- a/databricks-skills/databricks-lakebase-autoscale/computes.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/computes.md @@ -1,10 +1,10 @@ # Lakebase Autoscaling β€” Computes (deep dive) -Deep dive for Endpoints (computes). Basic CLI is in [SKILL.md](SKILL.md). +Deep dive for Endpoints (computes). 
Basic CLI is in [SKILL.md](../SKILL.md). ## What an Endpoint Is -An endpoint is a Postgres server instance attached to one branch. Each branch has exactly one R/W endpoint (conventionally `ep-primary`) and may have additional read-only replicas. The endpoint owns the hostname clients connect to and the CU budget that determines concurrency and RAM. +An endpoint is a Postgres server instance attached to one branch. Each branch has exactly one R/W endpoint (named `primary` by default) and may have additional read-only replicas. The endpoint owns the hostname clients connect to and the CU budget that determines concurrency and RAM. ## Compute Units @@ -27,12 +27,12 @@ Max connections flattens at 4,000 above 32 CU β€” scale up past 32 CU for memory | Category | Range | Behavior | |----------|-------|----------| -| Autoscale | 0.5-32 CU | Dynamic scaling; `max βˆ’ min ≀ 8 CU` | +| Autoscale | 0.5-32 CU | Dynamic scaling; `max βˆ’ min ≀ 16 CU` | | Large fixed | 36-112 CU | Fixed size, no autoscaling | -**Autoscaling window constraint.** The spread between `autoscaling_limit_min_cu` and `autoscaling_limit_max_cu` cannot exceed 8 CU: -- Valid: 4-8, 8-16, 16-24 -- Invalid: 0.5-32 (31.5 CU spread), 0.5-16 (15.5 CU spread) +**Autoscaling window constraint.** The spread between `autoscaling_limit_min_cu` and `autoscaling_limit_max_cu` cannot exceed 16 CU: +- Valid: .5-4, 4-20, 8-32 +- Invalid: 0.5-32 (31.5 CU spread), 1-24 (23 CU spread) Set the minimum high enough to keep your working set in memory β€” traffic that lands after a scale-up pays a cache-warm penalty until hot pages are faulted back in. @@ -47,7 +47,7 @@ When enabled, an endpoint suspends after an inactivity window (min 60 s, default ### Wake-up -Incoming connections to a suspended endpoint trigger reactivation. Expected latency is a few hundred ms, but: +Incoming connections to a suspended endpoint trigger reactivation. Expected latency is ~100ms, but: - First connection may see a timeout β€” applications must retry. - Endpoint resumes at the **minimum** of its autoscaling range; expect cache-cold performance until load ramps up. - All session-scoped state is lost: in-memory stats, temp tables, prepared statements, session GUCs, active transactions. @@ -93,6 +93,20 @@ databricks postgres update-endpoint \ --json '{"spec": {"autoscaling_limit_min_cu": 64.0, "autoscaling_limit_max_cu": 64.0}}' ``` +## High Availability + +HA adds 1–3 read secondaries across availability zones with automatic failover (hot standby, automatic primary promotion on failure). + +**Secondaries vs. read replicas:** +- **HA secondaries** share a `-ro` hostname suffix and floor at the primary's current CU. Use for failover with optional read offload. +- **Read replicas** are separate endpoints (`ENDPOINT_TYPE_READ_ONLY`) with independent sizing. Use for dedicated read scaling. + +**HA constraints:** +- Scale-to-zero is **not supported** with HA enabled. +- Autoscaling spread limit (≀16 CU) still applies. +- Secondaries autoscale independently but won't drop below the primary's current minimum. +- Total compute instances per project: 2–4 (primary + 1–3 secondaries). 
+ ## Sizing Guidance | Factor | Recommendation | diff --git a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md b/databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md similarity index 86% rename from databricks-skills/databricks-lakebase-autoscale/connection-patterns.md rename to databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md index 111238e7..ca295292 100644 --- a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md @@ -1,6 +1,6 @@ # Lakebase Autoscaling β€” Connection Patterns (deep dive) -Deep dive for the application-runtime connection layer. Basic credential generation and a minimal Python snippet are in [SKILL.md](SKILL.md#credentials--connecting). +Deep dive for the application-runtime connection layer. Basic credential generation and a minimal Python snippet are in [SKILL.md](../SKILL.md#credentials--connecting). **Why this file uses the SDK and the others don't.** OAuth tokens are 1-hour TTL and must be refreshed from inside the running process β€” shelling out to the CLI per refresh is slow, fragile, and awkward to embed in a pool. All admin operations (project, branch, endpoint, synced-table lifecycle) stay on the CLI; only runtime token rotation and connection pooling live here. @@ -105,7 +105,7 @@ class LakebaseAutoscaleConnectionManager: database_name: str = "databricks_postgres", pool_size: int = 5, max_overflow: int = 10, - token_refresh_seconds: int = 3000 # 50 minutes + token_refresh_seconds: int = 2700 # 45 minutes ): self.project_id = project_id self.branch_id = branch_id @@ -308,9 +308,47 @@ conn = psycopg.connect(**conn_params) ## Best Practices 1. **Always use SSL**: Set `sslmode=require` in all connections -2. **Implement token refresh**: Tokens expire after 1 hour; refresh at 50 minutes +2. **Implement token refresh**: Tokens expire after 1 hour; refresh every 45 minutes 3. **Use connection pooling**: Avoid creating new connections per request 4. **Handle DNS issues on macOS**: Use the `hostaddr` workaround if needed 5. **Close connections properly**: Use context managers or explicit cleanup -6. **Handle scale-to-zero wake-up**: First connection after idle may take 2-5 seconds +6. **Handle scale-to-zero wake-up**: First connection after idle may take ~100ms; implement retry logic 7. **Log token refresh events**: Helps debug authentication issues + +## Data API (Autoscaling only) + +A PostgREST-compatible HTTP CRUD interface β€” no Postgres driver required. Enable in the project UI (auto-creates an `authenticator` role and `pgrst` schema). 
+ +All requests require a Databricks OAuth bearer token: + +```bash +TOKEN=$(databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary | jq -r '.token') +DATA_API_URL="https:///api/2.0/lakebase/projects/my-app/data" +``` + +```bash +# GET β€” filter, paginate, order +curl -H "Authorization: Bearer $TOKEN" \ + "$DATA_API_URL/public/users?age=gt.21&limit=10&order=created_at.desc" + +# POST β€” insert +curl -X POST -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"name": "Alice", "email": "alice@example.com"}' \ + "$DATA_API_URL/public/users" + +# PATCH β€” update (filter is required) +curl -X PATCH -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"status": "active"}' \ + "$DATA_API_URL/public/users?id=eq.42" + +# DELETE β€” filter is required +curl -X DELETE -H "Authorization: Bearer $TOKEN" \ + "$DATA_API_URL/public/users?id=eq.42" +``` + +**Row-level security** with `current_user` policies is strongly recommended for multi-tenant apps. + +**Unsupported:** computed relationships, inner-join embedding, custom media types, transaction control headers, EXPLAIN/trace, pre-request functions, PostGIS auto-GeoJSON. diff --git a/databricks-skills/databricks-lakebase-autoscale/projects.md b/databricks-skills/databricks-lakebase-autoscale/references/projects.md similarity index 98% rename from databricks-skills/databricks-lakebase-autoscale/projects.md rename to databricks-skills/databricks-lakebase-autoscale/references/projects.md index 1555f697..ca629312 100644 --- a/databricks-skills/databricks-lakebase-autoscale/projects.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/projects.md @@ -1,6 +1,6 @@ # Lakebase Autoscaling β€” Projects (deep dive) -Deep dive for the Projects concept. Basic CLI is in [SKILL.md](SKILL.md). +Deep dive for the Projects concept. Basic CLI is in [SKILL.md](../SKILL.md). ## What a Project Is diff --git a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md b/databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md similarity index 81% rename from databricks-skills/databricks-lakebase-autoscale/reverse-etl.md rename to databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md index ce7fa3e9..49720ff2 100644 --- a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md @@ -1,6 +1,6 @@ # Lakebase Autoscaling β€” Reverse ETL (deep dive) -Deep dive for synced tables. Basic CLI is in [SKILL.md](SKILL.md). +Deep dive for synced tables. Basic CLI is in [SKILL.md](../SKILL.md). ## How Synced Tables Work @@ -15,8 +15,8 @@ The pipeline is a managed Lakeflow Spark Declarative Pipeline that runs on Datab | Mode | Throughput (per CU) | Latency | |------|---------------------|---------| -| Continuous writes | ~1,200 rows/s | seconds | -| Bulk writes (initial load / snapshot) | ~15,000 rows/s | minutes-hours depending on size | +| Snapshot (initial load) | ~2,000 rows/s | minutes-hours depending on size | +| Triggered / Continuous (incremental) | ~150 rows/s | seconds (Continuous); scheduled (Triggered) | Each synced table uses **up to 16 connections** on the target endpoint. Account for this when sizing concurrency-heavy apps alongside reverse ETL on the same endpoint. 
@@ -96,9 +96,8 @@ DROP TABLE your_database.your_schema.your_table; Product catalog for a web app (hourly refresh is plenty): ```bash -databricks database create-synced-database-table \ - --json '{"name": "ecommerce_catalog.public.products", - "spec": {"source_table_full_name": "gold.products.catalog", +databricks postgres create-synced-table ecommerce_catalog.public.products \ + --json '{"spec": {"source_table_full_name": "gold.products.catalog", "primary_key_columns": ["product_id"], "scheduling_policy": "TRIGGERED"}}' ``` @@ -106,28 +105,31 @@ databricks database create-synced-database-table \ Real-time feature serving for ML (needs CDF on `ml.features.user_features`): ```bash -databricks database create-synced-database-table \ - --json '{"name": "ml_catalog.public.user_features", - "spec": {"source_table_full_name": "ml.features.user_features", +databricks postgres create-synced-table ml_catalog.public.user_features \ + --json '{"spec": {"source_table_full_name": "ml.features.user_features", "primary_key_columns": ["user_id"], "scheduling_policy": "CONTINUOUS"}}' ``` +## Lakehouse Sync (Beta β€” AWS only) + +Reverse direction: continuously streams Postgres row changes from Lakebase into Unity Catalog Delta tables via CDC. Enable via the project UI. Azure support TBD. + ## SDK Equivalents -Synced tables use the `w.database` SDK module (not `w.postgres`): +Synced tables use the `w.postgres` SDK module (CLI v0.294.0+): ```python from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, SyncedTableSpec, +from databricks.sdk.service.postgres import ( + SyncedTable, SyncedTableSpec, NewPipelineSpec, SyncedTableSchedulingPolicy, ) w = WorkspaceClient() -w.database.create_synced_database_table( - SyncedDatabaseTable( +w.postgres.create_synced_table( + SyncedTable( name="lakebase_catalog.schema.synced_table", spec=SyncedTableSpec( source_table_full_name="analytics.gold.user_profiles", @@ -141,6 +143,6 @@ w.database.create_synced_database_table( ) ) -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") +status = w.postgres.get_synced_table(name="synced_tables/lakebase_catalog.schema.synced_table") print(status.data_synchronization_status.detailed_state) ``` From 9ed16205a2d38a4dcbfd0a430d1dc648f9a93771 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Wed, 6 May 2026 15:15:14 +0200 Subject: [PATCH 63/66] Dashboard skill: bar/line x+y both required, neutral pie wording MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two clarity tweaks driven by real failure modes from a demo session: 1-widget-specifications.md (Line / Bar Charts): Changed "Use x, y, optional color" β€” which read as "any of these are knobs, take what you need" β€” to: "x and y are both REQUIRED (one categorical/temporal dimension + one quantitative measure). color is optional for splitting into series." Bar/line widgets with only one axis populated are structurally invalid in Lakeview; rendering silently fails. The old wording let agents skim and skip y when x was a function call (SUM(...)) and y was the categorical dimension. SKILL.md (Design Best Practices): Removed implicit "pie and bar are equivalent" pairing in chart selection guidance. Bar is the default for low-cardinality comparison; pie is fine for snapshot composition but should not be presented as the first choice. 
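For illustration only, a bar widget that satisfies the rule carries both encodings plus an optional color split, along the lines of the sketch below. The field names and the exact key layout here are hypothetical, not lifted from the skill; only widgetType, version, x/y/color, and scale.type come from the wording above.

```bash
# Hypothetical sketch only: one categorical x, one quantitative y, optional color series split.
# Key names beyond widgetType / version / x / y / color / scale.type are assumptions, not the skill's schema.
cat > /tmp/example-bar-widget.json <<'EOF'
{
  "version": 3,
  "widgetType": "bar",
  "encodings": {
    "x":     { "fieldName": "region",       "scale": { "type": "categorical" } },
    "y":     { "fieldName": "total_sales",  "scale": { "type": "quantitative" } },
    "color": { "fieldName": "product_line", "scale": { "type": "categorical" } }
  }
}
EOF
```

Dropping either x or y from a spec like this is exactly the structurally invalid case the reworded guidance is meant to prevent.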
Co-authored-by: Isaac --- .../databricks-aibi-dashboards/1-widget-specifications.md | 2 +- databricks-skills/databricks-aibi-dashboards/SKILL.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md index 99842a77..d7851f00 100644 --- a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md +++ b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md @@ -198,7 +198,7 @@ Format types: `number`, `number-currency`, `number-percent` - `version`: **3** - `widgetType`: "line" or "bar" -- Use `x`, `y`, optional `color` encodings +- **`x` and `y` are both REQUIRED** (one categorical/temporal dimension + one quantitative measure). `color` is optional for splitting into series. - `scale.type`: `"temporal"` (dates), `"quantitative"` (numbers), `"categorical"` (strings) - Use `"disaggregated": true` with pre-aggregated dataset data diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 98d02f6b..12908df3 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -210,7 +210,7 @@ Apply unless user specifies otherwise: - **Global date filter**: When data has temporal columns, add a date range filter. Most dashboards need time-based filtering. - **KPI time bounds**: Use time-bounded metrics that enable period comparison (MoM, YoY). Unbounded "all-time" totals are less actionable. - **Value formatting**: Format values based on their meaning β€” currency with symbol, percentages with %, large numbers compacted (K/M/B). -- **Chart selection**: Match cardinality to chart type. Few distinct values β†’ pie/bar with color grouping; many values β†’ table. +- **Chart selection**: Match cardinality to chart type. Few distinct values β†’ bar with color grouping (or pie if you really want a snapshot); many values β†’ table. ## Reference Files From dee9ace3ac1182cd134ecbf71c85fcce9d1a6db0 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Wed, 6 May 2026 09:38:13 -0700 Subject: [PATCH 64/66] Auto-force refresh and isolate experimental repo dir (#509) Two related changes for experimental installs: 1. Use a separate clone dir for experimental (~/.ai-dev-kit/experimental-repo) so it doesn't clobber the stable ~/.ai-dev-kit/repo. Both install.sh and install.ps1 derive REPO_DIR/$RepoDir from the channel after arg parsing, and the prompt_mcp_path / Invoke-PromptMcpPath re-derivations also honor the channel. 2. Default FORCE=true on experimental installs (unless the user explicitly passed --force or set DEVKIT_FORCE). Previously, the skills-only path only cloned when the repo dir was missing, so reruns kept stale skills. Now when FORCE=true, that path also refreshes via fetch + reset --hard (falling back to delete + clone). Stable installs are unchanged. 
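In shell terms, the refresh path in (2) boils down to roughly the following. This is a simplified sketch of the install.sh change below, not the exact installer code; the values shown are the experimental-channel defaults, and the stable path is unchanged.

```bash
# Simplified sketch of the experimental refresh logic (see the install.sh hunk below for the real code).
# FORCE defaults to true on the experimental channel unless the user explicitly overrides it.
REPO_URL="https://github.com/databricks-solutions/ai-dev-kit.git"
BRANCH="experimental"
REPO_DIR="$HOME/.ai-dev-kit/experimental-repo"   # kept separate from the stable ~/.ai-dev-kit/repo
FORCE=true

if [ -d "$REPO_DIR/.git" ] && [ "$FORCE" = true ]; then
  # Refresh in place: shallow-fetch the branch tip and hard-reset onto it,
  # falling back to a clean delete + clone if the fetch/reset fails.
  git -C "$REPO_DIR" fetch -q --depth 1 origin "$BRANCH" 2>/dev/null \
    && git -C "$REPO_DIR" reset --hard FETCH_HEAD 2>/dev/null \
    || { rm -rf "$REPO_DIR"; git clone -q --depth 1 --branch "$BRANCH" "$REPO_URL" "$REPO_DIR"; }
fi
```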
Co-authored-by: Isaac --- install.ps1 | 34 +++++++++++++++++++++++++++++++--- install.sh | 36 ++++++++++++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/install.ps1 b/install.ps1 index 780b7255..0a1094d7 100644 --- a/install.ps1 +++ b/install.ps1 @@ -69,6 +69,7 @@ $script:ScopeExplicit = $false # Track if --global was explicitly passed $script:InstallMcp = $false $script:InstallSkills = $true $script:Force = $false +$script:ForceExplicit = $false $script:Silent = $false $script:UserTools = "" $script:Tools = "" @@ -225,7 +226,7 @@ while ($i -lt $args.Count) { { $_ -in "--list-skills", "-ListSkills" } { $script:ListSkills = $true; $i++ } { $_ -in "--experimental", "-Experimental" } { $script:Channel = "experimental"; $i++ } { $_ -in "-b", "--branch", "-Branch" } { $Branch = $args[$i + 1]; $script:BranchExplicit = $true; $i += 2 } - { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $i++ } + { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $script:ForceExplicit = $true; $i++ } { $_ -in "-h", "--help", "-Help" } { Write-Host "Databricks AI Dev Kit Installer (Windows)" Write-Host "" @@ -276,9 +277,21 @@ if ($script:Channel -eq "experimental" -and -not $script:BranchExplicit) { $Branch = "experimental" } +# Experimental installs default to Force=true (always refresh the cached repo) +# unless the user explicitly passed --force. +if ($script:Channel -eq "experimental" -and -not $script:ForceExplicit) { + $script:Force = $true +} + # Set raw URL after branch resolution $RawUrl = "https://raw.githubusercontent.com/$Owner/$Repo/$Branch" +# Keep stable and experimental clones in separate directories so they don't clobber each other +if ($script:Channel -eq "experimental") { + $RepoDir = Join-Path $InstallDir "experimental-repo" + $McpEntry = Join-Path $RepoDir "databricks-mcp-server\run_server.py" +} + # ─── Interactive helpers ────────────────────────────────────── function Test-Interactive { @@ -702,7 +715,8 @@ function Invoke-PromptMcpPath { } # Update derived paths - $script:RepoDir = Join-Path $script:InstallDir "repo" + $repoSubdir = if ($script:Channel -eq "experimental") { "experimental-repo" } else { "repo" } + $script:RepoDir = Join-Path $script:InstallDir $repoSubdir $script:VenvDir = Join-Path $script:InstallDir ".venv" $script:VenvPython = Join-Path $script:VenvDir "Scripts\python.exe" $script:McpEntry = Join-Path $script:RepoDir "databricks-mcp-server\run_server.py" @@ -2121,7 +2135,21 @@ function Invoke-Main { # Setup MCP server if ($script:InstallMcp) { Install-McpServer - } elseif (-not (Test-Path $script:RepoDir)) { + } elseif (Test-Path (Join-Path $script:RepoDir ".git")) { + # Repo already exists β€” refresh it when Force is true, otherwise leave as-is + if ($script:Force) { + Write-Step "Refreshing sources" + $prevEAP = $ErrorActionPreference; $ErrorActionPreference = "Continue" + & git -C $script:RepoDir fetch -q --depth 1 origin $Branch 2>&1 | Out-Null + & git -C $script:RepoDir reset --hard FETCH_HEAD 2>&1 | Out-Null + if ($LASTEXITCODE -ne 0) { + Remove-Item -Recurse -Force $script:RepoDir -ErrorAction SilentlyContinue + & git -c advice.detachedHead=false clone -q --depth 1 --branch $Branch $RepoUrl $script:RepoDir 2>&1 | Out-Null + } + $ErrorActionPreference = $prevEAP + Write-Ok "Repository refreshed ($Branch)" + } + } else { Write-Step "Downloading sources" if (-not (Test-Path $script:InstallDir)) { New-Item -ItemType Directory -Path $script:InstallDir -Force | Out-Null diff --git a/install.sh b/install.sh 
index 9fbb6806..1a32231a 100644 --- a/install.sh +++ b/install.sh @@ -46,6 +46,8 @@ PROFILE="${DEVKIT_PROFILE:-DEFAULT}" SCOPE="${DEVKIT_SCOPE:-project}" SCOPE_EXPLICIT=false # Track if --global was explicitly passed FORCE="${DEVKIT_FORCE:-false}" +FORCE_EXPLICIT=false +[ -n "${DEVKIT_FORCE:-}" ] && FORCE_EXPLICIT=true IS_UPDATE=false SILENT="${DEVKIT_SILENT:-false}" TOOLS="${DEVKIT_TOOLS:-}" @@ -149,7 +151,7 @@ while [ $# -gt 0 ]; do --mcp) INSTALL_MCP=true; shift ;; --tools) USER_TOOLS="$2"; shift 2 ;; --experimental) CHANNEL="experimental"; shift ;; - -f|--force) FORCE=true; shift ;; + -f|--force) FORCE=true; FORCE_EXPLICIT=true; shift ;; -h|--help) echo "Databricks AI Dev Kit Installer" echo "" @@ -267,11 +269,22 @@ if [ "$CHANNEL" = "experimental" ] && [ "$BRANCH_EXPLICIT" != true ]; then BRANCH="experimental" fi +# Experimental installs default to FORCE=true (always refresh the cached repo) +# unless the user explicitly set DEVKIT_FORCE or passed --force. +if [ "$CHANNEL" = "experimental" ] && [ "$FORCE_EXPLICIT" != true ]; then + FORCE=true +fi + # Set configuration URLs after parsing branch argument REPO_URL="https://github.com/databricks-solutions/ai-dev-kit.git" RAW_URL="https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/${BRANCH}" INSTALL_DIR="${AIDEVKIT_HOME:-$HOME/.ai-dev-kit}" -REPO_DIR="$INSTALL_DIR/repo" +# Keep stable and experimental clones in separate directories so they don't clobber each other +if [ "$CHANNEL" = "experimental" ]; then + REPO_DIR="$INSTALL_DIR/experimental-repo" +else + REPO_DIR="$INSTALL_DIR/repo" +fi VENV_DIR="$INSTALL_DIR/.venv" VENV_PYTHON="$VENV_DIR/bin/python" MCP_ENTRY="$REPO_DIR/databricks-mcp-server/run_server.py" @@ -861,7 +874,11 @@ prompt_mcp_path() { fi # Update derived paths - REPO_DIR="$INSTALL_DIR/repo" + if [ "$CHANNEL" = "experimental" ]; then + REPO_DIR="$INSTALL_DIR/experimental-repo" + else + REPO_DIR="$INSTALL_DIR/repo" + fi VENV_DIR="$INSTALL_DIR/.venv" VENV_PYTHON="$VENV_DIR/bin/python" MCP_ENTRY="$REPO_DIR/databricks-mcp-server/run_server.py" @@ -2164,7 +2181,18 @@ main() { # Setup MCP server if [ "$INSTALL_MCP" = true ]; then setup_mcp - elif [ ! -d "$REPO_DIR" ]; then + elif [ -d "$REPO_DIR/.git" ]; then + # Repo already exists β€” refresh it when FORCE is true, otherwise leave as-is + if [ "$FORCE" = true ]; then + step "Refreshing sources" + git -C "$REPO_DIR" fetch -q --depth 1 origin "$BRANCH" 2>/dev/null || true + git -C "$REPO_DIR" reset --hard FETCH_HEAD 2>/dev/null || { + rm -rf "$REPO_DIR" + git -c advice.detachedHead=false clone -q --depth 1 --branch "$BRANCH" "$REPO_URL" "$REPO_DIR" + } + ok "Repository refreshed ($BRANCH)" + fi + else step "Downloading sources" mkdir -p "$INSTALL_DIR" git -c advice.detachedHead=false clone -q --depth 1 --branch "$BRANCH" "$REPO_URL" "$REPO_DIR" From 201bc5cd0a9cb149c684aa5748b32eda64044416 Mon Sep 17 00:00:00 2001 From: Dustin Vannoy <126025751+dustinvannoy-db@users.noreply.github.com> Date: Thu, 7 May 2026 07:54:44 -0700 Subject: [PATCH 65/66] Add Kiro IDE support to install.sh and install.ps1 (#511) (#518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds 'kiro' as a selectable tool target in the unified installer, mirroring the existing per-tool integration pattern (claude, cursor, copilot, codex, gemini, antigravity). Per Kiro IDE docs (kiro.dev/docs/skills, kiro.dev/docs/mcp/configuration): - Skills install to .kiro/skills//SKILL.md (project) or ~/.kiro/skills/ (global). 
Same SKILL.md schema as Claude/Cursor β€” no skill content changes required. - MCP config writes to .kiro/settings/mcp.json (project) or ~/.kiro/settings/mcp.json (global). Same {"mcpServers": ...} schema, so the existing write_mcp_json helper is reused. Detection looks for /Applications/Kiro.app or a 'kiro' command (Bash), and %LOCALAPPDATA%\Programs\Kiro\Kiro.exe or 'kiro' command (PowerShell). Co-authored-by: antonyprasad-db Co-authored-by: Antony Prasad Thevaraj <280810845+antonyprasad-db@users.noreply.github.com> --- README.md | 3 ++- install.ps1 | 31 ++++++++++++++++++++++++++++--- install.sh | 37 ++++++++++++++++++++++++++++++++----- 3 files changed, 62 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index dec82a2f..1ae57a75 100644 --- a/README.md +++ b/README.md @@ -63,11 +63,12 @@ AI-Driven Development (vibe coding) on Databricks just got a whole lot better. T - [Copilot](https://github.com/features/copilot/cli) - [Windsurf](https://windsurf.com) - [OpenCode](https://opencode.ai) + - [Kiro](https://kiro.dev) ### Install in existing project By default this will install at a project level rather than a user level. This is often a good fit, but requires you to run your client from the exact directory that was used for the install. -_Note: Project configuration files can be re-used in other projects. You find these configs under .claude, .cursor, .gemini, .codex, .github, .agents, .windsurf, .codeium, .opencode, or opencode.json_ +_Note: Project configuration files can be re-used in other projects. You find these configs under .claude, .cursor, .gemini, .codex, .github, .agents, .windsurf, .codeium, .opencode, .kiro, or opencode.json_ #### Mac / Linux diff --git a/install.ps1 b/install.ps1 index 0a1094d7..0f34f819 100644 --- a/install.ps1 +++ b/install.ps1 @@ -1,7 +1,7 @@ # # Databricks AI Dev Kit - Unified Installer (Windows) # -# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, and Windsurf. +# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, Windsurf, OpenCode, and Kiro. 
# # Usage: irm https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/install.ps1 -OutFile install.ps1 # .\install.ps1 [OPTIONS] @@ -241,7 +241,7 @@ while ($i -lt $args.Count) { Write-Host " --mcp-path PATH Path to MCP server installation" Write-Host " --mcp Install deprecated MCP server (default: no)" Write-Host " --silent Silent mode (no output except errors)" - Write-Host " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode" + Write-Host " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode,kiro" Write-Host " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" Write-Host " --skills LIST Comma-separated skill names to install (overrides profile)" Write-Host " --list-skills List available skills and profiles, then exit" @@ -604,6 +604,8 @@ function Invoke-DetectTools { $hasWindsurf = ($null -ne (Get-Command windsurf -ErrorAction SilentlyContinue)) -or (Test-Path "$env:LOCALAPPDATA\Programs\Windsurf\Windsurf.exe") $hasOpencode = $null -ne (Get-Command opencode -ErrorAction SilentlyContinue) + $hasKiro = ($null -ne (Get-Command kiro -ErrorAction SilentlyContinue)) -or + (Test-Path "$env:LOCALAPPDATA\Programs\Kiro\Kiro.exe") $claudeState = $hasClaude; $claudeHint = if ($hasClaude) { "detected" } else { "not found" } $cursorState = $hasCursor; $cursorHint = if ($hasCursor) { "detected" } else { "not found" } @@ -613,9 +615,10 @@ function Invoke-DetectTools { $antigravityState = $hasAntigravity; $antigravityHint = if ($hasAntigravity) { "detected" } else { "not found" } $windsurfState = $hasWindsurf; $windsurfHint = if ($hasWindsurf) { "detected" } else { "not found" } $opencodeState = $hasOpencode; $opencodeHint = if ($hasOpencode) { "detected" } else { "not found" } + $kiroState = $hasKiro; $kiroHint = if ($hasKiro) { "detected" } else { "not found" } # If nothing detected, default to claude - if (-not $hasClaude -and -not $hasCursor -and -not $hasCodex -and -not $hasCopilot -and -not $hasGemini -and -not $hasAntigravity -and -not $hasWindsurf -and -not $hasOpencode) { + if (-not $hasClaude -and -not $hasCursor -and -not $hasCodex -and -not $hasCopilot -and -not $hasGemini -and -not $hasAntigravity -and -not $hasWindsurf -and -not $hasOpencode -and -not $hasKiro) { $claudeState = $true $claudeHint = "default" } @@ -634,6 +637,7 @@ function Invoke-DetectTools { @{ Label = "Antigravity"; Value = "antigravity"; State = $antigravityState; Hint = $antigravityHint } @{ Label = "Windsurf"; Value = "windsurf"; State = $windsurfState; Hint = $windsurfHint } @{ Label = "OpenCode"; Value = "opencode"; State = $opencodeState; Hint = $opencodeHint } + @{ Label = "Kiro"; Value = "kiro"; State = $kiroState; Hint = $kiroHint } ) $result = Select-Checkbox -Items $items @@ -1240,6 +1244,13 @@ function Install-Skills { $dirs += Join-Path $BaseDir ".opencode\skills" } } + "kiro" { + if ($script:Scope -eq "global") { + $dirs += Join-Path $env:USERPROFILE ".kiro\skills" + } else { + $dirs += Join-Path $BaseDir ".kiro\skills" + } + } } } $dirs = $dirs | Select-Object -Unique @@ -1740,6 +1751,16 @@ function Write-McpConfigs { } Write-Ok "OpenCode MCP config" } + "kiro" { + if ($script:Scope -eq "global") { + $kiroSettings = Join-Path $env:USERPROFILE ".kiro\settings" + } else { + $kiroSettings = Join-Path $BaseDir ".kiro\settings" + } + if (-not (Test-Path $kiroSettings)) { New-Item -ItemType Directory -Path $kiroSettings -Force | Out-Null } + Write-McpJson 
(Join-Path $kiroSettings "mcp.json") + Write-Ok "Kiro MCP config" + } } } } @@ -1815,6 +1836,10 @@ function Show-Summary { Write-Msg "$step. Launch OpenCode in your project: opencode" $step++ } + if ($script:Tools -match 'kiro') { + Write-Msg "$step. Open your project in Kiro to use Databricks skills and MCP tools" + $step++ + } Write-Msg "$step. Open your project in your tool of choice" $step++ Write-Msg "$step. Try: `"List my SQL warehouses`"" diff --git a/install.sh b/install.sh index 1a32231a..e027eea8 100644 --- a/install.sh +++ b/install.sh @@ -2,7 +2,7 @@ # # Databricks AI Dev Kit - Unified Installer # -# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, and Windsurf. +# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, Windsurf, OpenCode, and Kiro. # # Usage: bash <(curl -sL https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/install.sh) [OPTIONS] # @@ -165,7 +165,7 @@ while [ $# -gt 0 ]; do echo " --mcp-only Skip skills installation" echo " --mcp-path PATH Path to MCP server installation (default: ~/.ai-dev-kit)" echo " --silent Silent mode (no output except errors)" - echo " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode" + echo " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode,kiro" echo " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" echo " --skills LIST Comma-separated skill names to install (overrides profile)" echo " --list-skills List available skills and profiles, then exit" @@ -698,6 +698,7 @@ detect_tools() { local has_antigravity=false local has_windsurf=false local has_opencode=false + local has_kiro=false command -v claude >/dev/null 2>&1 && has_claude=true { [ -d "/Applications/Cursor.app" ] || command -v cursor >/dev/null 2>&1; } && has_cursor=true @@ -707,10 +708,11 @@ detect_tools() { { [ -d "/Applications/Antigravity.app" ] || command -v antigravity >/dev/null 2>&1; } && has_antigravity=true { [ -d "/Applications/Windsurf.app" ] || command -v windsurf >/dev/null 2>&1; } && has_windsurf=true command -v opencode >/dev/null 2>&1 && has_opencode=true + { [ -d "/Applications/Kiro.app" ] || command -v kiro >/dev/null 2>&1; } && has_kiro=true # Build checkbox items: "Label|value|on_or_off|hint" - local claude_state="off" cursor_state="off" codex_state="off" copilot_state="off" gemini_state="off" antigravity_state="off" windsurf_state="off" opencode_state="off" - local claude_hint="not found" cursor_hint="not found" codex_hint="not found" copilot_hint="not found" gemini_hint="not found" antigravity_hint="not found" windsurf_hint="not found" opencode_hint="not found" + local claude_state="off" cursor_state="off" codex_state="off" copilot_state="off" gemini_state="off" antigravity_state="off" windsurf_state="off" opencode_state="off" kiro_state="off" + local claude_hint="not found" cursor_hint="not found" codex_hint="not found" copilot_hint="not found" gemini_hint="not found" antigravity_hint="not found" windsurf_hint="not found" opencode_hint="not found" kiro_hint="not found" # If previous config exists, use those selections; otherwise use auto-detection if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_TOOLS" ]; then @@ -722,6 +724,7 @@ detect_tools() { [[ " $SAVED_TOOLS " == *" antigravity "* ]] && antigravity_state="on" && 
antigravity_hint="previous" [[ " $SAVED_TOOLS " == *" windsurf "* ]] && windsurf_state="on" && windsurf_hint="previous" [[ " $SAVED_TOOLS " == *" opencode "* ]] && opencode_state="on" && opencode_hint="previous" + [[ " $SAVED_TOOLS " == *" kiro "* ]] && kiro_state="on" && kiro_hint="previous" else [ "$has_claude" = true ] && claude_state="on" && claude_hint="detected" [ "$has_cursor" = true ] && cursor_state="on" && cursor_hint="detected" @@ -731,9 +734,10 @@ detect_tools() { [ "$has_antigravity" = true ] && antigravity_state="on" && antigravity_hint="detected" [ "$has_windsurf" = true ] && windsurf_state="on" && windsurf_hint="detected" [ "$has_opencode" = true ] && opencode_state="on" && opencode_hint="detected" + [ "$has_kiro" = true ] && kiro_state="on" && kiro_hint="detected" # If nothing detected, pre-select claude as default - if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ] && [ "$has_windsurf" = false ] && [ "$has_opencode" = false ]; then + if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ] && [ "$has_windsurf" = false ] && [ "$has_opencode" = false ] && [ "$has_kiro" = false ]; then claude_state="on" claude_hint="default" fi @@ -753,6 +757,7 @@ detect_tools() { "Antigravity|antigravity|${antigravity_state}|${antigravity_hint}" \ "Windsurf|windsurf|${windsurf_state}|${windsurf_hint}" \ "OpenCode|opencode|${opencode_state}|${opencode_hint}" \ + "Kiro|kiro|${kiro_state}|${kiro_hint}" \ ) else # Silent: use detected defaults @@ -765,6 +770,7 @@ detect_tools() { [ "$has_antigravity" = true ] && tools="${tools:+$tools }antigravity" [ "$has_windsurf" = true ] && tools="${tools:+$tools }windsurf" [ "$has_opencode" = true ] && tools="${tools:+$tools }opencode" + [ "$has_kiro" = true ] && tools="${tools:+$tools }kiro" [ -z "$tools" ] && tools="claude" TOOLS="$tools" fi @@ -1340,6 +1346,13 @@ install_skills() { dirs+=("$base_dir/.opencode/skills") fi ;; + kiro) + if [ "$SCOPE" = "global" ]; then + dirs+=("$HOME/.kiro/skills") + else + dirs+=("$base_dir/.kiro/skills") + fi + ;; esac done @@ -1838,6 +1851,16 @@ write_mcp_configs() { fi ok "OpenCode MCP config" ;; + kiro) + if [ "$SCOPE" = "global" ]; then + mkdir -p "$HOME/.kiro/settings" + write_mcp_json "$HOME/.kiro/settings/mcp.json" + else + mkdir -p "$base_dir/.kiro/settings" + write_mcp_json "$base_dir/.kiro/settings/mcp.json" + fi + ok "Kiro MCP config" + ;; esac done } @@ -1894,6 +1917,10 @@ summary() { msg "${step}. Launch OpenCode in your project: ${B}opencode${N}" step=$((step + 1)) fi + if echo "$TOOLS" | grep -q kiro; then + msg "${step}. Open your project in Kiro to use Databricks skills and MCP tools" + step=$((step + 1)) + fi msg "${step}. Open your project in your tool of choice" step=$((step + 1)) msg "${step}. Try: \"List my SQL warehouses\"" From dfb9b749151bef63e4f1ee77132a44c67cdb65f1 Mon Sep 17 00:00:00 2001 From: Quentin Ambard Date: Fri, 8 May 2026 14:35:28 +0200 Subject: [PATCH 66/66] Dashboard skill: lakeview create canonical form (parent_path via --json) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI splits dashboard creation options unevenly: most fields have a flag, but parent_path has no flag and must come from --json. 
The --dataset-catalog/--dataset-schema flags conversely have no JSON equivalent (the CLI rejects them as "unknown field" inside --json). Flags and --json are NOT mutually exclusive β€” they merge β€” so the clean canonical form is everything-as-flags plus a tiny --json for parent_path: databricks lakeview create \ --display-name "X" \ --warehouse-id "WH" \ --dataset-catalog CATALOG \ --dataset-schema SCHEMA \ --serialized-dashboard "$(cat file.json)" \ --json '{"parent_path": "/Workspace/Users//path"}' Without --json, the dashboard lands at /Users//. Updated both the Quick Reference row and the Step 5 lifecycle block to this single canonical form. Comment block compressed from 9 lines to 4. Live-verified end-to-end on CLI v0.296. Co-authored-by: Isaac --- .../databricks-aibi-dashboards/SKILL.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 12908df3..f5b6da29 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -17,7 +17,7 @@ A dashboard should be showing something relevant for a human, typically some KPI | List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | | Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | | Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | -| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "Y" --dataset-catalog "catalog" --dataset-schema "schema" --serialized-dashboard "$(cat file.json)"` (always set `--dataset-catalog` and `--dataset-schema` β€” queries must use bare table names only to support install on different catalog.schema) | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "WH" --dataset-catalog CATALOG --dataset-schema SCHEMA --serialized-dashboard "$(cat file.json)" --json '{"parent_path": "/Workspace/Users//path"}'` β€” `parent_path` is JSON-only (no flag); everything else stays as flags. Queries must use bare table names. | | Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | | Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | | Delete | `databricks lakeview trash DASHBOARD_ID` | @@ -113,17 +113,17 @@ Before writing JSON, plan your dashboard: ### Step 5: Dashboard Lifecycle Once created, you can edit the file as following: ```bash -# Create a dashboard -# IMPORTANT: Use --dataset-catalog and --dataset-schema to set the catalog/schema for all queries -# Queries in the JSON MUST use bare table names only (e.g., "FROM trips"), -# NOT "FROM schema.trips" and NOT "FROM catalog.schema.trips". -# The CLI flags only provide a default β€” they do NOT override hardcoded catalog/schema in queries. +# Create a dashboard β€” canonical form. Everything is a flag EXCEPT parent_path +# (JSON-only, no flag β€” without it, dashboard lands at /Users//). +# --dataset-catalog/--dataset-schema inject `catalog`/`schema` into each saved +# dataset; queries inside dashboard.json must use bare table names. 
databricks lakeview create \ --display-name "My Dashboard" \ --warehouse-id "abc123def456" \ --dataset-catalog "my_catalog" \ --dataset-schema "my_schema" \ - --serialized-dashboard "$(cat dashboard.json)" + --serialized-dashboard "$(cat dashboard.json)" \ + --json '{"parent_path": "/Workspace/Users/me@co.com/dashboards"}' # List all dashboards databricks lakeview list
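# Hypothetical follow-up (an editorial sketch, not part of this patch): capture the new
# dashboard's ID from the create response and publish it. Assumes `jq` is installed and
# that the create call returns JSON with a `dashboard_id` field (field name assumed).
DASHBOARD_ID=$(databricks lakeview create \
  --display-name "My Dashboard" \
  --warehouse-id "abc123def456" \
  --dataset-catalog "my_catalog" \
  --dataset-schema "my_schema" \
  --serialized-dashboard "$(cat dashboard.json)" \
  --json '{"parent_path": "/Workspace/Users/me@co.com/dashboards"}' | jq -r '.dashboard_id')

# Publish the draft so viewers can see it (publish syntax as shown in the Quick Reference above)
databricks lakeview publish "$DASHBOARD_ID" --warehouse-id "abc123def456"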