diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..a47797ae --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,305 @@ +name: Release + +# Triggers when VERSION file is updated on main (typically via a version bump PR) +on: + push: + branches: [main] + paths: + - 'VERSION' + +permissions: + contents: write + pull-requests: write + +env: + SYNC_PR_BRANCH: sync-main-to-experimental + EXPERIMENTAL_BRANCH: experimental + +jobs: + sync-experimental: + name: Sync Experimental Branch + runs-on: ubuntu-latest + outputs: + synced: ${{ steps.check-sync.outputs.synced }} + pr_number: ${{ steps.find-or-create-pr.outputs.pr_number }} + pr_url: ${{ steps.find-or-create-pr.outputs.pr_url }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.GITHUB_TOKEN }} + + - name: Configure Git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + - name: Check if experimental branch exists + id: check-experimental + run: | + if git ls-remote --heads origin ${{ env.EXPERIMENTAL_BRANCH }} | grep -q ${{ env.EXPERIMENTAL_BRANCH }}; then + echo "exists=true" >> $GITHUB_OUTPUT + else + echo "exists=false" >> $GITHUB_OUTPUT + echo "::notice::Experimental branch does not exist yet. It will be created from main." + fi + + - name: Create experimental branch if missing + if: steps.check-experimental.outputs.exists == 'false' + run: | + git checkout -b ${{ env.EXPERIMENTAL_BRANCH }} + git push origin ${{ env.EXPERIMENTAL_BRANCH }} + echo "::notice::Created '${{ env.EXPERIMENTAL_BRANCH }}' branch from main" + + - name: Check if experimental is in sync with main + id: check-sync + if: steps.check-experimental.outputs.exists == 'true' + run: | + git fetch origin ${{ env.EXPERIMENTAL_BRANCH }} + + # Check if main is ahead of experimental + BEHIND_COUNT=$(git rev-list --count origin/${{ env.EXPERIMENTAL_BRANCH }}..origin/main) + + if [ "$BEHIND_COUNT" -eq 0 ]; then + echo "synced=true" >> $GITHUB_OUTPUT + echo "::notice::โœ… Experimental branch is in sync with main" + else + echo "synced=false" >> $GITHUB_OUTPUT + echo "behind_count=$BEHIND_COUNT" >> $GITHUB_OUTPUT + echo "::warning::Experimental branch is $BEHIND_COUNT commit(s) behind main" + fi + + - name: Find or create sync PR + id: find-or-create-pr + if: steps.check-sync.outputs.synced == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Check for existing open PR + EXISTING_PR=$(gh pr list --head main --base ${{ env.EXPERIMENTAL_BRANCH }} --state open --json number,url --jq '.[0]') + + if [ -n "$EXISTING_PR" ]; then + PR_NUMBER=$(echo "$EXISTING_PR" | jq -r '.number') + PR_URL=$(echo "$EXISTING_PR" | jq -r '.url') + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "pr_url=$PR_URL" >> $GITHUB_OUTPUT + echo "pr_existed=true" >> $GITHUB_OUTPUT + echo "::notice::Found existing sync PR #$PR_NUMBER" + else + # Create new PR + PR_URL=$(gh pr create \ + --title "๐Ÿ”„ Sync: merge main into experimental" \ + --body "## Auto-generated sync PR + +This PR keeps the \`experimental\` branch up to date with \`main\`. + +### Why is this needed? +The experimental branch allows users to opt-in to early access features. It must stay in sync with main to ensure experimental users get all stable fixes and features. + +### What to do? 
+- **If this PR has no conflicts**: It will be auto-merged by the release workflow +- **If this PR has conflicts**: Please resolve them manually, then the next release attempt will succeed + +--- +*This PR was automatically created by the release workflow.*" \ + --head main \ + --base ${{ env.EXPERIMENTAL_BRANCH }}) + + PR_NUMBER=$(echo "$PR_URL" | grep -oE '[0-9]+$') + echo "pr_number=$PR_NUMBER" >> $GITHUB_OUTPUT + echo "pr_url=$PR_URL" >> $GITHUB_OUTPUT + echo "pr_existed=false" >> $GITHUB_OUTPUT + echo "::notice::Created sync PR #$PR_NUMBER: $PR_URL" + fi + + - name: Check PR mergeability + id: check-mergeable + if: steps.check-sync.outputs.synced == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + # Wait a moment for GitHub to compute mergeability + sleep 5 + + # Get PR mergeable state + MERGEABLE=$(gh pr view "$PR_NUMBER" --json mergeable --jq '.mergeable') + + echo "mergeable=$MERGEABLE" >> $GITHUB_OUTPUT + + if [ "$MERGEABLE" = "MERGEABLE" ]; then + echo "::notice::โœ… Sync PR #$PR_NUMBER is mergeable (no conflicts)" + elif [ "$MERGEABLE" = "CONFLICTING" ]; then + echo "::error::โŒ Sync PR #$PR_NUMBER has merge conflicts that must be resolved manually" + else + echo "::warning::โณ Sync PR #$PR_NUMBER mergeability is unknown (state: $MERGEABLE)" + fi + + - name: Auto-merge sync PR + id: auto-merge + if: steps.check-sync.outputs.synced == 'false' && steps.check-mergeable.outputs.mergeable == 'MERGEABLE' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + echo "Auto-merging sync PR #$PR_NUMBER..." + gh pr merge "$PR_NUMBER" --merge --admin -t "Sync main into experimental (auto-merge)" + + echo "merged=true" >> $GITHUB_OUTPUT + echo "::notice::โœ… Successfully merged sync PR #$PR_NUMBER" + + - name: Fail if conflicts exist + if: steps.check-sync.outputs.synced == 'false' && steps.check-mergeable.outputs.mergeable == 'CONFLICTING' + run: | + PR_URL="${{ steps.find-or-create-pr.outputs.pr_url }}" + PR_NUMBER="${{ steps.find-or-create-pr.outputs.pr_number }}" + + echo "" + echo "โŒ RELEASE BLOCKED - Merge conflicts detected" + echo "" + echo "๐Ÿ“‹ Sync PR: $PR_URL" + echo "" + echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”" + echo " HOW TO FIX" + echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”" + echo "" + echo "Open Claude Code in the ai-dev-kit repo and paste this prompt:" + echo "" + cat << 'PROMPT' +โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” COPY BELOW THIS LINE โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” + +Merge main into experimental and resolve any conflicts. + +## Step 1: Start the merge + +Run: + git checkout experimental + git pull + git merge origin/main + +## Step 2: Understand what's in experimental (IMPORTANT - do this BEFORE resolving) + +I need you to fully understand the experimental branch before touching any conflicts. + +1. List commits only in experimental: + git log main..experimental --oneline + +2. For each commit, read the actual changes (not just the message): + git show --stat + Then read the key files if needed. + +3. 
Give me a detailed summary: + - What experimental features exist (describe each one) + - What files are experimental-only vs modified from main + - The intent/purpose of these experimental changes + +Do NOT proceed to conflict resolution until you've given me this summary. + +## Step 3: Analyze each conflict + +Run: git diff --name-only --diff-filter=U + +For each conflicted file: +1. Show me the conflict markers (the <<<<<<< ======= >>>>>>> sections) +2. Explain what MAIN is changing (likely: bugfix, stable feature, refactor) +3. Explain what EXPERIMENTAL is changing (likely: early-access feature) +4. Explain if these changes are independent or overlapping + +## Step 4: Resolve conflicts + +Apply these rules: +- **Independent changes** (different parts of file): Keep BOTH - main's updates AND experimental's features +- **Compatible changes** (e.g., main fixed a bug in code experimental also modified): Apply main's fix within experimental's version +- **Conflicting intent**: STOP and ask me. Based on your analysis from Step 2, explain the tradeoff and give me clear options to choose from. + +## Step 5: Complete the merge + +After ALL conflicts are resolved, commit with a detailed message: + + git add . + git commit -m "Merge main into experimental + + Kept from main: + - [list bugfixes and features from main] + + Preserved from experimental: + - [list experimental features preserved] + + Resolutions: + - [list any non-trivial merge decisions] + " + git push origin experimental + +## Step 6: Confirm + +Tell me when the merge is pushed so I can re-run the release workflow. + +โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” COPY ABOVE THIS LINE โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ” +PROMPT + echo "" + echo "After the merge is pushed, re-run this workflow." 
+ echo "" + echo "โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”" + + exit 1 + + create-release: + name: Create Release + runs-on: ubuntu-latest + needs: sync-experimental + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Read version + id: version + run: | + VERSION=$(cat VERSION | tr -d '[:space:]') + echo "version=$VERSION" >> $GITHUB_OUTPUT + echo "::notice::Releasing version $VERSION" + + - name: Check if tag already exists + id: check-tag + run: | + VERSION="${{ steps.version.outputs.version }}" + if git rev-parse "v$VERSION" >/dev/null 2>&1; then + echo "exists=true" >> $GITHUB_OUTPUT + echo "::warning::Tag v$VERSION already exists, skipping release creation" + else + echo "exists=false" >> $GITHUB_OUTPUT + fi + + - name: Create and push tag + if: steps.check-tag.outputs.exists == 'false' + run: | + VERSION="${{ steps.version.outputs.version }}" + git tag "v$VERSION" + git push origin "v$VERSION" + echo "::notice::Created tag v$VERSION" + + - name: Create GitHub Release + if: steps.check-tag.outputs.exists == 'false' + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + VERSION="${{ steps.version.outputs.version }}" + + gh release create "v$VERSION" \ + --title "v$VERSION" \ + --generate-notes \ + --latest + + echo "" + echo "โ•”โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•—" + echo "โ•‘ โœ… RELEASE SUCCESSFUL โ•‘" + echo "โ• โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•ฃ" + echo "โ•‘ โ•‘" + echo "โ•‘ Version: v$VERSION" + echo "โ•‘ โ•‘" + echo "โ•‘ โ€ข GitHub Release created โ•‘" + echo "โ•‘ โ€ข Experimental branch is in sync โ•‘" + echo "โ•‘ โ•‘" + echo "โ•šโ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•" diff --git a/.gitignore b/.gitignore index d7ae2881..f33db3f3 100644 --- a/.gitignore +++ b/.gitignore @@ -63,6 +63,7 @@ claude_agent_settings.json .coverage htmlcov/ .pytest_cache/ +.test-results/ # Skill test run results (detailed per-task logs with full responses) .test/skills/*/runs/ diff --git a/.test/pyproject.toml b/.test/pyproject.toml index 73461d8e..af4e838a 100644 --- a/.test/pyproject.toml +++ b/.test/pyproject.toml @@ -13,6 +13,8 @@ dependencies = [ "mlflow[databricks]>=3.10.1", "protobuf>=5.26", "python-dotenv>=1.0.0", + "claude-agent-sdk>=0.1.39", + "gepa>=0.1.0", ] [project.optional-dependencies] diff --git a/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml index 9832a09c..f1342dac 100644 --- a/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml +++ b/.test/skills/databricks-lakebase-autoscale/ground_truth.yaml @@ -6,50 +6,37 @@ metadata: test_cases: - id: lakebase_auto_create_project_001 inputs: - prompt: "How do I create a new Lakebase Autoscale project using the Python SDK?" + prompt: "How do I create a new Lakebase Autoscale project?" 
outputs: response: | - ```python - from databricks.sdk import WorkspaceClient - from databricks.sdk.service.postgres import Project, ProjectSpec - - w = WorkspaceClient() - - # Create a project (long-running operation) - operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" - ) - result = operation.wait() - print(f"Created project: {result.name}") + ```bash + databricks postgres create-project my-app \ + --json '{"spec": {"display_name": "My Application", "pg_version": "17"}}' ``` + + The CLI blocks until the long-running operation completes and returns the final resource. execution_success: true expectations: expected_facts: - - "Projects are created with w.postgres.create_project()" - - "create_project returns a long-running operation that requires .wait()" - - "ProjectSpec requires display_name and pg_version" + - "Projects are created with 'databricks postgres create-project'" + - "Project spec is passed as --json with display_name and pg_version" + - "The CLI blocks until the long-running operation completes" expected_patterns: - - pattern: "create_project" + - pattern: "databricks postgres create-project" min_count: 1 - description: "Uses create_project() method" - - pattern: "ProjectSpec" + description: "Uses CLI create-project command" + - pattern: "display_name" min_count: 1 - description: "Uses ProjectSpec for configuration" - - pattern: "\\.wait\\(\\)" + description: "Specifies project display name in spec" + - pattern: "pg_version" min_count: 1 - description: "Waits for the long-running operation to complete" - - pattern: "WorkspaceClient" + description: "Specifies Postgres version" + - pattern: "--json" min_count: 1 - description: "Uses WorkspaceClient for SDK access" + description: "Passes spec as JSON" guidelines: - - "Must show the long-running operation pattern with .wait()" - - "Must include ProjectSpec with display_name and pg_version" + - "Must use databricks postgres create-project with --json for the spec" + - "Must include display_name and pg_version in the spec" metadata: category: happy_path difficulty: easy @@ -60,44 +47,32 @@ test_cases: prompt: "How do I create a development branch in Lakebase Autoscale with an expiration?" outputs: response: | - ```python - from databricks.sdk.service.postgres import Branch, BranchSpec, Duration - - # Create a dev branch with 7-day expiration - branch = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" - ).wait() - print(f"Branch created: {branch.name}") + ```bash + databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' ``` Branches are copy-on-write snapshots of the production branch, ideal for development and testing. 
execution_success: true expectations: expected_facts: - - "Branches are created with w.postgres.create_branch()" - - "BranchSpec takes a source_branch and optional ttl for expiration" - - "Duration(seconds=604800) sets a 7-day TTL" + - "Branches are created with 'databricks postgres create-branch'" + - "Branch spec is passed as --json with source_branch and ttl in seconds" + - "TTL of 604800s sets a 7-day expiration" - "Branches use copy-on-write from the source branch" expected_patterns: - - pattern: "create_branch" - min_count: 1 - description: "Uses create_branch() method" - - pattern: "BranchSpec" - min_count: 1 - description: "Uses BranchSpec for branch configuration" - - pattern: "Duration" + - pattern: "databricks postgres create-branch" min_count: 1 - description: "Uses Duration for TTL specification" + description: "Uses CLI create-branch command" - pattern: "source_branch" min_count: 1 description: "Specifies the parent branch to fork from" + - pattern: "604800s" + min_count: 1 + description: "Sets 7-day TTL in seconds" + - pattern: "copy-on-write" + min_count: 1 + description: "Explains branch isolation model" guidelines: - "Must include TTL/expiration on the development branch" - "Must explain that branches are copy-on-write snapshots" @@ -119,13 +94,13 @@ test_cases: # Get endpoint details endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary" + name="projects/my-app/branches/production/endpoints/primary" ) host = endpoint.status.hosts.host # Generate OAuth token (scoped to endpoint) cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" + endpoint="projects/my-app/branches/production/endpoints/primary" ) # Connect using psycopg3 @@ -152,15 +127,15 @@ test_cases: - pattern: "psycopg\\.connect" min_count: 1 description: "Connects using psycopg3" - - pattern: "generate_database_credential" + - pattern: "generate.database.credential" min_count: 1 - description: "Generates OAuth token for authentication" + description: "Generates OAuth token for authentication (CLI or SDK)" - pattern: "sslmode=require" min_count: 1 description: "Requires SSL for the connection" - - pattern: "get_endpoint" + - pattern: "get.endpoint" min_count: 1 - description: "Retrieves endpoint details" + description: "Retrieves endpoint details (CLI or SDK)" guidelines: - "Must use psycopg (not psycopg2) for the connection" - "Must generate OAuth token and use it as password" @@ -182,7 +157,7 @@ test_cases: # Generate database credential (optionally scoped to an endpoint) cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" + endpoint="projects/my-app/branches/production/endpoints/primary" ) token = cred.token # Use as password in connection string # Token expires after 1 hour @@ -192,17 +167,17 @@ test_cases: execution_success: true expectations: expected_facts: - - "OAuth tokens are generated via w.postgres.generate_database_credential()" + - "OAuth tokens are generated with 'databricks postgres generate-database-credential'" - "Tokens expire after 1 hour" - "Tokens can be scoped to a specific endpoint" - "The token is used as the password in PostgreSQL connection strings" expected_patterns: - - pattern: "generate_database_credential" + - pattern: "generate.database.credential" min_count: 1 - description: "Uses generate_database_credential() method" - - pattern: "cred\\.token" + description: "Uses generate-database-credential (CLI or 
SDK)" + - pattern: "token" min_count: 1 - description: "Accesses the token from the credential response" + description: "References the token field from the credential response" guidelines: - "Must mention the 1-hour token expiration" - "Must show that the token is used as the connection password" @@ -216,50 +191,37 @@ test_cases: prompt: "How do I resize the compute capacity for a Lakebase Autoscale endpoint?" outputs: response: | - ```python - from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask - - # Update compute to autoscale between 2-8 CU - w.postgres.update_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - endpoint=Endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - autoscaling_limit_max_cu=8.0 - ) - ), - update_mask=FieldMask(field_mask=[ - "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) - ).wait() + ```bash + databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' ``` Compute Units (CU) scale automatically within the min/max range based on workload. execution_success: true expectations: expected_facts: - - "Compute is resized with w.postgres.update_endpoint()" + - "Compute is resized with 'databricks postgres update-endpoint'" - "Autoscaling is configured with min and max CU limits" - - "FieldMask specifies which fields to update" - - "The operation is long-running and requires .wait()" + - "The update mask is passed as a positional argument" + - "The CLI blocks until the update completes" expected_patterns: - - pattern: "update_endpoint" + - pattern: "databricks postgres update-endpoint" min_count: 1 - description: "Uses update_endpoint() to resize" + description: "Uses CLI update-endpoint command" - pattern: "autoscaling_limit_min_cu" min_count: 1 description: "Sets minimum autoscaling CU" - pattern: "autoscaling_limit_max_cu" min_count: 1 description: "Sets maximum autoscaling CU" - - pattern: "FieldMask" + - pattern: "spec\\.autoscaling" min_count: 1 - description: "Uses FieldMask for partial update" + description: "Positional mask specifies autoscaling fields" guidelines: - "Must include both min and max CU configuration" - - "Must use FieldMask to specify updated fields" + - "Must pass update mask as positional argument to update-endpoint" metadata: category: happy_path difficulty: medium @@ -271,33 +233,42 @@ test_cases: outputs: response: | ```bash - # Create a project - databricks postgres create-project \ - --project-id my-app \ + # Create a project (PROJECT_ID is a positional argument) + databricks postgres create-project my-app \ --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' - # List projects + # List / get projects databricks postgres list-projects - - # Get project details databricks postgres get-project projects/my-app - # Create a branch + # Create a branch (TTL in seconds; use no_expiry: true for permanent) databricks postgres create-branch projects/my-app development \ - --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' # List branches databricks postgres list-branches projects/my-app - # Delete a project + # Resize endpoint compute + databricks postgres 
update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' + + # Generate OAuth credential for connecting + databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary + + # Delete a project (irreversible) databricks postgres delete-project projects/my-app ``` execution_success: true expectations: expected_facts: - - "Lakebase Autoscale CLI commands are under databricks postgres" - - "Project creation uses --json for the spec configuration" - - "Resource names follow the pattern projects/{id}/branches/{id}/endpoints/{id}" + - "All Lakebase Autoscale CLI commands are under 'databricks postgres'" + - "The project ID is a positional argument to create-project, not a flag" + - "Spec is passed as --json for create and update commands" + - "Resource names follow the hierarchical path: projects/{id}/branches/{id}/endpoints/{id}" + - "update-endpoint takes the field mask as a positional argument before --json" expected_patterns: - pattern: "databricks postgres create-project" min_count: 1 @@ -308,10 +279,662 @@ test_cases: - pattern: "databricks postgres create-branch" min_count: 1 description: "Shows branch creation command" + - pattern: "databricks postgres.*endpoint" + min_count: 1 + description: "Shows endpoint management command" + - pattern: "generate-database-credential" + min_count: 1 + description: "Shows credential generation for connecting" guidelines: - - "Must show at least 4 different CLI subcommands" - - "Must include both project and branch management commands" + - "Must show at least 5 different CLI subcommands covering projects, branches, endpoints, and credentials" + - "Must not use --project-id flag; project ID is always a positional argument" + - "Must show the correct positional field-mask syntax for update-endpoint" metadata: category: happy_path difficulty: easy source: auto_generated + +- id: lakebase_auto_full_project_setup_007 + inputs: + prompt: | + Create a new Lakebase Autoscaling project for a retail customer support system. 
+ - Project name: `support-autoscale` + - Display name: `Support Autoscale` + - PG version 17 + - Set autoscaling compute: min 0 CU, max 2 CU with scale-to-zero enabled (5 minute suspend timeout) + - Create a `production` branch (default, protected) with a read-write endpoint + - Create a `dev` branch from production with a 7-day TTL expiration + - Generate an OAuth credential and verify connectivity to the production endpoint by running `SELECT version()` + - Create a database named `support_db` + outputs: + response: | + Here's how to set up a complete Lakebase Autoscaling project for a retail customer support system: + + ```python + from databricks.sdk import WorkspaceClient + from databricks.sdk.service.postgres import ( + Project, ProjectSpec, Branch, BranchSpec, + Endpoint, EndpointSpec, FieldMask, Duration + ) + import psycopg + + w = WorkspaceClient() + + # Step 1: Create the project (production branch + default endpoint created automatically) + project = w.postgres.create_project( + project=Project( + spec=ProjectSpec( + display_name="Support Autoscale", + pg_version="17" + ) + ), + project_id="support-autoscale" + ).wait() + print(f"Created project: {project.name}") + + # Step 2: Configure autoscaling on the production endpoint (0-2 CU, scale-to-zero 5 min) + w.postgres.update_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary", + endpoint=Endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary", + spec=EndpointSpec( + autoscaling_limit_min_cu=0, + autoscaling_limit_max_cu=2.0, + suspend_timeout_seconds=300 + ) + ), + update_mask=FieldMask(field_mask=[ + "spec.autoscaling_limit_min_cu", + "spec.autoscaling_limit_max_cu", + "spec.suspend_timeout_seconds" + ]) + ).wait() + print("Configured autoscaling: 0-2 CU with 5-minute scale-to-zero") + + # Step 3: Protect the production branch + w.postgres.update_branch( + name="projects/support-autoscale/branches/production", + branch=Branch( + name="projects/support-autoscale/branches/production", + spec=BranchSpec(is_protected=True) + ), + update_mask=FieldMask(field_mask=["spec.is_protected"]) + ).wait() + print("Production branch is now protected") + + # Step 4: Create dev branch from production with 7-day TTL + dev_branch = w.postgres.create_branch( + parent="projects/support-autoscale", + branch=Branch( + spec=BranchSpec( + source_branch="projects/support-autoscale/branches/production", + ttl=Duration(seconds=604800) # 7 days + ) + ), + branch_id="dev" + ).wait() + print(f"Created dev branch: {dev_branch.name} (expires in 7 days)") + + # Step 5: Generate OAuth credential and verify connectivity + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + conn_string = ( + f"host={host} " + f"dbname=databricks_postgres " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + with psycopg.connect(conn_string) as conn: + with conn.cursor() as cur: + cur.execute("SELECT version()") + print(f"Connected! 
PostgreSQL version: {cur.fetchone()[0]}") + + # Step 6: Create the support_db database + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("CREATE DATABASE support_db") + print("Created database: support_db") + ``` + + **Key points:** + + - The `production` branch and a primary read-write endpoint named `primary` are created automatically when you create a project. + - `update_endpoint` with `FieldMask` configures autoscaling limits (0-2 CU) and the scale-to-zero suspend timeout (300 seconds = 5 minutes). + - `update_branch` with `is_protected=True` prevents accidental deletion or reset of the production branch. + - The `dev` branch uses copy-on-write from production, so it's storage-efficient. The 7-day TTL (`Duration(seconds=604800)`) auto-deletes it after expiration. + - OAuth tokens expire after 1 hour; for production apps, implement token refresh at ~45 minutes. + - `CREATE DATABASE` is standard PostgreSQL DDL executed over the psycopg connection. The connection must have `autocommit = True` since `CREATE DATABASE` cannot run inside a transaction. + - All SDK create/update/delete operations are long-running and require `.wait()` to block until completion. + - Always use `sslmode=require` in connection strings. + execution_success: true + expectations: + expected_facts: + - "Project is created with 'databricks postgres create-project'" + - "The CLI blocks until project creation completes" + - "Production branch and a read-write endpoint named 'primary' are created automatically with the project" + - "Autoscaling is configured via 'databricks postgres update-endpoint' with positional mask" + - "Scale-to-zero is set via suspend_timeout_seconds (300 = 5 minutes)" + - "Production branch is protected using 'databricks postgres update-branch'" + - "Dev branch is created with 'databricks postgres create-branch' using 7-day TTL (604800s)" + - "OAuth tokens are generated via generate_database_credential()" + - "Connectivity is verified by running SELECT version() over psycopg" + - "CREATE DATABASE requires autocommit=True on the psycopg connection" + - "All connection strings must include sslmode=require" + expected_patterns: + - pattern: "databricks postgres create-project" + min_count: 1 + description: "Uses CLI create-project command" + - pattern: "databricks postgres update-endpoint" + min_count: 1 + description: "Uses CLI update-endpoint to configure autoscaling" + - pattern: "autoscaling_limit_min_cu" + min_count: 1 + description: "Sets minimum autoscaling CU" + - pattern: "autoscaling_limit_max_cu" + min_count: 1 + description: "Sets maximum autoscaling CU" + - pattern: "is_protected" + min_count: 1 + description: "Protects the production branch" + - pattern: "databricks postgres create-branch" + min_count: 1 + description: "Uses CLI create-branch for the dev branch" + - pattern: "604800s" + min_count: 1 + description: "Sets 7-day TTL on dev branch" + - pattern: "CREATE DATABASE support_db" + min_count: 1 + description: "Creates the support_db database" + guidelines: + - "Must create the project with display_name='Support Autoscale' and pg_version='17'" + - "Must configure autoscaling with min 0 CU and max 2.0 CU" + - "Must enable scale-to-zero with a 5-minute (300 second) timeout" + - "Must protect the production branch using update-branch" + - "Must create a dev branch from production with 7-day TTL expiration" + - "Must create a database named support_db using standard Postgres DDL" + - "Must include sslmode=require in 
connection strings" + - "Must use positional mask argument in update-endpoint and update-branch" + metadata: + category: happy_path + difficulty: hard + source: interactive_add + +- id: lakebase_auto_schema_ddl_008 + inputs: + prompt: | + Connect to the `support_db` database on the `production` branch of the `support-autoscale` Lakebase project. + Create a `support` schema and build the following tables with appropriate constraints, indexes, and foreign keys: + + **products** - Product catalog for referencing in support cases + - product_id (PK, serial), sku (unique, varchar 50), name (varchar 200), category (varchar 100), price (numeric 10,2), is_active (boolean default true), created_at (timestamptz default now) + + **customers** - Customer records + - customer_id (PK, serial), email (unique, varchar 255), first_name (varchar 100), last_name (varchar 100), membership_tier (varchar 20, check in 'Free','Pro','Enterprise'), region (varchar 50), created_at (timestamptz default now) + + **orders** - Order history tied to customers + - order_id (PK, serial), customer_id (FK to customers), order_date (timestamptz default now), total_amount (numeric 12,2), status (varchar 20, check in 'pending','processing','shipped','delivered','cancelled','returned'), shipping_address (text), created_at (timestamptz default now) + + **order_items** - Line items tying orders to products + - order_item_id (PK, serial), order_id (FK to orders ON DELETE CASCADE), product_id (FK to products), quantity (integer, check > 0), unit_price (numeric 10,2), created_at (timestamptz default now) + + Add indexes on: customers(email), orders(customer_id, order_date), order_items(order_id), order_items(product_id), orders(status). + + Verify the schema was created correctly by querying information_schema. 
+ outputs: + response: | + Here's how to connect to the Lakebase Autoscale project and build the support schema: + + ```python + import psycopg + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + # Get production endpoint for the support-autoscale project + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + # Generate OAuth credential scoped to this endpoint + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + # Connect to support_db on the production branch + conn_string = ( + f"host={host} " + f"dbname=support_db " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute("CREATE SCHEMA IF NOT EXISTS support") + + cur.execute(""" + CREATE TABLE support.products ( + product_id SERIAL PRIMARY KEY, + sku VARCHAR(50) UNIQUE NOT NULL, + name VARCHAR(200) NOT NULL, + category VARCHAR(100), + price NUMERIC(10, 2), + is_active BOOLEAN DEFAULT TRUE, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.customers ( + customer_id SERIAL PRIMARY KEY, + email VARCHAR(255) UNIQUE NOT NULL, + first_name VARCHAR(100) NOT NULL, + last_name VARCHAR(100) NOT NULL, + membership_tier VARCHAR(20) CHECK ( + membership_tier IN ('Free', 'Pro', 'Enterprise') + ), + region VARCHAR(50), + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.orders ( + order_id SERIAL PRIMARY KEY, + customer_id INTEGER NOT NULL + REFERENCES support.customers(customer_id), + order_date TIMESTAMPTZ DEFAULT NOW(), + total_amount NUMERIC(12, 2), + status VARCHAR(20) CHECK ( + status IN ('pending', 'processing', 'shipped', + 'delivered', 'cancelled', 'returned') + ), + shipping_address TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute(""" + CREATE TABLE support.order_items ( + order_item_id SERIAL PRIMARY KEY, + order_id INTEGER NOT NULL + REFERENCES support.orders(order_id) ON DELETE CASCADE, + product_id INTEGER NOT NULL + REFERENCES support.products(product_id), + quantity INTEGER CHECK (quantity > 0), + unit_price NUMERIC(10, 2), + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute("CREATE INDEX idx_customers_email ON support.customers(email)") + cur.execute("CREATE INDEX idx_orders_customer_date ON support.orders(customer_id, order_date)") + cur.execute("CREATE INDEX idx_order_items_order ON support.order_items(order_id)") + cur.execute("CREATE INDEX idx_order_items_product ON support.order_items(product_id)") + cur.execute("CREATE INDEX idx_orders_status ON support.orders(status)") + + cur.execute(""" + SELECT table_name, column_name, data_type, is_nullable + FROM information_schema.columns + WHERE table_schema = 'support' + ORDER BY table_name, ordinal_position + """) + print("Schema columns:") + for row in cur.fetchall(): + print(f" {row[0]}.{row[1]}: {row[2]} (nullable: {row[3]})") + + cur.execute(""" + SELECT indexname, tablename + FROM pg_indexes + WHERE schemaname = 'support' + ORDER BY tablename, indexname + """) + print("\nIndexes:") + for row in cur.fetchall(): + print(f" {row[0]} on {row[1]}") + ``` + + **Key points:** + + - Connect to `support_db` (not the default `databricks_postgres`) by specifying `dbname=support_db`. 
+ - `autocommit = True` is required for DDL statements. + - Foreign keys: `orders.customer_id` references `customers`; `order_items` references `orders` (ON DELETE CASCADE) and `products`. + - CHECK constraints enforce valid values for `membership_tier`, `status`, and `quantity > 0`. + - The composite index on `orders(customer_id, order_date)` optimizes queries that filter by customer and sort by date. + - Verification uses `information_schema.columns` and `pg_indexes`. + - Always use `sslmode=require` in connection strings. + execution_success: true + expectations: + expected_facts: + - "Connection uses get-endpoint and generate-database-credential to get host and OAuth token" + - "Connects to support_db via dbname=support_db in the connection string" + - "autocommit must be True for DDL statements" + - "Creates a support schema with CREATE SCHEMA IF NOT EXISTS" + - "products table has SERIAL PK, UNIQUE sku, and BOOLEAN default" + - "customers table has CHECK constraint on membership_tier" + - "orders table has FK to customers(customer_id)" + - "order_items table has FK to orders with ON DELETE CASCADE" + - "order_items table has FK to products(product_id)" + - "CHECK constraint on quantity > 0" + - "Composite index on orders(customer_id, order_date) for query optimization" + - "Verification queries information_schema.columns and pg_indexes" + - "sslmode=require is used in the connection string" + expected_patterns: + - pattern: "get.endpoint" + min_count: 1 + description: "Gets endpoint details for connection host (CLI or SDK)" + - pattern: "generate.database.credential" + min_count: 1 + description: "Generates OAuth token for authentication (CLI or SDK)" + - pattern: "support_db" + min_count: 1 + description: "Connects to the support_db database" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + - pattern: "CREATE SCHEMA" + min_count: 1 + description: "Creates the support schema" + - pattern: "CREATE TABLE support\\.products" + min_count: 1 + description: "Creates the products table in support schema" + - pattern: "CREATE TABLE support\\.customers" + min_count: 1 + description: "Creates the customers table in support schema" + - pattern: "CREATE TABLE support\\.orders" + min_count: 1 + description: "Creates the orders table in support schema" + - pattern: "CREATE TABLE support\\.order_items" + min_count: 1 + description: "Creates the order_items table in support schema" + - pattern: "ON DELETE CASCADE" + min_count: 1 + description: "Cascade delete on order_items when order is deleted" + - pattern: "REFERENCES" + min_count: 3 + description: "Foreign key constraints (orders->customers, order_items->orders, order_items->products)" + - pattern: "CHECK" + min_count: 3 + description: "CHECK constraints on membership_tier, status, and quantity" + - pattern: "CREATE INDEX" + min_count: 5 + description: "Creates all 5 requested indexes" + - pattern: "information_schema" + min_count: 1 + description: "Verifies schema using information_schema" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + guidelines: + - "Must connect to support_db, not the default databricks_postgres" + - "Must use autocommit=True for DDL execution" + - "Must create all 4 tables in the support schema: products, customers, orders, order_items" + - "Must include all specified columns with correct data types and constraints" + - "Must define foreign keys between orders->customers and order_items->orders/products" + - 
"order_items.order_id FK must include ON DELETE CASCADE" + - "Must include CHECK constraints on membership_tier, status, and quantity" + - "Must create all 5 requested indexes including the composite index" + - "Must verify the schema using information_schema queries" + - "Must use Lakebase connection pattern: get-endpoint + generate-database-credential + sslmode=require" + metadata: + category: happy_path + difficulty: hard + source: interactive_add + +- id: lakebase_auto_support_cases_ddl_009 + inputs: + prompt: | + Use uv to install psycopg if not installed (use pip as a fallback). + Connect to `support_db` on `production` of `support-autoscale` and add support case tables to the `support` schema: + + **support_cases** - Customer support tickets tied to orders + - case_id (PK, serial), customer_id (FK to customers NOT NULL), order_id (FK to orders, nullable - not all cases relate to an order), subject (varchar 300 NOT NULL), description (text), category (varchar 50, check in 'order_issue','product_defect','shipping','billing','returns','general'), priority (varchar 20, check in 'low','medium','high','critical'), status (varchar 20 default 'open', check in 'open','in_progress','waiting_customer','resolved','closed'), assigned_agent (varchar 100), created_at (timestamptz default now), updated_at (timestamptz default now), resolved_at (timestamptz nullable) + + **case_products** - Junction table linking cases to one or more products involved + - case_product_id (PK, serial), case_id (FK to support_cases ON DELETE CASCADE), product_id (FK to products), is_primary (boolean default false), created_at (timestamptz default now) + - Add unique constraint on (case_id, product_id) + + **case_notes** - Activity log / notes on each support case + - note_id (PK, serial), case_id (FK to support_cases ON DELETE CASCADE), author (varchar 100 NOT NULL), note_type (varchar 20, check in 'internal','customer_reply','agent_reply','system'), content (text NOT NULL), created_at (timestamptz default now) + + Add indexes on: support_cases(customer_id), support_cases(order_id), support_cases(status, priority), support_cases(created_at), case_products(case_id), case_products(product_id), case_notes(case_id, created_at). 
+ outputs: + response: | + Here's how to install psycopg, connect to the Lakebase Autoscale project, and create the support case tables: + + ```python + import subprocess + import sys + + try: + import psycopg + except ImportError: + try: + subprocess.check_call(["uv", "pip", "install", "psycopg[binary]>=3.0"]) + except (subprocess.CalledProcessError, FileNotFoundError): + subprocess.check_call( + [sys.executable, "-m", "pip", "install", "psycopg[binary]>=3.0"] + ) + import psycopg + + from databricks.sdk import WorkspaceClient + + w = WorkspaceClient() + + endpoint = w.postgres.get_endpoint( + name="projects/support-autoscale/branches/production/endpoints/primary" + ) + host = endpoint.status.hosts.host + + cred = w.postgres.generate_database_credential( + endpoint="projects/support-autoscale/branches/production/endpoints/primary" + ) + + conn_string = ( + f"host={host} " + f"dbname=support_db " + f"user={w.current_user.me().user_name} " + f"password={cred.token} " + f"sslmode=require" + ) + + with psycopg.connect(conn_string) as conn: + conn.autocommit = True + with conn.cursor() as cur: + cur.execute(""" + CREATE TABLE support.support_cases ( + case_id SERIAL PRIMARY KEY, + customer_id INTEGER NOT NULL + REFERENCES support.customers(customer_id), + order_id INTEGER + REFERENCES support.orders(order_id), + subject VARCHAR(300) NOT NULL, + description TEXT, + category VARCHAR(50) CHECK ( + category IN ('order_issue', 'product_defect', 'shipping', + 'billing', 'returns', 'general') + ), + priority VARCHAR(20) CHECK ( + priority IN ('low', 'medium', 'high', 'critical') + ), + status VARCHAR(20) DEFAULT 'open' CHECK ( + status IN ('open', 'in_progress', 'waiting_customer', + 'resolved', 'closed') + ), + assigned_agent VARCHAR(100), + created_at TIMESTAMPTZ DEFAULT NOW(), + updated_at TIMESTAMPTZ DEFAULT NOW(), + resolved_at TIMESTAMPTZ + ) + """) + + cur.execute(""" + CREATE TABLE support.case_products ( + case_product_id SERIAL PRIMARY KEY, + case_id INTEGER NOT NULL + REFERENCES support.support_cases(case_id) ON DELETE CASCADE, + product_id INTEGER NOT NULL + REFERENCES support.products(product_id), + is_primary BOOLEAN DEFAULT FALSE, + created_at TIMESTAMPTZ DEFAULT NOW(), + UNIQUE (case_id, product_id) + ) + """) + + cur.execute(""" + CREATE TABLE support.case_notes ( + note_id SERIAL PRIMARY KEY, + case_id INTEGER NOT NULL + REFERENCES support.support_cases(case_id) ON DELETE CASCADE, + author VARCHAR(100) NOT NULL, + note_type VARCHAR(20) CHECK ( + note_type IN ('internal', 'customer_reply', + 'agent_reply', 'system') + ), + content TEXT NOT NULL, + created_at TIMESTAMPTZ DEFAULT NOW() + ) + """) + + cur.execute("CREATE INDEX idx_cases_customer ON support.support_cases(customer_id)") + cur.execute("CREATE INDEX idx_cases_order ON support.support_cases(order_id)") + cur.execute("CREATE INDEX idx_cases_status_priority ON support.support_cases(status, priority)") + cur.execute("CREATE INDEX idx_cases_created ON support.support_cases(created_at)") + cur.execute("CREATE INDEX idx_case_products_case ON support.case_products(case_id)") + cur.execute("CREATE INDEX idx_case_products_product ON support.case_products(product_id)") + cur.execute("CREATE INDEX idx_case_notes_case_created ON support.case_notes(case_id, created_at)") + + cur.execute(""" + SELECT table_name, column_name, data_type, column_default, is_nullable + FROM information_schema.columns + WHERE table_schema = 'support' + AND table_name IN ('support_cases', 'case_products', 'case_notes') + ORDER BY table_name, ordinal_position + 
""") + print("New table columns:") + for row in cur.fetchall(): + print(f" {row[0]}.{row[1]}: {row[2]} (default: {row[3]}, nullable: {row[4]})") + + cur.execute(""" + SELECT indexname, tablename + FROM pg_indexes + WHERE schemaname = 'support' + AND tablename IN ('support_cases', 'case_products', 'case_notes') + ORDER BY tablename, indexname + """) + print("\nIndexes:") + for row in cur.fetchall(): + print(f" {row[0]} on {row[1]}") + + cur.execute(""" + SELECT constraint_name, constraint_type + FROM information_schema.table_constraints + WHERE table_schema = 'support' + AND table_name = 'case_products' + AND constraint_type = 'UNIQUE' + """) + for row in cur.fetchall(): + print(f"\nUnique constraint: {row[0]} ({row[1]})") + ``` + + **Key points:** + + - **psycopg install**: Tries `uv pip install` first for speed, falls back to `pip` if uv is unavailable. + - **Nullable FK**: `order_id` on `support_cases` is nullable because not all cases relate to an order. + - **Default status**: `status DEFAULT 'open'` with CHECK means new cases start open and can only transition to valid states. + - **Junction table**: `case_products` uses `UNIQUE (case_id, product_id)` to prevent duplicate product associations. + - **ON DELETE CASCADE**: Deleting a support case automatically removes its `case_products` and `case_notes`. + - **Composite indexes**: `idx_cases_status_priority` optimizes dashboard queries; `idx_case_notes_case_created` optimizes chronological note retrieval. + - OAuth tokens expire after 1 hour. Always use `sslmode=require`. + execution_success: true + expectations: + expected_facts: + - "psycopg is installed via uv pip install with pip as fallback" + - "Connection uses get-endpoint and generate-database-credential to get host and OAuth token" + - "Connects to support_db, not the default databricks_postgres" + - "autocommit must be True for DDL statements" + - "support_cases has nullable FK order_id (not all cases relate to an order)" + - "support_cases has customer_id FK that is NOT NULL" + - "support_cases status defaults to 'open' with CHECK constraint" + - "case_products is a junction table with UNIQUE(case_id, product_id) constraint" + - "case_products and case_notes have ON DELETE CASCADE from support_cases" + - "case_notes has CHECK constraint on note_type" + - "Composite index on support_cases(status, priority) for dashboard queries" + - "Composite index on case_notes(case_id, created_at) for chronological note retrieval" + - "Verification uses information_schema and table_constraints" + expected_patterns: + - pattern: "uv.*pip.*install.*psycopg" + min_count: 1 + description: "Installs psycopg via uv" + - pattern: "pip.*install.*psycopg" + min_count: 1 + description: "Falls back to pip for psycopg install" + - pattern: "get.endpoint" + min_count: 1 + description: "Gets endpoint details for connection host (CLI or SDK)" + - pattern: "generate.database.credential" + min_count: 1 + description: "Generates OAuth token for authentication (CLI or SDK)" + - pattern: "sslmode=require" + min_count: 1 + description: "Requires SSL for the connection" + - pattern: "CREATE TABLE support\\.support_cases" + min_count: 1 + description: "Creates the support_cases table" + - pattern: "CREATE TABLE support\\.case_products" + min_count: 1 + description: "Creates the case_products junction table" + - pattern: "CREATE TABLE support\\.case_notes" + min_count: 1 + description: "Creates the case_notes table" + - pattern: "ON DELETE CASCADE" + min_count: 2 + description: "Cascade delete on case_products and 
case_notes when case is deleted" + - pattern: "UNIQUE.*case_id.*product_id" + min_count: 1 + description: "Unique constraint preventing duplicate case-product associations" + - pattern: "REFERENCES" + min_count: 5 + description: "FK constraints (support_cases->customers, ->orders, case_products->cases, ->products, case_notes->cases)" + - pattern: "CHECK" + min_count: 4 + description: "CHECK constraints on category, priority, status, and note_type" + - pattern: "CREATE INDEX" + min_count: 7 + description: "Creates all 7 requested indexes" + - pattern: "information_schema" + min_count: 1 + description: "Verifies schema using information_schema" + - pattern: "table_constraints" + min_count: 1 + description: "Verifies unique constraint via table_constraints" + - pattern: "DEFAULT 'open'" + min_count: 1 + description: "Status defaults to open for new cases" + guidelines: + - "Must install psycopg using uv with pip fallback" + - "Must connect to support_db using Autoscale pattern (w.postgres)" + - "Must create all 3 tables: support_cases, case_products, case_notes" + - "support_cases.order_id must be nullable (not all cases relate to an order)" + - "support_cases.customer_id must be NOT NULL with FK to customers" + - "support_cases.status must default to 'open' with CHECK constraint" + - "case_products must have UNIQUE(case_id, product_id) constraint" + - "case_products and case_notes must have ON DELETE CASCADE from support_cases" + - "Must create all 7 requested indexes including composite indexes" + - "Must verify schema using information_schema and table_constraints queries" + metadata: + category: happy_path + difficulty: hard + source: interactive_add diff --git a/.test/src/skill_test/agent/executor.py b/.test/src/skill_test/agent/executor.py index 4ef4af31..96b74dfe 100644 --- a/.test/src/skill_test/agent/executor.py +++ b/.test/src/skill_test/agent/executor.py @@ -292,7 +292,9 @@ def _get_agent_env() -> dict[str, str]: file_env = settings.get("env", {}) for k, v in file_env.items(): if isinstance(v, str): - env[k] = _resolve_env_refs(v) + resolved = _resolve_env_refs(v) + if resolved: # Skip empty values so Claude Code falls back to keychain/token-cache auth + env[k] = resolved logger.info("Loaded agent env from %s (%d vars)", p, len(file_env)) except (json.JSONDecodeError, OSError) as e: logger.warning("Failed to load %s: %s", p, e) diff --git a/.test/tests/integration/test_compute.py b/.test/tests/integration/test_compute.py new file mode 100644 index 00000000..688d1536 --- /dev/null +++ b/.test/tests/integration/test_compute.py @@ -0,0 +1,245 @@ +"""Integration tests for compute.py CLI script. + +Tests actual subprocess execution of the compute CLI script. 
+""" +import json +import subprocess +import sys +from pathlib import Path + +import pytest + +# Get repo root for running scripts +_repo_root = Path(__file__).resolve().parents[3] +_compute_script = _repo_root / "databricks-skills" / "databricks-execution-compute" / "scripts" / "compute.py" + + +class TestComputeScriptHelp: + """Test compute.py help and basic CLI structure.""" + + def test_script_shows_help(self): + """Verify script has help output.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "execute-code" in result.stdout + assert "list-compute" in result.stdout + assert "manage-cluster" in result.stdout + + def test_execute_code_help(self): + """Verify execute-code subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "execute-code", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--code" in result.stdout + assert "--compute-type" in result.stdout + + def test_list_compute_help(self): + """Verify list-compute subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--resource" in result.stdout + + def test_manage_cluster_help(self): + """Verify manage-cluster subcommand help.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "manage-cluster", "--help"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=10 + ) + + assert result.returncode == 0 + assert "--action" in result.stdout + + +@pytest.mark.integration +class TestListCompute: + """Tests for list-compute command.""" + + def test_list_clusters(self): + """Should list all clusters.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "clusters"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "clusters" in output + assert isinstance(output["clusters"], list) + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_list_node_types(self): + """Should list available node types.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "node_types"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "node_types" in output + assert isinstance(output["node_types"], list) + assert len(output["node_types"]) > 0 + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_list_spark_versions(self): + """Should list available Spark versions.""" + result = subprocess.run( + [sys.executable, str(_compute_script), "list-compute", "--resource", "spark_versions"], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=60 + ) + + try: + output = json.loads(result.stdout) + assert "spark_versions" in output + assert isinstance(output["spark_versions"], list) + assert len(output["spark_versions"]) > 0 + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + +@pytest.mark.integration +class TestExecuteCode: + """Tests for execute-code 
command.""" + + def test_execute_serverless_simple(self): + """Test simple Python execution on serverless.""" + code = 'print("Hello from compute test"); dbutils.notebook.exit("success")' + + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "execute-code", + "--code", code, + "--compute-type", "serverless", + "--timeout", "180" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=300 # 5 min for cold start + ) + + try: + output = json.loads(result.stdout) + assert output.get("success", False), f"Execution failed: {output}" + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_execute_requires_code_or_file(self): + """Should return error when neither code nor file provided.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "execute-code", + "--compute-type", "serverless" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + +@pytest.mark.integration +class TestManageCluster: + """Tests for manage-cluster command (read-only operations).""" + + def test_invalid_action(self): + """Should return error for invalid action.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "invalid_action" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + # argparse will fail with invalid choice + assert result.returncode != 0 or "error" in result.stdout.lower() + + def test_get_requires_cluster_id(self): + """Should return error when cluster_id not provided for get.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "get" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") + + def test_create_requires_name(self): + """Should return error when name not provided for create.""" + result = subprocess.run( + [ + sys.executable, str(_compute_script), + "manage-cluster", + "--action", "create" + ], + capture_output=True, + text=True, + cwd=str(_repo_root), + timeout=30 + ) + + try: + output = json.loads(result.stdout) + assert output.get("success") is False + assert "error" in output + except json.JSONDecodeError: + pytest.fail(f"Invalid JSON: {result.stdout}\nStderr: {result.stderr}") diff --git a/.test/uv.lock b/.test/uv.lock index dbed6c0b..7a6113f8 100644 --- a/.test/uv.lock +++ b/.test/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.13'", @@ -11,7 +11,7 @@ resolution-markers = [ [[package]] name = "aiohappyeyeballs" version = "2.6.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/26/30/f84a107a9c4331c14b2b586036f40965c128aa4fee4dda5d3d51cb14ad54/aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558", size = 22760, upload-time = "2025-03-12T01:42:48.764Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/0f/15/5bf3b99495fb160b63f95972b81750f18f7f4e02ad051373b669d17d44f2/aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8", size = 15265, upload-time = "2025-03-12T01:42:47.083Z" }, @@ -20,7 +20,7 @@ wheels = [ [[package]] name = "aiohttp" version = "3.13.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "aiohappyeyeballs" }, { name = "aiosignal" }, @@ -140,7 +140,7 @@ wheels = [ [[package]] name = "aiosignal" version = "1.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "frozenlist" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, @@ -153,7 +153,7 @@ wheels = [ [[package]] name = "alembic" version = "1.18.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "mako" }, { name = "sqlalchemy" }, @@ -168,7 +168,7 @@ wheels = [ [[package]] name = "annotated-doc" version = "0.0.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/57/ba/046ceea27344560984e26a590f90bc7f4a75b06701f653222458922b558c/annotated_doc-0.0.4.tar.gz", hash = "sha256:fbcda96e87e9c92ad167c2e53839e57503ecfda18804ea28102353485033faa4", size = 7288, upload-time = "2025-11-10T22:07:42.062Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1e/d3/26bf1008eb3d2daa8ef4cacc7f3bfdc11818d111f7e2d0201bc6e3b49d45/annotated_doc-0.0.4-py3-none-any.whl", hash = "sha256:571ac1dc6991c450b25a9c2d84a3705e2ae7a53467b5d111c24fa8baabbed320", size = 5303, upload-time = "2025-11-10T22:07:40.673Z" }, @@ -177,7 +177,7 @@ wheels = [ [[package]] name = "annotated-types" version = "0.7.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ee/67/531ea369ba64dcff5ec9c3402f9f51bf748cec26dde048a2f973a4eea7f5/annotated_types-0.7.0.tar.gz", hash = "sha256:aff07c09a53a08bc8cfccb9c85b05f1aa9a2a6f23728d790723543408344ce89", size = 16081, upload-time = "2024-05-20T21:33:25.928Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/78/b6/6307fbef88d9b5ee7421e68d78a9f162e0da4900bc5f5793f6d3d0e34fb8/annotated_types-0.7.0-py3-none-any.whl", hash = "sha256:1f02e8b43a8fbbc3f3e0d4f0f4bfc8131bcb4eebe8849b8e5c773f3a1c582a53", size = 13643, upload-time = "2024-05-20T21:33:24.1Z" }, @@ -186,7 +186,7 @@ wheels = [ [[package]] name = "anyio" version = "4.12.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, { name = "idna" }, @@ -200,7 +200,7 @@ wheels = [ [[package]] name = "async-timeout" version = "5.0.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = 
"2024-11-06T16:41:39.6Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" }, @@ -209,7 +209,7 @@ wheels = [ [[package]] name = "attrs" version = "25.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6b/5c/685e6633917e101e5dcb62b9dd76946cbb57c26e133bae9e0cd36033c0a9/attrs-25.4.0.tar.gz", hash = "sha256:16d5969b87f0859ef33a48b35d55ac1be6e42ae49d5e853b597db70c35c57e11", size = 934251, upload-time = "2025-10-06T13:54:44.725Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3a/2a/7cc015f5b9f5db42b7d48157e23356022889fc354a2813c15934b7cb5c0e/attrs-25.4.0-py3-none-any.whl", hash = "sha256:adcf7e2a1fb3b36ac48d97835bb6d8ade15b8dcce26aba8bf1d14847b57a3373", size = 67615, upload-time = "2025-10-06T13:54:43.17Z" }, @@ -218,7 +218,7 @@ wheels = [ [[package]] name = "azure-core" version = "1.38.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "requests" }, { name = "typing-extensions" }, @@ -231,7 +231,7 @@ wheels = [ [[package]] name = "azure-storage-blob" version = "12.28.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "azure-core" }, { name = "cryptography" }, @@ -246,7 +246,7 @@ wheels = [ [[package]] name = "azure-storage-file-datalake" version = "12.23.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "azure-core" }, { name = "azure-storage-blob" }, @@ -261,7 +261,7 @@ wheels = [ [[package]] name = "backports-asyncio-runner" version = "1.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" }, @@ -270,7 +270,7 @@ wheels = [ [[package]] name = "blinker" version = "1.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/21/28/9b3f50ce0e048515135495f198351908d99540d69bfdc8c1d15b73dc55ce/blinker-1.9.0.tar.gz", hash = "sha256:b4ce2265a7abece45e7cc896e98dbebe6cead56bcf805a3d23136d145f5445bf", size = 22460, upload-time = "2024-11-08T17:25:47.436Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/10/cb/f2ad4230dc2eb1a74edf38f1a38b9b52277f75bef262d8908e60d957e13c/blinker-1.9.0-py3-none-any.whl", hash = "sha256:ba0efaa9080b619ff2f3459d1d500c57bddea4a6b424b60a91141db6fd2f08bc", size = 
8458, upload-time = "2024-11-08T17:25:46.184Z" }, @@ -279,7 +279,7 @@ wheels = [ [[package]] name = "boto3" version = "1.42.41" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "botocore" }, { name = "jmespath" }, @@ -293,7 +293,7 @@ wheels = [ [[package]] name = "botocore" version = "1.42.41" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jmespath" }, { name = "python-dateutil" }, @@ -307,7 +307,7 @@ wheels = [ [[package]] name = "cachetools" version = "6.2.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/39/91/d9ae9a66b01102a18cd16db0cf4cd54187ffe10f0865cc80071a4104fbb3/cachetools-6.2.6.tar.gz", hash = "sha256:16c33e1f276b9a9c0b49ab5782d901e3ad3de0dd6da9bf9bcd29ac5672f2f9e6", size = 32363, upload-time = "2026-01-27T20:32:59.956Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/90/45/f458fa2c388e79dd9d8b9b0c99f1d31b568f27388f2fdba7bb66bbc0c6ed/cachetools-6.2.6-py3-none-any.whl", hash = "sha256:8c9717235b3c651603fff0076db52d6acbfd1b338b8ed50256092f7ce9c85bda", size = 11668, upload-time = "2026-01-27T20:32:58.527Z" }, @@ -316,7 +316,7 @@ wheels = [ [[package]] name = "certifi" version = "2026.1.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/e0/2d/a891ca51311197f6ad14a7ef42e2399f36cf2f9bd44752b3dc4eab60fdc5/certifi-2026.1.4.tar.gz", hash = "sha256:ac726dd470482006e014ad384921ed6438c457018f4b3d204aea4281258b2120", size = 154268, upload-time = "2026-01-04T02:42:41.825Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e6/ad/3cc14f097111b4de0040c83a525973216457bbeeb63739ef1ed275c1c021/certifi-2026.1.4-py3-none-any.whl", hash = "sha256:9943707519e4add1115f44c2bc244f782c0249876bf51b6599fee1ffbedd685c", size = 152900, upload-time = "2026-01-04T02:42:40.15Z" }, @@ -325,7 +325,7 @@ wheels = [ [[package]] name = "cffi" version = "2.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pycparser", marker = "implementation_name != 'PyPy'" }, ] @@ -407,7 +407,7 @@ wheels = [ [[package]] name = "charset-normalizer" version = "3.4.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/13/69/33ddede1939fdd074bce5434295f38fae7136463422fe4fd3e0e89b98062/charset_normalizer-3.4.4.tar.gz", hash = "sha256:94537985111c35f28720e43603b8e7b43a6ecfb2ce1d3058bbe955b73404e21a", size = 129418, upload-time = "2025-10-14T04:42:32.879Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1f/b8/6d51fc1d52cbd52cd4ccedd5b5b2f0f6a11bbf6765c782298b0f3e808541/charset_normalizer-3.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e824f1492727fa856dd6eda4f7cee25f8518a12f3c4a56a74e8095695089cf6d", size = 209709, upload-time = "2025-10-14T04:40:11.385Z" }, @@ -496,7 +496,7 @@ wheels = [ [[package]] name = "claude-agent-sdk" version = "0.1.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = 
"anyio" }, { name = "mcp" }, @@ -513,7 +513,7 @@ wheels = [ [[package]] name = "click" version = "8.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] @@ -525,7 +525,7 @@ wheels = [ [[package]] name = "cloudpickle" version = "3.1.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/27/fb/576f067976d320f5f0114a8d9fa1215425441bb35627b1993e5afd8111e5/cloudpickle-3.1.2.tar.gz", hash = "sha256:7fda9eb655c9c230dab534f1983763de5835249750e85fbcef43aaa30a9a2414", size = 22330, upload-time = "2025-11-03T09:25:26.604Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228, upload-time = "2025-11-03T09:25:25.534Z" }, @@ -534,7 +534,7 @@ wheels = [ [[package]] name = "colorama" version = "0.4.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697, upload-time = "2022-10-25T02:36:22.414Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, @@ -543,12 +543,12 @@ wheels = [ [[package]] name = "contourpy" version = "1.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } wheels = [ @@ -613,14 +613,14 @@ wheels = [ [[package]] name = "contourpy" version = "1.3.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", ] dependencies = [ - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } wheels = [ @@ -700,7 +700,7 @@ wheels = [ [[package]] name = "cryptography" version = "46.0.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, { name = "typing-extensions", marker = "python_full_version < '3.11'" }, @@ -760,7 +760,7 @@ wheels = [ [[package]] name = "cycler" version = "0.12.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, @@ -769,7 +769,7 @@ wheels = [ [[package]] name = "databricks-agents" version = "1.9.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "boto3" }, { name = "botocore" }, @@ -779,8 +779,8 @@ dependencies = [ { name = "jinja2" }, { name = "litellm" }, { name = "mlflow-skinny" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "pandas" }, { name = "pydantic" }, { name = "tenacity" }, @@ -796,7 +796,7 @@ wheels = [ [[package]] name = "databricks-sdk" version = "0.84.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-auth" }, { name = "protobuf" }, @@ -817,7 +817,7 @@ openai = [ [[package]] name = "dataclasses-json" version = "0.6.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "marshmallow" }, { name = "typing-inspect" }, @@ -830,7 +830,7 @@ wheels = [ [[package]] name = "distro" version = "1.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fc/f8/98eea607f65de6527f8a2e8885fc8015d3e6f5775df186e443e0964a11c3/distro-1.9.0.tar.gz", hash = "sha256:2fa77c6fd8940f116ee1d6b94a2f90b13b5ea8d019b98bc8bafdcabcdd9bdbed", size = 60722, upload-time = "2023-12-24T09:54:32.31Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/12/b3/231ffd4ab1fc9d679809f356cebee130ac7daa00d6d6f3206dd4fd137e9e/distro-1.9.0-py3-none-any.whl", hash = "sha256:7bffd925d65168f85027d8da9af6bddab658135b840670a223589bc0c8ef02b2", size = 20277, upload-time = "2023-12-24T09:54:30.421Z" }, @@ -839,7 +839,7 @@ wheels = [ [[package]] name = "docker" version = "7.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pywin32", marker = "sys_platform == 'win32'" }, { name = "requests" }, @@ -853,7 +853,7 @@ wheels = [ [[package]] name = "exceptiongroup" version = "1.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] @@ -865,7 +865,7 @@ wheels = [ [[package]] name = "fastapi" version = "0.128.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "annotated-doc" }, { name = "pydantic" }, @@ -880,7 +880,7 @@ wheels = [ [[package]] name = "fastuuid" version = "0.14.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c3/7d/d9daedf0f2ebcacd20d599928f8913e9d2aea1d56d2d355a93bfa2b611d7/fastuuid-0.14.0.tar.gz", hash = "sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26", size = 18232, upload-time = "2025-10-19T22:19:22.402Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ad/b2/731a6696e37cd20eed353f69a09f37a984a43c9713764ee3f7ad5f57f7f9/fastuuid-0.14.0-cp310-cp310-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:6e6243d40f6c793c3e2ee14c13769e341b90be5ef0c23c82fa6515a96145181a", size = 516760, upload-time = "2025-10-19T22:25:21.509Z" }, @@ -943,7 +943,7 @@ wheels = [ [[package]] name = "filelock" version = "3.20.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/1d/65/ce7f1b70157833bf3cb851b556a37d4547ceafc158aa9b34b36782f23696/filelock-3.20.3.tar.gz", hash = "sha256:18c57ee915c7ec61cff0ecf7f0f869936c7c30191bb0cf406f1341778d0834e1", size = 19485, upload-time = "2026-01-09T17:55:05.421Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b5/36/7fb70f04bf00bc646cd5bb45aa9eddb15e19437a28b8fb2b4a5249fac770/filelock-3.20.3-py3-none-any.whl", hash = "sha256:4b0dda527ee31078689fc205ec4f1c1bf7d56cf88b6dc9426c4f230e46c2dce1", size = 16701, upload-time = "2026-01-09T17:55:04.334Z" }, @@ -952,7 +952,7 @@ wheels = [ [[package]] name = "flask" version = "3.1.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "blinker" }, { name = "click" }, @@ -969,7 +969,7 @@ wheels = [ [[package]] name = "flask-cors" version = "6.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "flask" }, { name = "werkzeug" }, @@ -982,7 +982,7 @@ wheels = [ [[package]] name = "fonttools" version = "4.61.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = 
"https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/5b/94/8a28707adb00bed1bf22dac16ccafe60faf2ade353dcb32c3617ee917307/fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24", size = 2854799, upload-time = "2025-12-12T17:29:27.5Z" }, @@ -1039,7 +1039,7 @@ wheels = [ [[package]] name = "frozenlist" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/83/4a/557715d5047da48d54e659203b9335be7bfaafda2c3f627b7c47e0b3aaf3/frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011", size = 86230, upload-time = "2025-10-06T05:35:23.699Z" }, @@ -1160,7 +1160,7 @@ wheels = [ [[package]] name = "fsspec" version = "2026.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d5/7d/5df2650c57d47c57232af5ef4b4fdbff182070421e405e0d62c6cdbfaa87/fsspec-2026.1.0.tar.gz", hash = "sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b", size = 310496, upload-time = "2026-01-09T15:21:35.562Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/01/c9/97cc5aae1648dcb851958a3ddf73ccd7dbe5650d95203ecb4d7720b4cdbf/fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc", size = 201838, upload-time = "2026-01-09T15:21:34.041Z" }, @@ -1169,7 +1169,7 @@ wheels = [ [[package]] name = "gepa" version = "0.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f5/30/511e52916956508f56eca721260fcd524cfffd580e57782dd471be925f7e/gepa-0.1.0.tar.gz", hash = "sha256:f8b3d7918d4cdcf8593f39ef1cc757c4ba1a4e6793e3ffb622e6c0bc60a1efd9", size = 226064, upload-time = "2026-02-19T19:43:08.272Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1f/32/fe8afb3d2a6605a6bcbc8f119f0a2adae96e9e5d57ebed055490219956a8/gepa-0.1.0-py3-none-any.whl", hash = "sha256:4e3f8fe8ca20169e60518b2e9d416e8c4a579459848adffdcad12223fbf9643e", size = 191392, upload-time = "2026-02-19T19:43:07.065Z" }, @@ -1178,7 +1178,7 @@ wheels = [ [[package]] name = "gitdb" version = "4.0.12" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "smmap" }, ] @@ -1190,7 +1190,7 @@ wheels = [ [[package]] name = "gitpython" version = "3.1.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "gitdb" }, ] @@ -1202,7 +1202,7 @@ wheels = [ [[package]] name = 
"google-api-core" version = "2.29.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-auth" }, { name = "googleapis-common-protos" }, @@ -1218,7 +1218,7 @@ wheels = [ [[package]] name = "google-auth" version = "2.48.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cryptography" }, { name = "pyasn1-modules" }, @@ -1232,7 +1232,7 @@ wheels = [ [[package]] name = "google-cloud-core" version = "2.5.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-api-core" }, { name = "google-auth" }, @@ -1245,7 +1245,7 @@ wheels = [ [[package]] name = "google-cloud-storage" version = "3.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-api-core" }, { name = "google-auth" }, @@ -1262,7 +1262,7 @@ wheels = [ [[package]] name = "google-crc32c" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/95/ac/6f7bc93886a823ab545948c2dd48143027b2355ad1944c7cf852b338dc91/google_crc32c-1.8.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:0470b8c3d73b5f4e3300165498e4cf25221c7eb37f1159e221d1825b6df8a7ff", size = 31296, upload-time = "2025-12-16T00:19:07.261Z" }, @@ -1297,7 +1297,7 @@ wheels = [ [[package]] name = "google-resumable-media" version = "2.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "google-crc32c" }, ] @@ -1309,7 +1309,7 @@ wheels = [ [[package]] name = "googleapis-common-protos" version = "1.72.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -1321,7 +1321,7 @@ wheels = [ [[package]] name = "graphene" version = "3.4.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "graphql-core" }, { name = "graphql-relay" }, @@ -1336,7 +1336,7 @@ wheels = [ [[package]] name = "graphql-core" version = "3.2.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ac/9b/037a640a2983b09aed4a823f9cf1729e6d780b0671f854efa4727a7affbe/graphql_core-3.2.7.tar.gz", hash = "sha256:27b6904bdd3b43f2a0556dad5d579bdfdeab1f38e8e8788e555bdcb586a6f62c", size = 513484, upload-time = "2025-11-01T22:30:40.436Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0a/14/933037032608787fb92e365883ad6a741c235e0ff992865ec5d904a38f1e/graphql_core-3.2.7-py3-none-any.whl", hash = "sha256:17fc8f3ca4a42913d8e24d9ac9f08deddf0a0b2483076575757f6c412ead2ec0", size = 207262, upload-time = "2025-11-01T22:30:38.912Z" }, @@ -1345,7 
+1345,7 @@ wheels = [ [[package]] name = "graphql-relay" version = "3.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "graphql-core" }, ] @@ -1357,7 +1357,7 @@ wheels = [ [[package]] name = "greenlet" version = "3.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/8a/99/1cd3411c56a410994669062bd73dd58270c00cc074cac15f385a1fd91f8a/greenlet-3.3.1.tar.gz", hash = "sha256:41848f3230b58c08bb43dee542e74a2a2e34d3c59dc3076cec9151aeeedcae98", size = 184690, upload-time = "2026-01-23T15:31:02.076Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/65/5b235b40581ad75ab97dcd8b4218022ae8e3ab77c13c919f1a1dfe9171fd/greenlet-3.3.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:04bee4775f40ecefcdaa9d115ab44736cd4b9c5fba733575bfe9379419582e13", size = 273723, upload-time = "2026-01-23T15:30:37.521Z" }, @@ -1417,7 +1417,7 @@ wheels = [ [[package]] name = "gunicorn" version = "23.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "packaging" }, ] @@ -1429,7 +1429,7 @@ wheels = [ [[package]] name = "h11" version = "0.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/01/ee/02a2c011bdab74c6fb3c75474d40b3052059d95df7e73351460c8588d963/h11-0.16.0.tar.gz", hash = "sha256:4e35b956cf45792e4caa5885e69fba00bdbc6ffafbfa020300e549b208ee5ff1", size = 101250, upload-time = "2025-04-24T03:35:25.427Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/4b/29cac41a4d98d144bf5f6d33995617b185d14b22401f75ca86f384e87ff1/h11-0.16.0-py3-none-any.whl", hash = "sha256:63cf8bbe7522de3bf65932fda1d9c2772064ffb3dae62d55932da54b31cb6c86", size = 37515, upload-time = "2025-04-24T03:35:24.344Z" }, @@ -1438,7 +1438,7 @@ wheels = [ [[package]] name = "hf-xet" version = "1.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5e/6e/0f11bacf08a67f7fb5ee09740f2ca54163863b07b70d579356e9222ce5d8/hf_xet-1.2.0.tar.gz", hash = "sha256:a8c27070ca547293b6890c4bf389f713f80e8c478631432962bb7f4bc0bd7d7f", size = 506020, upload-time = "2025-10-24T19:04:32.129Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/9e/a5/85ef910a0aa034a2abcfadc360ab5ac6f6bc4e9112349bd40ca97551cff0/hf_xet-1.2.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:ceeefcd1b7aed4956ae8499e2199607765fbd1c60510752003b6cc0b8413b649", size = 2861870, upload-time = "2025-10-24T19:04:11.422Z" }, @@ -1467,7 +1467,7 @@ wheels = [ [[package]] name = "httpcore" version = "1.0.9" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "certifi" }, { name = "h11" }, @@ -1480,7 +1480,7 @@ wheels = [ [[package]] name = "httpx" version = "0.28.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "certifi" }, @@ -1495,7 +1495,7 @@ wheels = [ [[package]] name = "httpx-sse" version = "0.4.3" -source = { registry = 
"https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0f/4c/751061ffa58615a32c31b2d82e8482be8dd4a89154f003147acee90f2be9/httpx_sse-0.4.3.tar.gz", hash = "sha256:9b1ed0127459a66014aec3c56bebd93da3c1bc8bb6618c8082039a44889a755d", size = 15943, upload-time = "2025-10-10T21:48:22.271Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d2/fd/6668e5aec43ab844de6fc74927e155a3b37bf40d7c3790e49fc0406b6578/httpx_sse-0.4.3-py3-none-any.whl", hash = "sha256:0ac1c9fe3c0afad2e0ebb25a934a59f4c7823b60792691f779fad2c5568830fc", size = 8960, upload-time = "2025-10-10T21:48:21.158Z" }, @@ -1504,7 +1504,7 @@ wheels = [ [[package]] name = "huey" version = "2.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fe/29/3428d52eb8e85025e264a291641a9f9d6407cc1e51d1b630f6ac5815999a/huey-2.6.0.tar.gz", hash = "sha256:8d11f8688999d65266af1425b831f6e3773e99415027177b8734b0ffd5e251f6", size = 221068, upload-time = "2026-01-06T03:01:02.055Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1a/34/fae9ac8f1c3a552fd3f7ff652b94c78d219dedc5fce0c0a4232457760a00/huey-2.6.0-py3-none-any.whl", hash = "sha256:1b9df9d370b49c6d5721ba8a01ac9a787cf86b3bdc584e4679de27b920395c3f", size = 76951, upload-time = "2026-01-06T03:01:00.808Z" }, @@ -1513,7 +1513,7 @@ wheels = [ [[package]] name = "huggingface-hub" version = "1.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "filelock" }, { name = "fsspec" }, @@ -1534,7 +1534,7 @@ wheels = [ [[package]] name = "idna" version = "3.11" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/0703ccc57f3a7233505399edb88de3cbd678da106337b9fcde432b65ed60/idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902", size = 194582, upload-time = "2025-10-12T14:55:20.501Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0e/61/66938bbb5fc52dbdf84594873d5b51fb1f7c7794e9c0f5bd885f30bc507b/idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea", size = 71008, upload-time = "2025-10-12T14:55:18.883Z" }, @@ -1543,7 +1543,7 @@ wheels = [ [[package]] name = "importlib-metadata" version = "8.7.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "zipp" }, ] @@ -1555,7 +1555,7 @@ wheels = [ [[package]] name = "iniconfig" version = "2.3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = 
"2025-10-18T21:55:41.639Z" }, @@ -1564,7 +1564,7 @@ wheels = [ [[package]] name = "isodate" version = "0.7.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/54/4d/e940025e2ce31a8ce1202635910747e5a87cc3a6a6bb2d00973375014749/isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6", size = 29705, upload-time = "2024-10-08T23:04:11.5Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/15/aa/0aca39a37d3c7eb941ba736ede56d689e7be91cab5d9ca846bde3999eba6/isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15", size = 22320, upload-time = "2024-10-08T23:04:09.501Z" }, @@ -1573,7 +1573,7 @@ wheels = [ [[package]] name = "itsdangerous" version = "2.2.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/9c/cb/8ac0172223afbccb63986cc25049b154ecfb5e85932587206f42317be31d/itsdangerous-2.2.0.tar.gz", hash = "sha256:e0050c0b7da1eea53ffaf149c0cfbb5c6e2e2b69c4bef22c81fa6eb73e5f6173", size = 54410, upload-time = "2024-04-16T21:28:15.614Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/96/92447566d16df59b2a776c0fb82dbc4d9e07cd95062562af01e408583fc4/itsdangerous-2.2.0-py3-none-any.whl", hash = "sha256:c6242fc49e35958c8b15141343aa660db5fc54d4f13a1db01a3f5891b98700ef", size = 16234, upload-time = "2024-04-16T21:28:14.499Z" }, @@ -1582,7 +1582,7 @@ wheels = [ [[package]] name = "jinja2" version = "3.1.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -1594,7 +1594,7 @@ wheels = [ [[package]] name = "jiter" version = "0.13.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0d/5e/4ec91646aee381d01cdb9974e30882c9cd3b8c5d1079d6b5ff4af522439a/jiter-0.13.0.tar.gz", hash = "sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4", size = 164847, upload-time = "2026-02-02T12:37:56.441Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d0/5a/41da76c5ea07bec1b0472b6b2fdb1b651074d504b19374d7e130e0cdfb25/jiter-0.13.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2ffc63785fd6c7977defe49b9824ae6ce2b2e2b77ce539bdaf006c26da06342e", size = 311164, upload-time = "2026-02-02T12:35:17.688Z" }, @@ -1691,7 +1691,7 @@ wheels = [ [[package]] name = "jmespath" version = "1.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" }, @@ -1700,7 +1700,7 @@ wheels = [ [[package]] name = "joblib" version = "1.5.3" 
-source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/41/f2/d34e8b3a08a9cc79a50b2208a93dce981fe615b64d5a4d4abee421d898df/joblib-1.5.3.tar.gz", hash = "sha256:8561a3269e6801106863fd0d6d84bb737be9e7631e33aaed3fb9ce5953688da3", size = 331603, upload-time = "2025-12-15T08:41:46.427Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/7b/91/984aca2ec129e2757d1e4e3c81c3fcda9d0f85b74670a094cc443d9ee949/joblib-1.5.3-py3-none-any.whl", hash = "sha256:5fc3c5039fc5ca8c0276333a188bbd59d6b7ab37fe6632daa76bc7f9ec18e713", size = 309071, upload-time = "2025-12-15T08:41:44.973Z" }, @@ -1709,7 +1709,7 @@ wheels = [ [[package]] name = "jsonpatch" version = "1.33" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jsonpointer" }, ] @@ -1721,7 +1721,7 @@ wheels = [ [[package]] name = "jsonpointer" version = "3.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/6a/0a/eebeb1fa92507ea94016a2a790b93c2ae41a7e18778f85471dc54475ed25/jsonpointer-3.0.0.tar.gz", hash = "sha256:2b2d729f2091522d61c3b31f82e11870f60b68f43fbc705cb76bf4b832af59ef", size = 9114, upload-time = "2024-06-10T19:24:42.462Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/71/92/5e77f98553e9e75130c78900d000368476aed74276eb8ae8796f65f00918/jsonpointer-3.0.0-py2.py3-none-any.whl", hash = "sha256:13e088adc14fca8b6aa8177c044e12701e6ad4b28ff10e65f2267a90109c9942", size = 7595, upload-time = "2024-06-10T19:24:40.698Z" }, @@ -1730,7 +1730,7 @@ wheels = [ [[package]] name = "jsonschema" version = "4.26.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "attrs" }, { name = "jsonschema-specifications" }, @@ -1745,7 +1745,7 @@ wheels = [ [[package]] name = "jsonschema-specifications" version = "2025.9.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "referencing" }, ] @@ -1757,7 +1757,7 @@ wheels = [ [[package]] name = "kiwisolver" version = "1.4.9" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c6/5d/8ce64e36d4e3aac5ca96996457dcf33e34e6051492399a3f1fec5657f30b/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b", size = 124159, upload-time = "2025-08-10T21:25:35.472Z" }, @@ -1865,7 +1865,7 @@ wheels = [ [[package]] name = "langchain-core" version = "1.2.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "jsonpatch" }, { name = "langsmith" }, @@ -1884,7 +1884,7 @@ wheels = [ [[package]] name = "langchain-openai" version = "1.1.7" -source = { registry = "https://pypi.org/simple" 
} +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "langchain-core" }, { name = "openai" }, @@ -1898,7 +1898,7 @@ wheels = [ [[package]] name = "langsmith" version = "0.6.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "httpx" }, { name = "orjson", marker = "platform_python_implementation != 'PyPy'" }, @@ -1918,7 +1918,7 @@ wheels = [ [[package]] name = "litellm" version = "1.81.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "aiohttp" }, { name = "click" }, @@ -1941,7 +1941,7 @@ wheels = [ [[package]] name = "mako" version = "1.3.10" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -1953,7 +1953,7 @@ wheels = [ [[package]] name = "markupsafe" version = "3.0.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559", size = 11631, upload-time = "2025-09-27T18:36:05.558Z" }, @@ -2038,7 +2038,7 @@ wheels = [ [[package]] name = "marshmallow" version = "3.26.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "packaging" }, ] @@ -2050,15 +2050,15 @@ wheels = [ [[package]] name = "matplotlib" version = "3.10.8" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "contourpy", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "contourpy", version = "1.3.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "cycler" }, { name = "fonttools" }, { name = "kiwisolver" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "pillow" }, { 
name = "pyparsing" }, @@ -2125,7 +2125,7 @@ wheels = [ [[package]] name = "mcp" version = "1.26.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "httpx" }, @@ -2150,7 +2150,7 @@ wheels = [ [[package]] name = "mlflow" version = "3.10.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "alembic" }, { name = "cryptography" }, @@ -2163,14 +2163,14 @@ dependencies = [ { name = "matplotlib" }, { name = "mlflow-skinny" }, { name = "mlflow-tracing" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "pandas" }, { name = "pyarrow" }, - { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "skops" }, { name = "sqlalchemy" }, { name = "waitress", marker = "sys_platform == 'win32'" }, @@ -2192,7 +2192,7 @@ databricks = [ [[package]] name = "mlflow-skinny" version = "3.10.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cachetools" }, { name = "click" }, @@ -2222,7 +2222,7 @@ wheels = [ [[package]] name = "mlflow-tracing" version = "3.10.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "cachetools" }, { name = "databricks-sdk" }, @@ -2241,7 +2241,7 @@ wheels = [ [[package]] name = "multidict" version = "6.7.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions", marker = "python_full_version < '3.11'" }, ] @@ -2379,7 +2379,7 @@ wheels = [ [[package]] name = "mypy-extensions" version = "1.1.0" -source = { registry = 
"https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a2/6e/371856a3fb9d31ca8dac321cda606860fa4548858c0cc45d9d1d4ca2628b/mypy_extensions-1.1.0.tar.gz", hash = "sha256:52e68efc3284861e772bbcd66823fde5ae21fd2fdb51c62a211403730b916558", size = 6343, upload-time = "2025-04-22T14:54:24.164Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" }, @@ -2388,7 +2388,7 @@ wheels = [ [[package]] name = "numpy" version = "2.2.6" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] @@ -2453,7 +2453,7 @@ wheels = [ [[package]] name = "numpy" version = "2.4.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", @@ -2537,7 +2537,7 @@ wheels = [ [[package]] name = "openai" version = "2.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "distro" }, @@ -2556,7 +2556,7 @@ wheels = [ [[package]] name = "opentelemetry-api" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "importlib-metadata" }, { name = "typing-extensions" }, @@ -2569,7 +2569,7 @@ wheels = [ [[package]] name = "opentelemetry-proto" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -2581,7 +2581,7 @@ wheels = [ [[package]] name = "opentelemetry-sdk" version = "1.39.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "opentelemetry-api" }, { name = "opentelemetry-semantic-conventions" }, @@ -2595,7 +2595,7 @@ wheels = [ [[package]] name = "opentelemetry-semantic-conventions" version = "0.60b1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "opentelemetry-api" }, { name = "typing-extensions" }, @@ -2608,7 +2608,7 @@ wheels = [ [[package]] name = "orjson" version = "3.11.7" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/53/45/b268004f745ede84e5798b48ee12b05129d19235d0e15267aa57dcdb400b/orjson-3.11.7.tar.gz", hash = "sha256:9b1a67243945819ce55d24a30b59d6a168e86220452d2c96f4d1f093e71c0c49", size = 6144992, upload-time = "2026-02-02T15:38:49.29Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/de/1a/a373746fa6d0e116dd9e54371a7b54622c44d12296d5d0f3ad5e3ff33490/orjson-3.11.7-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:a02c833f38f36546ba65a452127633afce4cf0dd7296b753d3bb54e55e5c0174", size = 229140, upload-time = "2026-02-02T15:37:06.082Z" }, @@ 
-2689,7 +2689,7 @@ wheels = [ [[package]] name = "packaging" version = "25.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" }, @@ -2698,10 +2698,10 @@ wheels = [ [[package]] name = "pandas" version = "2.3.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "python-dateutil" }, { name = "pytz" }, { name = "tzdata" }, @@ -2760,7 +2760,7 @@ wheels = [ [[package]] name = "pillow" version = "12.1.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/fe/41/f73d92b6b883a579e79600d391f2e21cb0df767b2714ecbd2952315dfeef/pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd", size = 5304089, upload-time = "2026-01-02T09:10:24.953Z" }, @@ -2858,7 +2858,7 @@ wheels = [ [[package]] name = "pluggy" version = "1.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f9/e2/3e91f31a7d2b083fe6ef3fa267035b518369d9511ffab804f839851d2779/pluggy-1.6.0.tar.gz", hash = "sha256:7dcc130b76258d33b90f61b658791dede3486c3e6bfb003ee5c9bfb396dd22f3", size = 69412, upload-time = "2025-05-15T12:30:07.975Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, @@ -2867,7 +2867,7 @@ wheels = [ [[package]] name = "prettytable" version = "3.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "wcwidth" }, ] @@ -2879,7 +2879,7 @@ wheels = [ 
[[package]] name = "propcache" version = "0.4.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/0e/934b541323035566a9af292dba85a195f7b78179114f2c6ebb24551118a9/propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db", size = 79534, upload-time = "2025-10-08T19:46:02.083Z" }, @@ -2993,7 +2993,7 @@ wheels = [ [[package]] name = "proto-plus" version = "1.27.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "protobuf" }, ] @@ -3005,7 +3005,7 @@ wheels = [ [[package]] name = "protobuf" version = "6.33.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/ba/25/7c72c307aafc96fa87062aa6291d9f7c94836e43214d43722e86037aac02/protobuf-6.33.5.tar.gz", hash = "sha256:6ddcac2a081f8b7b9642c09406bc6a4290128fce5f471cddd165960bb9119e5c", size = 444465, upload-time = "2026-01-29T21:51:33.494Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b1/79/af92d0a8369732b027e6d6084251dd8e782c685c72da161bd4a2e00fbabb/protobuf-6.33.5-cp310-abi3-win32.whl", hash = "sha256:d71b040839446bac0f4d162e758bea99c8251161dae9d0983a3b88dee345153b", size = 425769, upload-time = "2026-01-29T21:51:21.751Z" }, @@ -3020,7 +3020,7 @@ wheels = [ [[package]] name = "pyarrow" version = "22.0.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/30/53/04a7fdc63e6056116c9ddc8b43bc28c12cdd181b85cbeadb79278475f3ae/pyarrow-22.0.0.tar.gz", hash = "sha256:3d600dc583260d845c7d8a6db540339dd883081925da2bd1c5cb808f720b3cd9", size = 1151151, upload-time = "2025-10-24T12:30:00.762Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/d9/9b/cb3f7e0a345353def531ca879053e9ef6b9f38ed91aebcf68b09ba54dec0/pyarrow-22.0.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:77718810bd3066158db1e95a63c160ad7ce08c6b0710bc656055033e39cdad88", size = 34223968, upload-time = "2025-10-24T10:03:31.21Z" }, @@ -3077,7 +3077,7 @@ wheels = [ [[package]] name = "pyasn1" version = "0.6.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fe/b6/6e630dff89739fcd427e3f72b3d905ce0acb85a45d4ec3e2678718a3487f/pyasn1-0.6.2.tar.gz", hash = "sha256:9b59a2b25ba7e4f8197db7686c09fb33e658b98339fadb826e9512629017833b", size = 146586, upload-time = "2026-01-16T18:04:18.534Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/44/b5/a96872e5184f354da9c84ae119971a0a4c221fe9b27a4d94bd43f2596727/pyasn1-0.6.2-py3-none-any.whl", hash = "sha256:1eb26d860996a18e9b6ed05e7aae0e9fc21619fcee6af91cca9bad4fbea224bf", size = 83371, upload-time = "2026-01-16T18:04:17.174Z" }, @@ -3086,7 +3086,7 @@ wheels = [ [[package]] name = "pyasn1-modules" version = "0.4.2" -source = { registry = 
"https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pyasn1" }, ] @@ -3098,7 +3098,7 @@ wheels = [ [[package]] name = "pycparser" version = "3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" }, @@ -3107,7 +3107,7 @@ wheels = [ [[package]] name = "pydantic" version = "2.12.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, @@ -3122,7 +3122,7 @@ wheels = [ [[package]] name = "pydantic-core" version = "2.41.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions" }, ] @@ -3240,7 +3240,7 @@ wheels = [ [[package]] name = "pydantic-settings" version = "2.13.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pydantic" }, { name = "python-dotenv" }, @@ -3254,7 +3254,7 @@ wheels = [ [[package]] name = "pygments" version = "2.19.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/b0/77/a5b8c569bf593b0140bde72ea885a803b82086995367bf2037de0159d924/pygments-2.19.2.tar.gz", hash = "sha256:636cb2477cec7f8952536970bc533bc43743542f70392ae026374600add5b887", size = 4968631, upload-time = "2025-06-21T13:39:12.283Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, @@ -3263,7 +3263,7 @@ wheels = [ [[package]] name = "pyjwt" version = "2.11.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5c/5a/b46fa56bf322901eee5b0454a34343cdbdae202cd421775a8ee4e42fd519/pyjwt-2.11.0.tar.gz", hash = "sha256:35f95c1f0fbe5d5ba6e43f00271c275f7a1a4db1dab27bf708073b75318ea623", size = 98019, upload-time = "2026-01-30T19:59:55.694Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/6f/01/c26ce75ba460d5cd503da9e13b21a33804d38c2165dec7b716d06b13010c/pyjwt-2.11.0-py3-none-any.whl", hash = "sha256:94a6bde30eb5c8e04fee991062b534071fd1439ef58d2adc9ccb823e7bcd0469", size = 28224, upload-time = "2026-01-30T19:59:54.539Z" }, @@ -3277,7 +3277,7 @@ crypto = [ [[package]] name = "pyparsing" version = "3.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } 
sdist = { url = "https://files.pythonhosted.org/packages/f3/91/9c6ee907786a473bf81c5f53cf703ba0957b23ab84c264080fb5a450416f/pyparsing-3.3.2.tar.gz", hash = "sha256:c777f4d763f140633dcb6d8a3eda953bf7a214dc4eff598413c070bcdc117cbc", size = 6851574, upload-time = "2026-01-21T03:57:59.36Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/10/bd/c038d7cc38edc1aa5bf91ab8068b63d4308c66c4c8bb3cbba7dfbc049f9c/pyparsing-3.3.2-py3-none-any.whl", hash = "sha256:850ba148bd908d7e2411587e247a1e4f0327839c40e2e5e6d05a007ecc69911d", size = 122781, upload-time = "2026-01-21T03:57:55.912Z" }, @@ -3286,7 +3286,7 @@ wheels = [ [[package]] name = "pytest" version = "9.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, @@ -3304,7 +3304,7 @@ wheels = [ [[package]] name = "pytest-asyncio" version = "1.3.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "backports-asyncio-runner", marker = "python_full_version < '3.11'" }, { name = "pytest" }, @@ -3318,7 +3318,7 @@ wheels = [ [[package]] name = "python-dateutil" version = "2.9.0.post0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "six" }, ] @@ -3330,7 +3330,7 @@ wheels = [ [[package]] name = "python-dotenv" version = "1.2.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, @@ -3339,7 +3339,7 @@ wheels = [ [[package]] name = "python-multipart" version = "0.0.22" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/94/01/979e98d542a70714b0cb2b6728ed0b7c46792b695e3eaec3e20711271ca3/python_multipart-0.0.22.tar.gz", hash = "sha256:7340bef99a7e0032613f56dc36027b959fd3b30a787ed62d310e951f7c3a3a58", size = 37612, upload-time = "2026-01-25T10:15:56.219Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/1b/d0/397f9626e711ff749a95d96b7af99b9c566a9bb5129b8e4c10fc4d100304/python_multipart-0.0.22-py3-none-any.whl", hash = "sha256:2b2cd894c83d21bf49d702499531c7bafd057d730c201782048f7945d82de155", size = 24579, upload-time = "2026-01-25T10:15:54.811Z" }, @@ -3348,7 +3348,7 @@ wheels = [ [[package]] name = "pytz" version = "2025.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = 
"sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, @@ -3357,7 +3357,7 @@ wheels = [ [[package]] name = "pywin32" version = "311" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } wheels = [ { url = "https://files.pythonhosted.org/packages/7b/40/44efbb0dfbd33aca6a6483191dae0716070ed99e2ecb0c53683f400a0b4f/pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3", size = 8760432, upload-time = "2025-07-14T20:13:05.9Z" }, { url = "https://files.pythonhosted.org/packages/5e/bf/360243b1e953bd254a82f12653974be395ba880e7ec23e3731d9f73921cc/pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b", size = 9590103, upload-time = "2025-07-14T20:13:07.698Z" }, @@ -3379,7 +3379,7 @@ wheels = [ [[package]] name = "pyyaml" version = "6.0.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" }, @@ -3443,7 +3443,7 @@ wheels = [ [[package]] name = "referencing" version = "0.37.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "attrs" }, { name = "rpds-py" }, @@ -3457,7 +3457,7 @@ wheels = [ [[package]] name = "regex" version = "2026.1.15" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0b/86/07d5056945f9ec4590b518171c4254a5925832eb727b56d3c38a7476f316/regex-2026.1.15.tar.gz", hash = "sha256:164759aa25575cbc0651bef59a0b18353e54300d79ace8084c818ad8ac72b7d5", size = 414811, upload-time = "2026-01-14T23:18:02.775Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ea/d2/e6ee96b7dff201a83f650241c52db8e5bd080967cb93211f57aa448dc9d6/regex-2026.1.15-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4e3dd93c8f9abe8aa4b6c652016da9a3afa190df5ad822907efe6b206c09896e", size = 488166, upload-time = "2026-01-14T23:13:46.408Z" }, @@ -3578,7 +3578,7 @@ wheels = [ [[package]] name = "requests" version = "2.32.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "certifi" }, { name = "charset-normalizer" }, @@ -3593,7 +3593,7 @@ wheels = [ [[package]] name = "requests-toolbelt" version = "1.0.0" -source = { registry = "https://pypi.org/simple" } 
+source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "requests" }, ] @@ -3605,7 +3605,7 @@ wheels = [ [[package]] name = "rpds-py" version = "0.30.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" }, @@ -3727,7 +3727,7 @@ wheels = [ [[package]] name = "rsa" version = "4.9.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "pyasn1" }, ] @@ -3739,7 +3739,7 @@ wheels = [ [[package]] name = "s3transfer" version = "0.16.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "botocore" }, ] @@ -3751,14 +3751,14 @@ wheels = [ [[package]] name = "scikit-learn" version = "1.7.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ { name = "joblib", marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, { name = "threadpoolctl", marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/98/c2/a7855e41c9d285dfe86dc50b250978105dce513d6e459ea66a6aeb0e1e0c/scikit_learn-1.7.2.tar.gz", hash = "sha256:20e9e49ecd130598f1ca38a1d85090e1a600147b9c02fa6f15d69cb53d968fda", size = 7193136, upload-time = "2025-09-09T08:21:29.075Z" } @@ -3798,7 +3798,7 @@ wheels = [ [[package]] name = "scikit-learn" version = "1.8.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", @@ -3806,8 +3806,8 @@ resolution-markers = [ ] dependencies = [ { name = "joblib", marker = "python_full_version >= '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= 
'3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "threadpoolctl", marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0e/d4/40988bf3b8e34feec1d0e6a051446b1f66225f8529b9309becaeef62b6c4/scikit_learn-1.8.0.tar.gz", hash = "sha256:9bccbb3b40e3de10351f8f5068e105d0f4083b1a65fa07b6634fbc401a6287fd", size = 7335585, upload-time = "2025-12-10T07:08:53.618Z" } @@ -3853,12 +3853,12 @@ wheels = [ [[package]] name = "scipy" version = "1.15.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version < '3.11'", ] dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" } wheels = [ @@ -3912,14 +3912,14 @@ wheels = [ [[package]] name = "scipy" version = "1.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } resolution-markers = [ "python_full_version >= '3.13'", "python_full_version == '3.12.*'", "python_full_version == '3.11.*'", ] dependencies = [ - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/56/3e/9cca699f3486ce6bc12ff46dc2031f1ec8eb9ccc9a320fdaf925f1417426/scipy-1.17.0.tar.gz", hash = "sha256:2591060c8e648d8b96439e111ac41fd8342fdeff1876be2e19dea3fe8930454e", size = 30396830, upload-time = "2026-01-10T21:34:23.009Z" } wheels = [ @@ -3988,7 +3988,7 @@ wheels = [ [[package]] name = "shellingham" version = "1.5.4" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/58/15/8b3609fd3830ef7b27b655beb4b4e9c62313a4e8da8c676e142cc210d58e/shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de", size = 10310, upload-time = "2023-10-24T04:13:40.426Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e0/f9/0595336914c5619e5f28a1fb793285925a8cd4b432c9da0a987836c7f822/shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686", size = 9755, upload-time = "2023-10-24T04:13:38.866Z" }, @@ -3997,7 +3997,7 @@ wheels = [ [[package]] name = "six" version = "1.17.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = 
"sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" }, @@ -4008,6 +4008,8 @@ name = "skill-test" version = "0.1.0" source = { editable = "." } dependencies = [ + { name = "claude-agent-sdk" }, + { name = "gepa" }, { name = "mlflow", extra = ["databricks"] }, { name = "protobuf" }, { name = "python-dotenv" }, @@ -4022,6 +4024,7 @@ all = [ { name = "claude-agent-sdk" }, { name = "databricks-sdk" }, { name = "gepa" }, + { name = "litellm" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "tiktoken" }, @@ -4035,14 +4038,18 @@ dev = [ ] optimize = [ { name = "gepa" }, + { name = "litellm" }, { name = "tiktoken" }, ] [package.metadata] requires-dist = [ + { name = "claude-agent-sdk", specifier = ">=0.1.39" }, { name = "claude-agent-sdk", marker = "extra == 'agent'", specifier = ">=0.1.39" }, { name = "databricks-sdk", marker = "extra == 'databricks'", specifier = ">=0.20.0" }, + { name = "gepa", specifier = ">=0.1.0" }, { name = "gepa", marker = "extra == 'optimize'", specifier = ">=0.1.0" }, + { name = "litellm", marker = "extra == 'optimize'", specifier = "<=1.82.6" }, { name = "mlflow", extras = ["databricks"], specifier = ">=3.10.1" }, { name = "protobuf", specifier = ">=5.26" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, @@ -4057,16 +4064,16 @@ provides-extras = ["databricks", "dev", "optimize", "agent", "all"] [[package]] name = "skops" version = "0.13.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "numpy", version = "2.4.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, { name = "packaging" }, { name = "prettytable" }, - { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "scipy", version = "1.17.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, + { name = "scikit-learn", version = "1.7.2", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version < '3.11'" }, + { name = "scikit-learn", version = "1.8.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, 
marker = "python_full_version < '3.11'" }, + { name = "scipy", version = "1.17.0", source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" }, marker = "python_full_version >= '3.11'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/b5/0c/5ec987633e077dd0076178ea6ade2d6e57780b34afea0b497fb507d7a1ed/skops-0.13.0.tar.gz", hash = "sha256:66949fd3c95cbb5c80270fbe40293c0fe1e46cb4a921860e42584dd9c20ebeb1", size = 581312, upload-time = "2025-08-06T09:48:14.916Z" } wheels = [ @@ -4076,7 +4083,7 @@ wheels = [ [[package]] name = "smmap" version = "5.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/44/cd/a040c4b3119bbe532e5b0732286f805445375489fceaec1f48306068ee3b/smmap-5.0.2.tar.gz", hash = "sha256:26ea65a03958fa0c8a1c7e8c7a58fdc77221b8910f6be2131affade476898ad5", size = 22329, upload-time = "2025-01-02T07:14:40.909Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/04/be/d09147ad1ec7934636ad912901c5fd7667e1c858e19d355237db0d0cd5e4/smmap-5.0.2-py3-none-any.whl", hash = "sha256:b30115f0def7d7531d22a0fb6502488d879e75b260a9db4d0819cfb25403af5e", size = 24303, upload-time = "2025-01-02T07:14:38.724Z" }, @@ -4085,7 +4092,7 @@ wheels = [ [[package]] name = "sniffio" version = "1.3.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/a2/87/a6771e1546d97e7e041b6ae58d80074f81b7d5121207425c964ddf5cfdbd/sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc", size = 20372, upload-time = "2024-02-25T23:20:04.057Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, @@ -4094,7 +4101,7 @@ wheels = [ [[package]] name = "sqlalchemy" version = "2.0.46" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "greenlet", marker = "platform_machine == 'AMD64' or platform_machine == 'WIN32' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'ppc64le' or platform_machine == 'win32' or platform_machine == 'x86_64'" }, { name = "typing-extensions" }, @@ -4150,7 +4157,7 @@ wheels = [ [[package]] name = "sqlparse" version = "0.5.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/90/76/437d71068094df0726366574cf3432a4ed754217b436eb7429415cf2d480/sqlparse-0.5.5.tar.gz", hash = "sha256:e20d4a9b0b8585fdf63b10d30066c7c94c5d7a7ec47c889a2d83a3caa93ff28e", size = 120815, upload-time = "2025-12-19T07:17:45.073Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/49/4b/359f28a903c13438ef59ebeee215fb25da53066db67b305c125f1c6d2a25/sqlparse-0.5.5-py3-none-any.whl", hash = "sha256:12a08b3bf3eec877c519589833aed092e2444e68240a3577e8e26148acc7b1ba", size = 46138, upload-time = "2025-12-19T07:17:46.573Z" }, @@ -4159,7 +4166,7 @@ wheels = [ [[package]] name = "sse-starlette" version = "3.3.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = 
"https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "starlette" }, @@ -4172,7 +4179,7 @@ wheels = [ [[package]] name = "starlette" version = "0.50.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "anyio" }, { name = "typing-extensions", marker = "python_full_version < '3.13'" }, @@ -4185,7 +4192,7 @@ wheels = [ [[package]] name = "tenacity" version = "9.1.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, @@ -4194,7 +4201,7 @@ wheels = [ [[package]] name = "threadpoolctl" version = "3.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/b7/4d/08c89e34946fce2aec4fbb45c9016efd5f4d7f24af8e5d93296e935631d8/threadpoolctl-3.6.0.tar.gz", hash = "sha256:8ab8b4aa3491d812b623328249fab5302a68d2d71745c8a4c719a2fcaba9f44e", size = 21274, upload-time = "2025-03-13T13:49:23.031Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl", hash = "sha256:43a0b8fd5a2928500110039e43a5eed8480b918967083ea48dc3ab9f13c4a7fb", size = 18638, upload-time = "2025-03-13T13:49:21.846Z" }, @@ -4203,7 +4210,7 @@ wheels = [ [[package]] name = "tiktoken" version = "0.12.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "regex" }, { name = "requests" }, @@ -4264,7 +4271,7 @@ wheels = [ [[package]] name = "tokenizers" version = "0.22.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "huggingface-hub" }, ] @@ -4294,7 +4301,7 @@ wheels = [ [[package]] name = "tomli" version = "2.4.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/82/30/31573e9457673ab10aa432461bee537ce6cef177667deca369efb79df071/tomli-2.4.0.tar.gz", hash = "sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c", size = 17477, upload-time = "2026-01-11T11:22:38.165Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/d9/3dc2289e1f3b32eb19b9785b6a006b28ee99acb37d1d47f78d4c10e28bf8/tomli-2.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867", size = 153663, upload-time = "2026-01-11T11:21:45.27Z" }, @@ -4348,7 +4355,7 @@ wheels = [ [[package]] name = "tqdm" version = "4.67.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" 
} dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] @@ -4360,7 +4367,7 @@ wheels = [ [[package]] name = "typer-slim" version = "0.21.1" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "click" }, { name = "typing-extensions" }, @@ -4373,7 +4380,7 @@ wheels = [ [[package]] name = "typing-extensions" version = "4.15.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/72/94/1a15dd82efb362ac84269196e94cf00f187f7ed21c242792a923cdb1c61f/typing_extensions-4.15.0.tar.gz", hash = "sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466", size = 109391, upload-time = "2025-08-25T13:49:26.313Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/18/67/36e9267722cc04a6b9f15c7f3441c2363321a3ea07da7ae0c0707beb2a9c/typing_extensions-4.15.0-py3-none-any.whl", hash = "sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548", size = 44614, upload-time = "2025-08-25T13:49:24.86Z" }, @@ -4382,7 +4389,7 @@ wheels = [ [[package]] name = "typing-inspect" version = "0.9.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "mypy-extensions" }, { name = "typing-extensions" }, @@ -4395,7 +4402,7 @@ wheels = [ [[package]] name = "typing-inspection" version = "0.4.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "typing-extensions" }, ] @@ -4407,7 +4414,7 @@ wheels = [ [[package]] name = "tzdata" version = "2025.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/5e/a7/c202b344c5ca7daf398f3b8a477eeb205cf3b6f32e7ec3a6bac0629ca975/tzdata-2025.3.tar.gz", hash = "sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7", size = 196772, upload-time = "2025-12-13T17:45:35.667Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/c7/b0/003792df09decd6849a5e39c28b513c06e84436a54440380862b5aeff25d/tzdata-2025.3-py2.py3-none-any.whl", hash = "sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1", size = 348521, upload-time = "2025-12-13T17:45:33.889Z" }, @@ -4416,7 +4423,7 @@ wheels = [ [[package]] name = "urllib3" version = "2.6.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, @@ -4425,7 +4432,7 @@ wheels = [ [[package]] name = "uuid-utils" version = "0.14.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = 
"https://files.pythonhosted.org/packages/57/7c/3a926e847516e67bc6838634f2e54e24381105b4e80f9338dc35cca0086b/uuid_utils-0.14.0.tar.gz", hash = "sha256:fc5bac21e9933ea6c590433c11aa54aaca599f690c08069e364eb13a12f670b4", size = 22072, upload-time = "2026-01-20T20:37:15.729Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a7/42/42d003f4a99ddc901eef2fd41acb3694163835e037fb6dde79ad68a72342/uuid_utils-0.14.0-cp39-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f6695c0bed8b18a904321e115afe73b34444bc8451d0ce3244a1ec3b84deb0e5", size = 601786, upload-time = "2026-01-20T20:37:09.843Z" }, @@ -4454,7 +4461,7 @@ wheels = [ [[package]] name = "uvicorn" version = "0.40.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "click" }, { name = "h11" }, @@ -4468,7 +4475,7 @@ wheels = [ [[package]] name = "waitress" version = "3.0.2" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/bf/cb/04ddb054f45faa306a230769e868c28b8065ea196891f09004ebace5b184/waitress-3.0.2.tar.gz", hash = "sha256:682aaaf2af0c44ada4abfb70ded36393f0e307f4ab9456a215ce0020baefc31f", size = 179901, upload-time = "2024-11-16T20:02:35.195Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/8d/57/a27182528c90ef38d82b636a11f606b0cbb0e17588ed205435f8affe3368/waitress-3.0.2-py3-none-any.whl", hash = "sha256:c56d67fd6e87c2ee598b76abdd4e96cfad1f24cacdea5078d382b1f9d7b5ed2e", size = 56232, upload-time = "2024-11-16T20:02:33.858Z" }, @@ -4477,7 +4484,7 @@ wheels = [ [[package]] name = "wcwidth" version = "0.5.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/c2/62/a7c072fbfefb2980a00f99ca994279cb9ecf310cb2e6b2a4d2a28fe192b3/wcwidth-0.5.3.tar.gz", hash = "sha256:53123b7af053c74e9fe2e92ac810301f6139e64379031f7124574212fb3b4091", size = 157587, upload-time = "2026-01-31T03:52:10.92Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/3c/c1/d73f12f8cdb1891334a2ccf7389eed244d3941e74d80dd220badb937f3fb/wcwidth-0.5.3-py3-none-any.whl", hash = "sha256:d584eff31cd4753e1e5ff6c12e1edfdb324c995713f75d26c29807bb84bf649e", size = 92981, upload-time = "2026-01-31T03:52:09.14Z" }, @@ -4486,7 +4493,7 @@ wheels = [ [[package]] name = "werkzeug" version = "3.1.5" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "markupsafe" }, ] @@ -4498,7 +4505,7 @@ wheels = [ [[package]] name = "whenever" version = "0.7.3" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "tzdata", marker = "sys_platform == 'win32'" }, ] @@ -4565,7 +4572,7 @@ wheels = [ [[package]] name = "xxhash" version = "3.6.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/34/ee/f9f1d656ad168681bb0f6b092372c1e533c4416b8069b1896a175c46e484/xxhash-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87ff03d7e35c61435976554477a7f4cd1704c3596a89a8300d5ce7fc83874a71", size = 32845, upload-time = "2025-10-02T14:33:51.573Z" }, @@ -4683,7 +4690,7 @@ wheels = [ [[package]] name = "yarl" version = "1.22.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } dependencies = [ { name = "idna" }, { name = "multidict" }, @@ -4809,7 +4816,7 @@ wheels = [ [[package]] name = "zipp" version = "3.23.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" }, @@ -4818,7 +4825,7 @@ wheels = [ [[package]] name = "zstandard" version = "0.25.0" -source = { registry = "https://pypi.org/simple" } +source = { registry = "https://pypi-proxy.dev.databricks.com/simple/" } sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/56/7a/28efd1d371f1acd037ac64ed1c5e2b41514a6cc937dd6ab6a13ab9f0702f/zstandard-0.25.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e59fdc271772f6686e01e1b3b74537259800f57e24280be3f29c8a0deb1904dd", size = 795256, upload-time = "2025-09-14T22:15:56.415Z" }, diff --git a/DECOMMISSION_PLAN.md b/DECOMMISSION_PLAN.md new file mode 100644 index 00000000..93bfa905 --- /dev/null +++ b/DECOMMISSION_PLAN.md @@ -0,0 +1,270 @@ +# MCP Server Decommissioning Plan + +## Executive Summary + +This plan outlines removing `databricks-tools-core` and `databricks-mcp-server` from the main AI Dev Kit project, simplifying the installation to focus on **standalone skills only**. + +## Current State Analysis + +### What Exists Today + +| Component | Purpose | Dependencies | +|-----------|---------|--------------| +| `databricks-tools-core/` | Python library with high-level Databricks functions | None (standalone) | +| `databricks-mcp-server/` | MCP server exposing 50+ tools | Depends on databricks-tools-core | +| `databricks-skills/` | Markdown skills + self-contained Python scripts | **None** (already standalone) | +| `databricks-builder-app/` | Full-stack web application | **Depends on BOTH** tools-core and mcp-server | + +### Files Referencing MCP/Core + +**Shell scripts:** +- `install.sh` (main installer) - lines 1071, 251, 657, etc. 
+- `databricks-mcp-server/setup.sh` +- `.claude-plugin/setup.sh` +- `databricks-builder-app/scripts/deploy.sh` (lines 193-195) +- `databricks-builder-app/scripts/start_local.sh` (lines 205-206) + +**Documentation:** +- `README.md` - references both packages in "What's Included" and "Core Library" sections +- `SECURITY.md` - mentions packages in installation flow +- `CONTRIBUTING.md` - setup instructions reference mcp-server +- `databricks-builder-app/README.md` - architecture diagram includes mcp-server + +## builder-app Refactoring (Much Simpler Than Expected!) + +### Reference Implementation + +A cleaner solution exists in `industry-demo-prompts/app/src/demo_prompt_generator/backend/services/agent.py`. + +**Key insight:** MCP tools are NOT needed. Skills + standard SDK tools provide everything: + +```python +# Note: MCP tools removed - ai-dev-kit now uses CLI tools via skills +allowed_tools = ["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"] +``` + +### Current builder-app Dependencies + +| File | Import | Can Be Removed? | +|------|--------|----------------| +| `server/services/agent.py` | `databricks_tools_core.auth` | Yes - use `databricks.sdk.WorkspaceClient()` directly | +| `server/services/databricks_tools.py` | `databricks_mcp_server.*` | **DELETE ENTIRE FILE** | +| `server/services/clusters.py` | `databricks_tools_core.auth` | Yes - use SDK directly | +| `server/services/warehouses.py` | `databricks_tools_core.auth` | Yes - use SDK directly | +| `server/services/user.py` | `databricks_tools_core.identity` | Yes - inline constants | +| `server/db/database.py` | `databricks_tools_core.identity` | Yes - inline constants | +| `alembic/env.py` | `databricks_tools_core.identity` | Yes - inline constants | + +### Refactoring Steps + +1. **Delete `databricks_tools.py`** (433 lines) - No longer needed +2. **Simplify `agent.py`**: + - Remove MCP server loading + - Use standard SDK tools: `["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"]` + - Add `setting_sources=["project"]` to enable skill discovery + - Copy client pooling pattern from reference implementation +3. **Replace auth imports** - Use `databricks.sdk.WorkspaceClient()` directly +4. **Inline identity constants**: + ```python + # Instead of: from databricks_tools_core.identity import PRODUCT_NAME, PRODUCT_VERSION + PRODUCT_NAME = "databricks-builder-app" + PRODUCT_VERSION = "0.1.0" + ``` +5. **Update deploy.sh** - Remove package copying steps +6. 
**Update pyproject.toml** - Remove `databricks_tools_core*` and `databricks_mcp_server*` from includes + +### Code Reduction + +| File | Before | After | +|------|--------|-------| +| `databricks_tools.py` | 433 lines | **DELETED** | +| `agent.py` | ~400 lines | ~300 lines | +| `deploy.sh` | Complex pkg copy | Simple | + +**Total: ~500+ lines removed, simpler architecture** + +### Phase 2: Simplify Main Project + +Once builder-app is self-contained: + +#### 2.1 Delete Folders +```bash +rm -rf databricks-tools-core/ +rm -rf databricks-mcp-server/ +``` + +#### 2.2 Simplify install.sh + +**Option A: Remove MCP entirely (Recommended)** + +Replace the 1790-line `install.sh` with a simplified version that: +- Only installs skills (like `install_skills.sh` does) +- Removes all MCP configuration code +- Removes the Python venv creation for MCP + +**Option B: Keep MCP as optional** + +Keep `--skills-only` as default, make MCP opt-in via `--with-mcp`: +- Default behavior = skills only +- `--with-mcp` = old behavior + +#### 2.3 Update Documentation + +**README.md changes:** +- Remove "Core Library" section +- Remove "MCP Tools Only" from table +- Remove databricks-tools-core from "What's Included" +- Update architecture diagram (remove MCP layer) + +**Files to update:** +- `README.md` +- `SECURITY.md` +- `CONTRIBUTING.md` +- `databricks-builder-app/README.md` + +#### 2.4 Update Other Files + +- `.mcp.json` - Delete or update +- `.claude-plugin/setup.sh` - Remove core/mcp references +- `pyproject.toml` (if any) - Update dependencies + +## Installation Flow Comparison + +### Current Flow (install.sh) +``` +1. Clone repo to ~/.ai-dev-kit +2. Create Python venv +3. pip install databricks-tools-core + databricks-mcp-server +4. Install skills to .claude/skills/ +5. Write MCP config to claude_desktop_config.json, etc. +``` + +### Simplified Flow (after decommissioning) +``` +1. Install skills to .claude/skills/ (directly from GitHub) +2. Done! +``` + +## Migration Guide for Users + +Users who want MCP tools after decommissioning: + +1. **Use databricks CLI directly** - Skills now guide users to use CLI commands +2. **Use databricks SDK** - Skills include Python SDK examples +3. 
**Fork the MCP server** - If they really need it, they can fork the repo at the commit before removal + +## Risks and Mitigations + +| Risk | Mitigation | +|------|------------| +| builder-app breaks | Phase 1 must complete before Phase 2 | +| Users depend on MCP | Document migration path; skills cover same functionality | +| Lost test coverage | Move relevant tests to databricks-skills/.tests/ | + +## File Deletion Summary + +**Folders to delete:** +- `databricks-tools-core/` (~20 Python files, ~15K lines) +- `databricks-mcp-server/` (~15 Python files, ~10K lines) + +**Files to heavily modify:** +- `install.sh` (reduce from 1790 lines to ~500) +- `README.md` (remove 4+ sections) +- `CONTRIBUTING.md` (remove MCP setup) +- `SECURITY.md` (update installation flow) + +**Files to delete:** +- `.mcp.json` (MCP config example) + +## Pre-requisite: Fix Skills Integration Tests + +Before proceeding with decommissioning, fix the broken integration tests in `databricks-skills/.tests/`: + +### Current Test Status + +| Test File | Unit Tests | Integration Tests | Status | +|-----------|------------|-------------------|--------| +| `test_agent_bricks_manager.py` | 5 pass | 3 skip (no workspace) | OK | +| `test_pdf_generator.py` | 13 pass | 3 fail | **NEEDS FIX** | + +### Failing Tests (test_pdf_generator.py) + +``` +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_and_upload_pdf +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_and_upload_pdf_with_folder +FAILED test_pdf_generator.py::TestPDFGenerationIntegration::test_generate_complex_pdf +``` + +**Root cause:** Test volume `ai_dev_kit.test_pdf_generation.raw_data` doesn't exist. + +### Fix Required + +Update `test_pdf_generator.py` to skip gracefully when test volume is unavailable: + +```python +@pytest.fixture(autouse=True) +def skip_if_volume_missing(self, test_config): + """Skip tests if the required volume doesn't exist.""" + error = _validate_volume_exists( + test_config["catalog"], + test_config["schema"], + test_config["volume"] + ) + if error: + pytest.skip(f"Test volume not available: {error}") +``` + +### Additional Integration Tests Needed + +For complete coverage, add integration tests for remaining skills with Python files: + +| Skill | Python File | Test Status | +|-------|-------------|-------------| +| `databricks-agent-bricks` | `mas_manager.py` | Has tests | +| `databricks-unstructured-pdf-generation` | `pdf_generator.py` | Has tests (needs fix) | +| Other skills with .py files | Various | Need tests | + +## Recommended Execution Order + +### Phase 0: Fix Skills Tests +1. [ ] **Fix broken integration tests** (test_pdf_generator.py skip when volume missing) +2. [ ] Add integration tests for remaining skills with Python files + +### Phase 1: Refactor builder-app (Much Simpler Now!) + +**Reference implementation:** `../industry-demo-prompts/app/src/demo_prompt_generator/backend/services/agent.py` + +3. [ ] Update `pyproject.toml`: + - Bump `claude-agent-sdk>=0.1.50` (from 0.1.19) + - Remove `databricks_tools_core*` and `databricks_mcp_server*` from includes +4. [ ] Delete `server/services/databricks_tools.py` entirely +5. [ ] Simplify `server/services/agent.py`: + - Remove MCP imports and loading + - Use standard tools: `["Read", "Write", "Edit", "Glob", "Grep", "Bash", "Skill"]` + - Add `setting_sources=["project"]` for skill discovery + - Adopt client pooling pattern from reference implementation +6. [ ] Replace `databricks_tools_core.auth` โ†’ `databricks.sdk.WorkspaceClient()` +7. 
[ ] Inline `PRODUCT_NAME`, `PRODUCT_VERSION` constants +8. [ ] Update `deploy.sh` - remove package copying +9. [ ] Test builder-app locally and deployed + +### Phase 2: Simplify Main Project +10. [ ] Simplify `install.sh` to skills-only (remove MCP setup) +11. [ ] Update `install.ps1` (Windows) similarly +12. [ ] Update `README.md` +13. [ ] Update `CONTRIBUTING.md` +14. [ ] Update `SECURITY.md` + +### Phase 3: Delete and Verify +15. [ ] Delete `databricks-tools-core/` +16. [ ] Delete `databricks-mcp-server/` +17. [ ] Delete `.mcp.json` +18. [ ] Delete `.claude-plugin/` (or update if needed) +19. [ ] Test full installation flow (skills-only) +20. [ ] Test builder-app deployment + +## Questions to Resolve + +1. **Should we archive MCP in a separate branch?** - For users who want to fork it +2. **What about install.ps1 (Windows)?** - Same changes needed +3. **Keep .claude-plugin/ ?** - This also references MCP diff --git a/README.md b/README.md index dec82a2f..1ae57a75 100644 --- a/README.md +++ b/README.md @@ -63,11 +63,12 @@ AI-Driven Development (vibe coding) on Databricks just got a whole lot better. T - [Copilot](https://github.com/features/copilot/cli) - [Windsurf](https://windsurf.com) - [OpenCode](https://opencode.ai) + - [Kiro](https://kiro.dev) ### Install in existing project By default this will install at a project level rather than a user level. This is often a good fit, but requires you to run your client from the exact directory that was used for the install. -_Note: Project configuration files can be re-used in other projects. You find these configs under .claude, .cursor, .gemini, .codex, .github, .agents, .windsurf, .codeium, .opencode, or opencode.json_ +_Note: Project configuration files can be re-used in other projects. You find these configs under .claude, .cursor, .gemini, .codex, .github, .agents, .windsurf, .codeium, .opencode, .kiro, or opencode.json_ #### Mac / Linux diff --git a/databricks-skills/.tests/__init__.py b/databricks-skills/.tests/__init__.py new file mode 100644 index 00000000..22366876 --- /dev/null +++ b/databricks-skills/.tests/__init__.py @@ -0,0 +1 @@ +"""databricks-skills integration tests.""" diff --git a/databricks-skills/.tests/conftest.py b/databricks-skills/.tests/conftest.py new file mode 100644 index 00000000..f5612394 --- /dev/null +++ b/databricks-skills/.tests/conftest.py @@ -0,0 +1,78 @@ +""" +Pytest fixtures for databricks-skills integration tests. + +These fixtures set up test resources in Databricks for testing the Python scripts +in databricks-skills that use databricks-tools-core functionality. + +Requires a valid Databricks connection (via env vars or ~/.databrickscfg). +""" + +import logging +import os +from pathlib import Path + +import pytest +from databricks.sdk import WorkspaceClient + +# Load .env.test file if it exists +_env_file = Path(__file__).parent.parent.parent / "databricks-tools-core" / ".env.test" +if _env_file.exists(): + from dotenv import load_dotenv + + load_dotenv(_env_file) + logging.getLogger(__name__).info(f"Loaded environment from {_env_file}") + +logger = logging.getLogger(__name__) + + +def pytest_configure(config): + """Configure pytest with custom markers.""" + config.addinivalue_line( + "markers", "integration: mark test as integration test requiring Databricks" + ) + + +@pytest.fixture(scope="session") +def workspace_client() -> WorkspaceClient: + """ + Create a WorkspaceClient for the test session. + + Uses standard Databricks authentication: + 1. DATABRICKS_HOST + DATABRICKS_TOKEN env vars + 2. 
~/.databrickscfg profile + """ + try: + client = WorkspaceClient() + # Verify connection works + client.current_user.me() + logger.info(f"Connected to Databricks: {client.config.host}") + return client + except Exception as e: + pytest.skip(f"Could not connect to Databricks: {e}") + + +@pytest.fixture(scope="session") +def warehouse_id(workspace_client: WorkspaceClient) -> str: + """ + Get a running SQL warehouse for tests. + + Prefers shared endpoints, falls back to any running warehouse. + """ + from databricks.sdk.service.sql import State + + warehouses = list(workspace_client.warehouses.list()) + + # Priority: running shared endpoint + for w in warehouses: + if w.state == State.RUNNING and "shared" in (w.name or "").lower(): + logger.info(f"Using warehouse: {w.name} ({w.id})") + return w.id + + # Fallback: any running warehouse + for w in warehouses: + if w.state == State.RUNNING: + logger.info(f"Using warehouse: {w.name} ({w.id})") + return w.id + + # No running warehouse found + pytest.skip("No running SQL warehouse available for tests") diff --git a/databricks-skills/.tests/run_tests.py b/databricks-skills/.tests/run_tests.py new file mode 100755 index 00000000..cae0da56 --- /dev/null +++ b/databricks-skills/.tests/run_tests.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python3 +""" +Test runner for databricks-skills. + +Runs unit tests (mocked, no Databricks connection required) and integration tests +(require Databricks connection). Generates HTML and terminal reports. + +Usage: + python run_tests.py # Run all tests + python run_tests.py --unit # Run only unit tests + python run_tests.py --integration # Run only integration tests + python run_tests.py -v # Verbose output + python run_tests.py --html # Generate HTML report +""" + +import argparse +import os +import subprocess +import sys +from datetime import datetime +from pathlib import Path + + +def main(): + parser = argparse.ArgumentParser( + description="Run databricks-skills tests with reports" + ) + parser.add_argument( + "--unit", + action="store_true", + help="Run only unit tests (mocked, no Databricks connection)", + ) + parser.add_argument( + "--integration", + action="store_true", + help="Run only integration tests (requires Databricks connection)", + ) + parser.add_argument( + "-v", "--verbose", + action="store_true", + help="Verbose output", + ) + parser.add_argument( + "--html", + action="store_true", + help="Generate HTML report", + ) + parser.add_argument( + "--xml", + action="store_true", + help="Generate JUnit XML report for CI", + ) + parser.add_argument( + "-k", + metavar="EXPRESSION", + help="Only run tests matching the given expression", + ) + + args = parser.parse_args() + + # Determine test directory + tests_dir = Path(__file__).parent + skills_dir = tests_dir.parent + repo_root = skills_dir.parent + + # Results directory for reports + results_dir = tests_dir / ".test-results" + results_dir.mkdir(exist_ok=True) + + # Build pytest command + pytest_args = [ + sys.executable, + "-m", "pytest", + str(tests_dir), + ] + + # Filter by test type + if args.unit and not args.integration: + # Unit tests: exclude integration marker + pytest_args.extend(["-m", "not integration"]) + elif args.integration and not args.unit: + # Integration tests only + pytest_args.extend(["-m", "integration"]) + # If both or neither specified, run all tests + + # Add verbosity + if args.verbose: + pytest_args.append("-v") + else: + pytest_args.append("-q") + + # Add expression filter + if args.k: + pytest_args.extend(["-k", args.k]) + + # Add 
HTML report + if args.html: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + html_path = results_dir / f"report_{timestamp}.html" + pytest_args.extend(["--html", str(html_path), "--self-contained-html"]) + print(f"HTML report will be saved to: {html_path}") + + # Add XML report + if args.xml: + xml_path = results_dir / "junit.xml" + pytest_args.extend(["--junitxml", str(xml_path)]) + print(f"JUnit XML report will be saved to: {xml_path}") + + # Add color output + pytest_args.append("--color=yes") + + # Show captured output on failure + pytest_args.append("-rA") + + # Set PYTHONPATH to include skills directory + env = os.environ.copy() + pythonpath = env.get("PYTHONPATH", "") + env["PYTHONPATH"] = f"{skills_dir}:{repo_root / 'databricks-tools-core'}:{pythonpath}" + + # Print test configuration + print("=" * 60) + print("databricks-skills Test Runner") + print("=" * 60) + print(f"Tests directory: {tests_dir}") + print(f"Results directory: {results_dir}") + test_type = "all" + if args.unit and not args.integration: + test_type = "unit only" + elif args.integration and not args.unit: + test_type = "integration only" + print(f"Test type: {test_type}") + print("=" * 60) + print() + + # Run pytest + result = subprocess.run(pytest_args, env=env) + + # Print summary + print() + print("=" * 60) + if result.returncode == 0: + print("All tests PASSED") + else: + print(f"Tests FAILED (exit code: {result.returncode})") + + if args.html: + print(f"HTML report: {html_path}") + if args.xml: + print(f"JUnit XML: {xml_path}") + print("=" * 60) + + return result.returncode + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/databricks-skills/.tests/test_agent_bricks_manager.py b/databricks-skills/.tests/test_agent_bricks_manager.py new file mode 100644 index 00000000..d6c7d62f --- /dev/null +++ b/databricks-skills/.tests/test_agent_bricks_manager.py @@ -0,0 +1,161 @@ +""" +Integration tests for databricks-agent-bricks/scripts/mas_manager.py + +Tests the Supervisor Agent (MAS) CLI interface functions. +The mas_manager.py is self-contained - requires only databricks-sdk and requests. 
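+
+TestBuildAgentList exercises the _build_agent_list helper without any Databricks
+connection; TestMASLifecycle is marked `integration` and skips when Agent Bricks
+is unavailable in the target workspace.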
+""" + +import json +import sys +from pathlib import Path + +import pytest + +# Add the skills directory to the path +SKILLS_DIR = Path(__file__).parent.parent +sys.path.insert(0, str(SKILLS_DIR / "databricks-agent-bricks")) + +from mas_manager import ( + create_mas, + get_mas, + find_mas, + update_mas, + delete_mas, + list_mas, + add_examples, + add_examples_queued, + list_examples, + _build_agent_list, +) + + +@pytest.fixture +def sample_agent_config(): + """Sample agent configuration for testing.""" + return { + "name": "Test Agent", + "description": "A test agent for unit testing", + "endpoint_name": "test-endpoint", + } + + +@pytest.fixture +def sample_genie_agent(): + """Sample Genie agent configuration.""" + return { + "name": "Genie Agent", + "description": "A Genie-based agent", + "genie_space_id": "test-space-123", + } + + +@pytest.fixture +def sample_uc_function_agent(): + """Sample UC Function agent configuration.""" + return { + "name": "UC Function Agent", + "description": "A UC function agent", + "uc_function_name": "catalog.schema.function_name", + } + + +class TestBuildAgentList: + """Tests for _build_agent_list helper function.""" + + def test_build_serving_endpoint_agent(self, sample_agent_config): + """Should build serving endpoint agent config.""" + result = _build_agent_list([sample_agent_config]) + + assert len(result) == 1 + agent = result[0] + assert agent["name"] == "Test Agent" + assert agent["description"] == "A test agent for unit testing" + assert agent["agent_type"] == "serving_endpoint" + assert agent["serving_endpoint"]["name"] == "test-endpoint" + + def test_build_genie_agent(self, sample_genie_agent): + """Should build Genie agent config.""" + result = _build_agent_list([sample_genie_agent]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "genie" + assert agent["genie_space"]["id"] == "test-space-123" + + def test_build_uc_function_agent(self, sample_uc_function_agent): + """Should build UC function agent config.""" + result = _build_agent_list([sample_uc_function_agent]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "unity_catalog_function" + assert agent["unity_catalog_function"]["uc_path"]["catalog"] == "catalog" + assert agent["unity_catalog_function"]["uc_path"]["schema"] == "schema" + assert agent["unity_catalog_function"]["uc_path"]["name"] == "function_name" + + def test_build_mcp_connection_agent(self): + """Should build external MCP server agent config.""" + agent_config = { + "name": "MCP Agent", + "description": "External MCP server", + "connection_name": "my-mcp-connection", + } + result = _build_agent_list([agent_config]) + + assert len(result) == 1 + agent = result[0] + assert agent["agent_type"] == "external_mcp_server" + assert agent["external_mcp_server"]["connection_name"] == "my-mcp-connection" + + def test_build_multiple_agents(self, sample_agent_config, sample_genie_agent): + """Should build multiple agent configs.""" + result = _build_agent_list([sample_agent_config, sample_genie_agent]) + + assert len(result) == 2 + assert result[0]["agent_type"] == "serving_endpoint" + assert result[1]["agent_type"] == "genie" + + +@pytest.mark.integration +class TestMASLifecycle: + """Integration tests for MAS CRUD operations. + + Note: These tests require a Databricks workspace with Agent Bricks enabled. + They are marked as integration tests and may be skipped if connection fails. 
+ """ + + @pytest.fixture + def test_mas_name(self): + """Unique name for test MAS.""" + import uuid + return f"test-mas-{uuid.uuid4().hex[:8]}" + + def test_list_mas(self, workspace_client): + """Should list existing MAS tiles.""" + try: + result = list_mas() + assert isinstance(result, list) + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise + + def test_find_mas_not_found(self, workspace_client): + """Should return not found for non-existent MAS.""" + try: + result = find_mas("nonexistent-mas-name-xyz-123") + assert result["found"] is False + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise + + def test_get_mas_not_found(self, workspace_client): + """Should return error for non-existent tile ID.""" + try: + result = get_mas("00000000-0000-0000-0000-000000000000") + assert "error" in result or result.get("tile_id") == "" + except Exception as e: + if "Agent Bricks" in str(e) or "not enabled" in str(e).lower(): + pytest.skip("Agent Bricks not enabled in workspace") + raise diff --git a/databricks-skills/.tests/test_genie_conversation.py b/databricks-skills/.tests/test_genie_conversation.py new file mode 100644 index 00000000..0ada389f --- /dev/null +++ b/databricks-skills/.tests/test_genie_conversation.py @@ -0,0 +1,204 @@ +""" +Integration tests for databricks-genie/scripts/conversation.py + +Tests the Genie Conversation API CLI interface. +Requires databricks.sdk for Genie Space operations. +""" + +import json +import os +import sys +from pathlib import Path +from unittest.mock import MagicMock, patch + +import pytest + +# Add the skills directory to the path +SKILLS_DIR = Path(__file__).parent.parent +sys.path.insert(0, str(SKILLS_DIR / "databricks-genie")) + +from conversation import ask_genie, _print_json + + +class TestAskGenieFunction: + """Tests for the ask_genie function structure and error handling.""" + + def test_ask_genie_returns_dict(self): + """Should return a dictionary result.""" + # Test with a mock to verify return structure + with patch("conversation.WorkspaceClient") as mock_client: + # Setup mock + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "COMPLETED" + mock_message.attachments = [] + mock_message.query_result = None + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=5, + ) + + assert isinstance(result, dict) + assert "question" in result + assert "conversation_id" in result + assert "message_id" in result + assert "status" in result + + def test_ask_genie_with_conversation_id(self): + """Should pass conversation_id for follow-up questions.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "COMPLETED" + mock_message.attachments = [] + mock_message.query_result = None + + mock_instance = mock_client.return_value + 
mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Follow-up question", + conversation_id="existing-conv-id", + timeout_seconds=5, + ) + + # Verify the conversation_id was passed + call_args = mock_instance.genie.start_conversation_and_wait.call_args + assert call_args.kwargs.get("conversation_id") == "existing-conv-id" + + def test_ask_genie_handles_timeout(self): + """Should return timeout status when query exceeds timeout.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "EXECUTING_QUERY" # Never completes + mock_message.attachments = [] + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + # Very short timeout to trigger timeout path + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=0.1, # Will timeout immediately + ) + + assert result["status"] == "TIMEOUT" + assert "error" in result + + def test_ask_genie_handles_failure(self): + """Should return failure status when query fails.""" + with patch("conversation.WorkspaceClient") as mock_client: + mock_response = MagicMock() + mock_response.conversation_id = "conv-123" + mock_response.message_id = "msg-456" + + mock_message = MagicMock() + mock_message.status = MagicMock() + mock_message.status.value = "FAILED" + mock_message.attachments = [] + + mock_instance = mock_client.return_value + mock_instance.genie.start_conversation_and_wait.return_value = mock_response + mock_instance.genie.get_message.return_value = mock_message + + result = ask_genie( + space_id="test-space", + question="Test question", + timeout_seconds=5, + ) + + assert result["status"] == "FAILED" + + +class TestPrintJson: + """Tests for the _print_json helper function.""" + + def test_print_json_dict(self, capsys): + """Should print dict as formatted JSON.""" + _print_json({"key": "value", "number": 42}) + captured = capsys.readouterr() + assert '"key": "value"' in captured.out + assert '"number": 42' in captured.out + + def test_print_json_list(self, capsys): + """Should print list as formatted JSON.""" + _print_json([1, 2, 3]) + captured = capsys.readouterr() + assert "1" in captured.out + assert "2" in captured.out + assert "3" in captured.out + + +@pytest.mark.integration +class TestGenieConversationIntegration: + """Integration tests for Genie Conversation API. + + Note: These tests require a Databricks workspace with Genie enabled + and a valid Genie Space ID configured via environment variable. 
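+
+    Set TEST_GENIE_SPACE_ID to the target space ID; the genie_space_id fixture
+    below skips these tests when it is not set.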
+ """ + + @pytest.fixture + def genie_space_id(self): + """Get Genie Space ID from environment.""" + space_id = os.environ.get("TEST_GENIE_SPACE_ID") + if not space_id: + pytest.skip("TEST_GENIE_SPACE_ID not set - skipping Genie integration tests") + return space_id + + def test_ask_genie_simple_question(self, workspace_client, genie_space_id): + """Should be able to ask a simple question to Genie.""" + result = ask_genie( + space_id=genie_space_id, + question="How many rows are in the table?", + timeout_seconds=120, + ) + + # Should return a valid result + assert result["conversation_id"] is not None + assert result["status"] in ["COMPLETED", "FAILED", "TIMEOUT"] + + def test_ask_genie_follow_up(self, workspace_client, genie_space_id): + """Should be able to ask follow-up questions.""" + # First question + result1 = ask_genie( + space_id=genie_space_id, + question="Show me the first 5 rows", + timeout_seconds=120, + ) + + if result1["status"] != "COMPLETED": + pytest.skip("First query did not complete - skipping follow-up test") + + # Follow-up question + result2 = ask_genie( + space_id=genie_space_id, + question="Now show me the count", + conversation_id=result1["conversation_id"], + timeout_seconds=120, + ) + + # Should use same conversation + assert result2["conversation_id"] == result1["conversation_id"] diff --git a/databricks-skills/README.md b/databricks-skills/README.md index a81730a2..a5e50868 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -1,6 +1,6 @@ # Databricks Skills for Claude Code -Skills that teach Claude Code how to work effectively with Databricks - providing patterns, best practices, and code examples that work with Databricks MCP tools. +Skills that teach Claude Code how to work effectively with Databricks - providing patterns, best practices, and code examples using the Databricks CLI, Python SDK, and REST APIs. 
## Installation @@ -105,7 +105,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-app-python** - Python web apps (Dash, Streamlit, Flask) with foundation model integration - **databricks-python-sdk** - Python SDK, Connect, CLI, REST API - **databricks-config** - Profile authentication setup -- **databricks-lakebase-provisioned** - Managed PostgreSQL for OLTP workloads +- **databricks-lakebase-autoscale** - Lakebase Autoscaling managed PostgreSQL with branching, scale-to-zero, reverse ETL ### ๐Ÿ“š Reference - **databricks-docs** - Documentation index via llms.txt @@ -113,22 +113,21 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ ## How It Works ``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ .claude/skills/ + .claude/mcp.json โ”‚ -โ”‚ (Knowledge) (Actions) โ”‚ -โ”‚ โ”‚ -โ”‚ Skills teach HOW + MCP does it โ”‚ -โ”‚ โ†“ โ†“ โ”‚ -โ”‚ Claude Code learns patterns and executes โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ +โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” +โ”‚ .claude/skills/ + Databricks CLI/SDK โ”‚ +โ”‚ (Knowledge) (Actions) โ”‚ +โ”‚ โ”‚ +โ”‚ Skills teach HOW + CLI/SDK executes โ”‚ +โ”‚ โ†“ โ†“ โ”‚ +โ”‚ Claude Code learns patterns and executes โ”‚ +โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ ``` **Example:** User says "Create a sales dashboard" 1. Claude loads `databricks-aibi-dashboards` skill โ†’ learns validation workflow -2. Calls `get_table_stats_and_schema()` โ†’ gets schemas -3. Calls `execute_sql()` โ†’ tests queries -4. Calls `manage_dashboard(action="create_or_update")` โ†’ deploys -5. Returns working dashboard URL +2. Runs `databricks experimental aitools tools query` โ†’ tests queries +3. Uses Python SDK to create dashboard via REST API +4. Returns working dashboard URL ## Custom Skills @@ -149,6 +148,26 @@ description: "What this teaches" ... 
``` +## Testing + +Run tests for skill scripts (requires `pytest`): + +```bash +cd databricks-skills/.tests + +# Run all tests (unit tests are mocked, no Databricks connection needed) +python run_tests.py + +# Run only unit tests +python run_tests.py --unit + +# Run integration tests (requires Databricks connection) +python run_tests.py --integration + +# Verbose output +python run_tests.py -v +``` + ## Troubleshooting **Skills not loading?** Check `.claude/skills/` exists and each skill has `SKILL.md` @@ -158,6 +177,7 @@ description: "What this teaches" ## Related - [databricks-tools-core](../databricks-tools-core/) - Python library -- [databricks-mcp-server](../databricks-mcp-server/) - MCP server +- [Databricks CLI](https://docs.databricks.com/dev-tools/cli/index.html) - Official CLI +- [Databricks SDK](https://docs.databricks.com/en/dev-tools/sdk-python.html) - Python SDK - [Databricks Docs](https://docs.databricks.com/) - Official documentation - [MLflow Skills](https://github.com/mlflow/skills) - Upstream MLflow skills repository diff --git a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md index 3adff469..0b349c5b 100644 --- a/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md +++ b/databricks-skills/databricks-agent-bricks/1-knowledge-assistants.md @@ -1,183 +1,74 @@ -# Knowledge Assistants (KA) +# Knowledge Assistants - Details -Knowledge Assistants are document-based Q&A systems that use RAG (Retrieval-Augmented Generation) to answer questions from indexed documents. +For commands, see [SKILL.md](SKILL.md). -## What is a Knowledge Assistant? +## Source Types -A KA connects to documents stored in a Unity Catalog Volume and allows users to ask natural language questions. The system: +Both shapes go inside the `--json` body alongside `display_name` and `description` โ€” see SKILL.md for the full invocation. -1. **Indexes** all documents in the volume (PDFs, text files, etc.) -2. **Retrieves** relevant chunks when a question is asked -3. **Generates** an answer using the retrieved context +### Files (Volume) -## When to Use - -Use a Knowledge Assistant when: -- You have a collection of documents (policies, manuals, guides, reports) -- Users need to find specific information without reading entire documents -- You want to provide a conversational interface to documentation - -## Prerequisites - -Before creating a KA, you need documents in a Unity Catalog Volume: - -**Option 1: Use existing documents** -- Upload PDFs/text files to a Volume manually or via SDK - -**Option 2: Generate synthetic documents** -- Use the `databricks-unstructured-pdf-generation` skill to create realistic PDF documents -- Each PDF gets a companion JSON file with question/guideline pairs for evaluation - -## Creating a Knowledge Assistant - -Use the `manage_ka` tool with `action="create_or_update"`: - -- `name`: "HR Policy Assistant" -- `volume_path`: "/Volumes/my_catalog/my_schema/raw_data/hr_docs" -- `description`: "Answers questions about HR policies and procedures" -- `instructions`: "Be helpful and always cite the specific policy document when answering. If you're unsure, say so." - -The tool will: -1. Create the KA with the specified volume as a knowledge source -2. Scan the volume for JSON files with example questions (from PDF generation) -3. 
Queue examples to be added once the endpoint is ready - -## Provisioning Timeline - -After creation, the KA endpoint needs to provision: - -| Status | Meaning | Duration | -|--------|---------|----------| -| `PROVISIONING` | Creating the endpoint | 2-5 minutes | -| `ONLINE` | Ready to use | - | -| `OFFLINE` | Not currently running | - | - -Use `manage_ka` with `action="get"` to check the status: - -- `tile_id`: "" - -## Adding Example Questions - -Example questions help with: -- **Evaluation**: Test if the KA answers correctly -- **User onboarding**: Show users what to ask - -### Automatic (from PDF generation) - -If you used `generate_pdf_documents`, each PDF has a companion JSON with: ```json { - "question": "What is the company's remote work policy?", - "guideline": "Should mention the 3-day minimum in-office requirement" + "display_name": "...", + "description": "...", + "source_type": "files", + "files": {"path": "/Volumes/catalog/schema/volume/folder/"} } ``` -These are automatically added when `add_examples_from_volume=true` (default). - -### Manual +Supported formats: PDF, TXT, MD, DOCX. -Examples can also be specified in the `manage_ka` create_or_update call if needed. +### Vector Search Index -## Best Practices +Use an existing index instead of auto-indexing: -### Document Organization - -- **One volume per topic**: e.g., `/Volumes/catalog/schema/raw_data/hr_docs`, `/Volumes/catalog/schema/raw_data/tech_docs` -- **Clear naming**: Name files descriptively so chunks are identifiable - -### Instructions - -Good instructions improve answer quality: - -``` -Be helpful and professional. When answering: -1. Always cite the specific document and section -2. If multiple documents are relevant, mention all of them -3. If the information isn't in the documents, clearly say so -4. Use bullet points for multi-part answers +```json +{ + "display_name": "...", + "description": "...", + "source_type": "index", + "index": { + "index_name": "catalog.schema.my_index", + "text_col": "content", + "doc_uri_col": "source_url" + } +} ``` -### Updating Content - -To update the indexed documents: -1. Add/remove/modify files in the volume -2. Call `manage_ka` with `action="create_or_update"`, the same name and `tile_id` -3. The KA will re-index the updated content - -## Example Workflow - -1. **Generate PDF documents** using `databricks-unstructured-pdf-generation` skill: - - Creates PDFs in `/Volumes/catalog/schema/raw_data/pdf_documents` - - Creates JSON files with question/guideline pairs - -2. **Create the Knowledge Assistant**: - - `name`: "My Document Assistant" - - `volume_path`: "/Volumes/catalog/schema/raw_data/pdf_documents" +## Updating Content -3. **Wait for ONLINE status** (2-5 minutes) +1. Add/modify/remove files in the Volume +2. Re-sync: `databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}"` -4. **Examples are automatically added** from the JSON files - -5. **Test the KA** in the Databricks UI - -## Using KA in Supervisor Agents - -Knowledge Assistants can be used as agents in a Supervisor Agent (formerly Multi-Agent Supervisor, MAS). Each KA has an associated model serving endpoint. - -### Finding the Endpoint Name +## Troubleshooting -Use `manage_ka` with `action="get"` to retrieve the KA details. The response includes: -- `tile_id`: The unique identifier for the KA -- `name`: The KA name (sanitized) -- `endpoint_status`: Current status (ONLINE, PROVISIONING, etc.) 
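+
+When troubleshooting, it can help to re-check the assistant's status and re-trigger a
+source sync first (the same commands shown in SKILL.md; `{ka_id}` is the assistant's ID):
+
+```bash
+databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}"
+databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}"
+```
+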
+**KA stays in CREATING:** +- Wait up to 10 minutes +- Check workspace quotas +- Verify volume path exists -The endpoint name follows this pattern: `ka-{tile_id}-endpoint` +**Documents not indexed:** +- Check file format (PDF, TXT, MD, DOCX) +- Verify volume path (trailing slash matters) +- Check file permissions -### Finding a KA by Name +**Poor answer quality:** +- Ensure documents are well-structured +- Break large documents into smaller files +- Add clear headings and sections -If you know the KA name but not the tile_id, use `manage_ka` with `action="find_by_name"`: +## Evaluation Questions -```python -manage_ka(action="find_by_name", name="HR_Policy_Assistant") -# Returns: {"found": True, "tile_id": "01abc...", "name": "HR_Policy_Assistant", "endpoint_name": "ka-01abc...-endpoint"} -``` +When testing a KA, check if the volume or project contains a `pdf_eval_questions.json` file with test questions: -### Example: Adding KA to Supervisor Agent - -```python -# First, find the KA -manage_ka(action="find_by_name", name="HR_Policy_Assistant") - -# Then use the tile_id in a Supervisor Agent -manage_mas( - action="create_or_update", - name="Support_MAS", - agents=[ - { - "name": "hr_agent", - "ka_tile_id": "", - "description": "Answers HR policy questions from the employee handbook" - } - ] -) +```json +{ + "api_errors_guide.pdf": { + "question": "What is the solution for error ERR-4521?", + "expected_fact": "Call /api/v2/auth/refresh with refresh_token before the 3600s TTL expires" + } +} ``` -## Troubleshooting - -### Endpoint stays in PROVISIONING - -- Check workspace capacity and quotas -- Verify the volume path is accessible -- Wait up to 10 minutes before investigating further - -### Documents not indexed - -- Ensure files are in a supported format (PDF, TXT, MD) -- Check file permissions in the volume -- Verify the volume path is correct - -### Poor answer quality - -- Add more specific instructions -- Ensure documents are well-structured -- Consider breaking large documents into smaller files +Use these questions to validate retrieval accuracy. See [databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md) for generating test PDFs with eval questions. diff --git a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md index 7121bfcf..accc610d 100644 --- a/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md +++ b/databricks-skills/databricks-agent-bricks/2-supervisor-agents.md @@ -1,394 +1,92 @@ -# Supervisor Agents (MAS) +# Supervisor Agents - Details -Supervisor Agents orchestrate multiple specialized agents, routing user queries to the most appropriate agent based on the query content. - -## What is a Supervisor Agent? - -A Supervisor Agent (formerly Multi-Agent Supervisor, MAS) acts as a traffic controller for multiple AI agents, routing user queries to the most appropriate agent. It supports five types of agents: - -1. **Knowledge Assistants (KA)**: Document-based Q&A from PDFs/files in Volumes -2. **Genie Spaces**: Natural language to SQL for data exploration -3. **Model Serving Endpoints**: Custom LLM agents, fine-tuned models, RAG applications -4. **Unity Catalog Functions**: Callable UC functions for data operations -5. **External MCP Servers**: JSON-RPC endpoints via UC HTTP Connections for external system integration - -When a user asks a question: -1. **Analyzes** the query to understand the intent -2. 
**Routes** to the most appropriate specialized agent -3. **Returns** the agent's response to the user - -This allows you to combine multiple specialized agents into a single unified interface. - -## When to Use - -Use a Supervisor Agent when: -- You have multiple specialized agents (billing, technical support, HR, etc.) -- Users shouldn't need to know which agent to ask -- You want to provide a unified conversational experience - -## Prerequisites - -Before creating a Supervisor Agent, you need agents of one or both types: - -**Model Serving Endpoints** (`endpoint_name`): -- Knowledge Assistant (KA) endpoints (e.g., `ka-abc123-endpoint`) -- Custom agents built with LangChain, LlamaIndex, etc. -- Fine-tuned models -- RAG applications - -**Genie Spaces** (`genie_space_id`): -- Existing Genie spaces for SQL-based data exploration -- Great for analytics, metrics, and data-driven questions -- No separate endpoint deployment required - reference the space directly -- To find a Genie space by name, use `find_genie_by_name(display_name="My Genie")` -- **Note**: There is NO system table for Genie spaces - do not try to query `system.ai.genie_spaces` +For commands, see [SKILL.md](SKILL.md). `` in examples = the directory containing SKILL.md. ## Unity Catalog Functions -Unity Catalog Functions allow Supervisor Agents to call registered UC functions for data operations. +Call registered UC functions from the Supervisor Agent. -### Prerequisites - -- UC Function already exists (use SQL `CREATE FUNCTION` or Python UDF) -- Agent service principal has `EXECUTE` privilege: - ```sql - GRANT EXECUTE ON FUNCTION catalog.schema.function_name TO ``; - ``` - -### Configuration +**Prerequisites:** +- UC Function exists (`CREATE FUNCTION` or Python UDF) +- Grant execute: `GRANT EXECUTE ON FUNCTION catalog.schema.func TO \`\`;` +**Config:** ```json -{ - "name": "data_enrichment", - "uc_function_name": "sales_analytics.utils.enrich_customer_data", - "description": "Enriches customer records with demographic and purchase history data" -} +{"name": "enricher", "uc_function_name": "catalog.schema.enrich_data", "description": "Enriches customer records"} ``` -**Field**: `uc_function_name` - Fully-qualified function name in format `catalog.schema.function_name` - ## External MCP Servers -External MCP Servers enable Supervisor Agents to interact with external systems (ERP, CRM, etc.) via UC HTTP Connections. The MCP server implements a JSON-RPC 2.0 endpoint that exposes tools for the Supervisor Agent to call. - -### Prerequisites - -**1. MCP Server Endpoint**: Your external system must provide a JSON-RPC 2.0 endpoint (e.g., `/api/mcp`) that implements the MCP protocol: - -```python -# Example MCP server tool definition -TOOLS = [ - { - "name": "approve_invoice", - "description": "Approve a specific invoice", - "inputSchema": { - "type": "object", - "properties": { - "invoice_number": {"type": "string", "description": "Invoice number to approve"}, - "approver": {"type": "string", "description": "Name/email of approver"}, - }, - "required": ["invoice_number"], - }, - }, -] - -# JSON-RPC methods: initialize, tools/list, tools/call -``` - -**2. UC HTTP Connection**: Create a Unity Catalog HTTP Connection that points to your MCP endpoint: +Connect to external systems (ERP, CRM) via UC HTTP Connection implementing MCP protocol. +**1. 
Create UC HTTP Connection:** ```sql -CREATE CONNECTION my_mcp_connection TYPE HTTP +CREATE CONNECTION my_mcp TYPE HTTP OPTIONS ( - host 'https://my-app.databricksapps.com', -- Your MCP server URL + host 'https://my-app.databricksapps.com', port '443', - base_path '/api/mcp', -- Path to JSON-RPC endpoint - client_id '', -- OAuth M2M credentials - client_secret '', + base_path '/api/mcp', + client_id '', + client_secret '', oauth_scope 'all-apis', token_endpoint 'https://.azuredatabricks.net/oidc/v1/token', - is_mcp_connection 'true' -- REQUIRED: Identifies as MCP connection + is_mcp_connection 'true' ); ``` -**3. Grant Permissions**: Agent service principal needs access to the connection: - +**2. Grant access:** ```sql -GRANT USE CONNECTION ON my_mcp_connection TO ``; +GRANT USE CONNECTION ON my_mcp TO ``; ``` -### Configuration - -Reference the UC Connection using the `connection_name` field: - -```python -{ - "name": "external_operations", - "connection_name": "my_mcp_connection", - "description": "Execute external system operations: approve invoices, create records, trigger workflows" -} -``` - -**Field**: `connection_name` - the name of the Unity Catalog HTTP Connection configured as an MCP server - -**Important**: Make the description comprehensive - it guides the Supervisor Agent's routing decisions for when to call this agent. - -### Complete Example: Multi-System Supervisor - -Example showing integration of Genie, KA, and external MCP: - -```python -manage_mas( - action="create_or_update", - name="AP_Invoice_Supervisor", - agents=[ - { - "name": "billing_analyst", - "genie_space_id": "01abc123...", - "description": "SQL analytics on AP invoice data: spending trends, vendor analysis, aging reports" - }, - { - "name": "policy_expert", - "ka_tile_id": "f32c5f73...", - "description": "Answers questions about AP policies, approval workflows, and compliance requirements from policy documents" - }, - { - "name": "ap_operations", - "connection_name": "ap_invoice_mcp", - "description": ( - "Execute AP operations: approve/reject/flag invoices, search invoice details, " - "get vendor summaries, trigger batch workflows. Use for ANY action or write operation." - ) - } - ], - description="AP automation assistant with analytics, policy guidance, and operational actions", - instructions=""" - Route queries as follows: - - Data questions (invoice counts, spend analysis, vendor metrics) โ†’ billing_analyst - - Policy questions (thresholds, SLAs, compliance rules) โ†’ policy_expert - - Actions (approve, reject, flag, search, workflows) โ†’ ap_operations - - When a user asks to approve, reject, or flag an invoice, ALWAYS use ap_operations. - """ -) +**3. 
Config:** +```json +{"name": "operations", "connection_name": "my_mcp", "description": "Execute operations: approve invoices, trigger workflows"} ``` -### MCP Connection Testing - -Verify your connection before adding to MAS: - +**Test connection:** ```sql --- Test tools/list method -SELECT http_request( - conn => 'my_mcp_connection', - method => 'POST', - path => '', - json => '{"jsonrpc":"2.0","method":"tools/list","id":1}' -); -``` - -### Resources - -- **MCP Protocol Spec**: [Model Context Protocol](https://modelcontextprotocol.io) - -## Creating a Supervisor Agent - -Use the `manage_mas` tool with `action="create_or_update"`: - -- `name`: "Customer Support MAS" -- `agents`: - ```json - [ - { - "name": "policy_agent", - "ka_tile_id": "f32c5f73-466b-4798-b3a0-5396b5ece2a5", - "description": "Answers questions about company policies and procedures from indexed documents" - }, - { - "name": "usage_analytics", - "genie_space_id": "01abc123-def4-5678-90ab-cdef12345678", - "description": "Answers data questions about usage metrics, trends, and statistics" - }, - { - "name": "custom_agent", - "endpoint_name": "my-custom-endpoint", - "description": "Handles specialized queries via custom model endpoint" - } - ] - ``` -- `description`: "Routes customer queries to specialized support agents" -- `instructions`: "Analyze the user's question and route to the most appropriate agent. If unclear, ask for clarification." - -This example shows mixing Knowledge Assistants (policy_agent), Genie spaces (usage_analytics), and custom endpoints (custom_agent). - -## Agent Configuration - -Each agent in the `agents` list needs: - -| Field | Required | Description | -|-------|----------|-------------| -| `name` | Yes | Internal identifier for the agent | -| `description` | Yes | What this agent handles (critical for routing) | -| `ka_tile_id` | One of these | Knowledge Assistant tile ID (for document Q&A agents) | -| `genie_space_id` | One of these | Genie space ID (for SQL-based data agents) | -| `endpoint_name` | One of these | Model serving endpoint name (for custom agents) | -| `uc_function_name` | One of these | Unity Catalog function name in format `catalog.schema.function_name` | -| `connection_name` | One of these | Unity Catalog connection name (for external MCP servers) | - -**Note**: Provide exactly one of: `ka_tile_id`, `genie_space_id`, `endpoint_name`, `uc_function_name`, or `connection_name`. - -To find a KA tile_id, use `manage_ka(action="find_by_name", name="Your KA Name")`. -To find a Genie space_id, use `find_genie_by_name(display_name="Your Genie Name")`. - -### Writing Good Descriptions - -The `description` field is critical for routing. Make it specific: - -**Good descriptions:** -- "Handles billing questions including invoices, payments, refunds, and subscription changes" -- "Answers technical questions about API errors, integration issues, and product bugs" -- "Provides information about HR policies, PTO, benefits, and employee handbook" - -**Bad descriptions:** -- "Billing agent" (too vague) -- "Handles stuff" (not helpful) -- "Technical" (not specific) - -## Provisioning Timeline - -After creation, the Supervisor Agent endpoint needs to provision: - -| Status | Meaning | Duration | -|--------|---------|----------| -| `PROVISIONING` | Creating the supervisor | 2-5 minutes | -| `ONLINE` | Ready to route queries | - | -| `OFFLINE` | Not currently running | - | - -Use `manage_mas` with `action="get"` to check the status. 
- -## Adding Example Questions - -Example questions help with evaluation and can guide routing optimization: - -```json -{ - "examples": [ - { - "question": "I haven't received my invoice for this month", - "guideline": "Should be routed to billing_agent" - }, - { - "question": "The API is returning a 500 error", - "guideline": "Should be routed to technical_agent" - }, - { - "question": "How many vacation days do I have?", - "guideline": "Should be routed to hr_agent" - } - ] -} +SELECT http_request(conn => 'my_mcp', method => 'POST', path => '', json => '{"jsonrpc":"2.0","method":"tools/list","id":1}'); ``` -If the Supervisor Agent is not yet `ONLINE`, examples are queued and added automatically when ready. +## Writing Good Descriptions -## Best Practices +The `description` field drives routing. Be specific: -### Agent Design - -1. **Specialized agents**: Each agent should have a clear, distinct purpose -2. **Non-overlapping domains**: Avoid agents with similar descriptions -3. **Clear boundaries**: Define what each agent does and doesn't handle - -### Instructions - -Provide routing instructions: - -``` -You are a customer support supervisor. Your job is to route user queries to the right specialist: +| Good | Bad | +|------|-----| +| "Handles billing: invoices, payments, refunds, subscriptions" | "Billing agent" | +| "Answers API errors, integration issues, product bugs" | "Technical" | +| "HR policies, PTO, benefits, employee handbook" | "Handles stuff" | -1. For billing, payments, or subscription questions โ†’ billing_agent -2. For technical issues, bugs, or API problems โ†’ technical_agent -3. For HR, benefits, or policy questions โ†’ hr_agent +## Adding Examples -If the query is unclear or spans multiple domains, ask the user to clarify. -``` +Examples help evaluation and routing optimization. **The MAS endpoint must be ONLINE.** Right after `create_mas` (or a big `update_mas`), the endpoint is `NOT_READY` and takes **up to ~10 minutes** to come ONLINE. Pass `--wait` to block until then: -### Fallback Handling +```bash +# Fails fast if endpoint isn't ONLINE yet +python /scripts/mas_manager.py add_examples TILE_ID '[ + {"question": "I need my invoice for March", "guideline": "Route to billing_agent"}, + {"question": "API returns 500 error", "guideline": "Route to tech_agent"} +]' -Consider adding a general-purpose agent for queries that don't fit elsewhere: +# --wait blocks until endpoint is ONLINE (default timeout 15 min) then adds. +# The process must stay alive for the whole wait โ€” there is no background queue. +python /scripts/mas_manager.py add_examples TILE_ID '[...]' --wait -```json -{ - "name": "general_agent", - "endpoint_name": "general-support-endpoint", - "description": "Handles general inquiries that don't fit other categories, provides navigation help" -} +python /scripts/mas_manager.py list_examples TILE_ID ``` -## Example Workflow - -1. **Deploy specialized agents** as model serving endpoints: - - `billing-assistant-endpoint` - - `tech-support-endpoint` - - `hr-assistant-endpoint` - -2. **Create the MAS**: - - Configure agents with clear descriptions - - Add routing instructions - -3. **Wait for ONLINE status** (2-5 minutes) - -4. **Add example questions** for evaluation - -5. **Test routing** with various query types - -## Updating a Supervisor Agent - -To update an existing Supervisor Agent: - -1. **Add/remove agents**: Call `manage_mas` with `action="create_or_update"` and updated `agents` list -2. 
**Update descriptions**: Change agent descriptions to improve routing -3. **Modify instructions**: Update routing rules - -The tool finds the existing Supervisor Agent by name and updates it. - ## Troubleshooting -### Queries routed to wrong agent - -- Review and improve agent descriptions -- Make descriptions more specific and distinct -- Add examples that demonstrate correct routing - -### Endpoint not responding - -- Verify each underlying model serving endpoint is running -- Check endpoint logs for errors -- Ensure endpoints accept the expected input format - -### Slow responses +**Wrong routing:** +- Improve agent descriptions (more specific, less overlap) +- Add examples demonstrating correct routing -- Check latency of underlying endpoints -- Consider endpoint scaling settings -- Monitor for cold start issues - -## Advanced: Hierarchical Routing - -For complex scenarios, you can create multiple levels of Supervisor Agents: - -``` -Top-level Supervisor -โ”œโ”€โ”€ Customer Support Supervisor -โ”‚ โ”œโ”€โ”€ billing_agent -โ”‚ โ”œโ”€โ”€ technical_agent -โ”‚ โ””โ”€โ”€ general_agent -โ”œโ”€โ”€ Sales Supervisor -โ”‚ โ”œโ”€โ”€ pricing_agent -โ”‚ โ”œโ”€โ”€ demo_agent -โ”‚ โ””โ”€โ”€ contract_agent -โ””โ”€โ”€ Internal Supervisor - โ”œโ”€โ”€ hr_agent - โ””โ”€โ”€ it_helpdesk_agent -``` +**Endpoint not responding:** +- Verify underlying endpoints are running +- Check endpoint logs -Each sub-supervisor is deployed as an endpoint and configured as an agent in the top-level supervisor. +**Slow responses:** +- Check underlying endpoint latency +- Review endpoint scaling settings diff --git a/databricks-skills/databricks-agent-bricks/SKILL.md b/databricks-skills/databricks-agent-bricks/SKILL.md index 026f204a..9ef56920 100644 --- a/databricks-skills/databricks-agent-bricks/SKILL.md +++ b/databricks-skills/databricks-agent-bricks/SKILL.md @@ -1,212 +1,107 @@ --- name: databricks-agent-bricks -description: "Create and manage Databricks Agent Bricks: Knowledge Assistants (KA) for document Q&A, Genie Spaces for SQL exploration, and Supervisor Agents (MAS) for multi-agent orchestration. Use when building conversational AI applications on Databricks." +description: "Create Agent Bricks: Knowledge Assistants (KA) for document Q&A and Supervisor Agents for multi-agent orchestration (MAS). For Genie Spaces, see databricks-genie skill." --- # Agent Bricks -Create and manage Databricks Agent Bricks - pre-built AI components for building conversational applications. - -## Overview - -Agent Bricks are three types of pre-built AI tiles in Databricks: +Agent Bricks are pre-built AI tiles in Databricks that provide conversational interfaces. This skill covers **Knowledge Assistants** and **Supervisor Agents**. For Genie Spaces, use the `databricks-genie` skill. 
-| Brick | Purpose | Data Source | -|-------|---------|-------------| -| **Knowledge Assistant (KA)** | Document-based Q&A using RAG | PDF/text files in Volumes | -| **Genie Space** | Natural language to SQL | Unity Catalog tables | -| **Supervisor Agent (MAS)** | Multi-agent orchestration | Model serving endpoints | +| Brick | Purpose | This Skill | +|-------|---------|------------| +| **Knowledge Assistant (KA)** | Document Q&A using RAG on PDFs/text in Volumes | โœ“ | +| **Supervisor Agent** | Orchestrates multiple agents (KA, Genie, endpoints, UC functions, MCP) | โœ“ | +| **Genie Space** | Natural language to SQL on Unity Catalog tables | `databricks-genie` | -## Prerequisites - -Before creating Agent Bricks, ensure you have the required data: - -### For Knowledge Assistants -- **Documents in a Volume**: PDF, text, or other files stored in a Unity Catalog volume -- Generate synthetic documents using the `databricks-unstructured-pdf-generation` skill if needed - -### For Genie Spaces -- **See the `databricks-genie` skill** for comprehensive Genie Space guidance -- Tables in Unity Catalog with the data to explore -- Generate raw data using the `databricks-synthetic-data-gen` skill -- Create tables using the `databricks-spark-declarative-pipelines` skill - -### For Supervisor Agents -- **Model Serving Endpoints**: Deployed agent endpoints (KA endpoints, custom agents, fine-tuned models) -- **Genie Spaces**: Existing Genie spaces can be used directly as agents for SQL-based queries -- Mix and match endpoint-based and Genie-based agents in the same Supervisor Agent - -### For Unity Catalog Functions -- **Existing UC Function**: Function already registered in Unity Catalog -- Agent service principal has `EXECUTE` privilege on the function - -### For External MCP Servers -- **Existing UC HTTP Connection**: Connection configured with `is_mcp_connection: 'true'` -- Agent service principal has `USE CONNECTION` privilege on the connection - -## MCP Tools - -### Knowledge Assistant Tool - -**manage_ka** - Manage Knowledge Assistants (KA) -- `action`: "create_or_update", "get", "find_by_name", or "delete" -- `name`: Name for the KA (for create_or_update, find_by_name) -- `volume_path`: Path to documents (e.g., `/Volumes/catalog/schema/volume/folder`) (for create_or_update) -- `description`: (optional) What the KA does (for create_or_update) -- `instructions`: (optional) How the KA should answer (for create_or_update) -- `tile_id`: The KA tile ID (for get, delete, or update via create_or_update) -- `add_examples_from_volume`: (optional, default: true) Auto-add examples from JSON files (for create_or_update) - -Actions: -- **create_or_update**: Requires `name`, `volume_path`. Optionally pass `tile_id` to update. -- **get**: Requires `tile_id`. Returns tile_id, name, description, endpoint_status, knowledge_sources, examples_count. -- **find_by_name**: Requires `name` (exact match). Returns found, tile_id, name, endpoint_name, endpoint_status. Use this to look up an existing KA when you know the name but not the tile_id. -- **delete**: Requires `tile_id`. 
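+
+Before creating or wiring anything, it can be useful to list what already exists in the
+workspace; these are the same lookups that appear under "Find IDs" below:
+
+```bash
+databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id'
+databricks genie list-spaces --output json | jq '.[].space_id'
+```
+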
- -### Genie Space Tools - -**For comprehensive Genie guidance, use the `databricks-genie` skill.** - -Use `manage_genie` with actions: -- `create_or_update` - Create or update a Genie Space -- `get` - Get Genie Space details -- `list` - List all Genie Spaces -- `delete` - Delete a Genie Space -- `export` / `import` - For migration +--- -See `databricks-genie` skill for: -- Table inspection workflow -- Sample question best practices -- Curation (instructions, certified queries) +## Knowledge Assistant + +```bash +# Find volumes +databricks volumes list CATALOG SCHEMA +databricks experimental aitools tools query --warehouse WH "LIST '/Volumes/catalog/schema/volume/'" + +# Create KA +databricks knowledge-assistants create-knowledge-assistant "Name" "Description" + +# Add knowledge source. With --json, pass ONLY the PARENT as a positional arg +# and put display_name / description / source_type / the source body (files|index|file_table) +# inside the JSON. Mixing positional DISPLAY_NAME/DESCRIPTION/SOURCE_TYPE with --json errors. +databricks knowledge-assistants create-knowledge-source \ + "knowledge-assistants/{ka_id}" \ + --json '{ + "display_name": "Docs", + "description": "Documentation files", + "source_type": "files", + "files": {"path": "/Volumes/catalog/schema/volume/"} + }' + +# Sync and check status +databricks knowledge-assistants sync-knowledge-sources "knowledge-assistants/{ka_id}" +databricks knowledge-assistants get-knowledge-assistant "knowledge-assistants/{ka_id}" + +# List/manage +databricks knowledge-assistants list-knowledge-assistants +databricks knowledge-assistants delete-knowledge-assistant "knowledge-assistants/{ka_id}" +``` -**IMPORTANT**: There is NO system table for Genie spaces (e.g., `system.ai.genie_spaces` does not exist). Use `manage_genie(action="list")` to find spaces. +**Source types:** `files` (Volume path) or `index` (Vector Search: `index.index_name`, `index.text_col`, `index.doc_uri_col`) -### Supervisor Agent Tool +**Status:** `CREATING` (2-5 min) โ†’ `ONLINE` โ†’ `OFFLINE` -**manage_mas** - Manage Supervisor Agents (MAS) -- `action`: "create_or_update", "get", "find_by_name", or "delete" -- `name`: Name for the Supervisor Agent (for create_or_update, find_by_name) -- `agents`: List of agent configurations (for create_or_update), each with: - - `name`: Agent identifier (required) - - `description`: What this agent handles - critical for routing (required) - - `ka_tile_id`: Knowledge Assistant tile ID (use for document Q&A agents - recommended for KAs) - - `genie_space_id`: Genie space ID (use for SQL-based data agents) - - `endpoint_name`: Model serving endpoint name (for custom agents) - - `uc_function_name`: Unity Catalog function name in format `catalog.schema.function_name` - - `connection_name`: Unity Catalog connection name (for external MCP servers) - - Note: Provide exactly one of: `ka_tile_id`, `genie_space_id`, `endpoint_name`, `uc_function_name`, or `connection_name` -- `description`: (optional) What the Supervisor Agent does (for create_or_update) -- `instructions`: (optional) Routing instructions for the supervisor (for create_or_update) -- `tile_id`: The Supervisor Agent tile ID (for get, delete, or update via create_or_update) -- `examples`: (optional) List of example questions with `question` and `guideline` fields (for create_or_update) +--- -Actions: -- **create_or_update**: Requires `name`, `agents`. Optionally pass `tile_id` to update. -- **get**: Requires `tile_id`. 
Returns tile_id, name, description, endpoint_status, agents, examples_count. -- **find_by_name**: Requires `name` (exact match). Returns found, tile_id, name, endpoint_status, agents_count. Use this to look up an existing Supervisor Agent when you know the name but not the tile_id. -- **delete**: Requires `tile_id`. - -## Typical Workflow - -### 1. Generate Source Data - -Before creating Agent Bricks, generate the required source data: - -**For KA (document Q&A)**: -``` -1. Use `databricks-unstructured-pdf-generation` skill to generate PDFs -2. PDFs are saved to a Volume with companion JSON files (question/guideline pairs) +## Supervisor Agent + +**No CLI** โ€” use `mas_manager.py` from this skill's `scripts/` folder. All `/...` paths below are relative to the directory containing this SKILL.md (resolve to the absolute path in your install location). + +```bash +# Create MAS +python /scripts/mas_manager.py create_mas "My Supervisor" '{ + "description": "Routes queries to specialized agents", + "instructions": "Route data questions to analyst, document questions to docs_agent.", + "agents": [ + {"name": "analyst", "genie_space_id": "01abc...", "description": "SQL analytics"}, + {"name": "docs_agent", "ka_tile_id": "dab408a2-...", "description": "Answers from documents"} + ] +}' + +# Check status and manage. list_mas enumerates every MAS you can access +# and returns {tile_id, name, endpoint_status, agents_count} โ€” use it to +# find a tile_id / see which MAS are ONLINE before operations. +python /scripts/mas_manager.py list_mas +python /scripts/mas_manager.py get_mas TILE_ID +python /scripts/mas_manager.py update_mas TILE_ID '{"agents": [...]}' +python /scripts/mas_manager.py delete_mas TILE_ID + +# Add examples โ€” requires endpoint_status=ONLINE. After create_mas the MAS is +# NOT_READY and takes up to ~10 min to reach ONLINE. Without --wait, this +# fails fast if not ONLINE yet. With --wait, it blocks until ONLINE then adds. +python /scripts/mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' [--wait] + +# Find IDs +databricks knowledge-assistants list-knowledge-assistants --output json | jq '.[].id' +databricks genie list-spaces --output json | jq '.[].space_id' ``` -**For Genie (SQL exploration)**: -``` -1. Use `databricks-synthetic-data-gen` skill to create raw parquet data -2. Use `databricks-spark-declarative-pipelines` skill to create bronze/silver/gold tables -``` +**Agent types** (use exactly ONE per agent): -### 2. Create the Agent Brick - -Use `manage_ka(action="create_or_update", ...)` or `manage_mas(action="create_or_update", ...)` with your data sources. - -### 3. Wait for Provisioning - -Newly created KA and MAS tiles need time to provision. The endpoint status will progress: -- `PROVISIONING` - Being created (can take 2-5 minutes) -- `ONLINE` - Ready to use -- `OFFLINE` - Not running - -### 4. Add Examples (Automatic) - -For KA, if `add_examples_from_volume=true`, examples are automatically extracted from JSON files in the volume and added once the endpoint is `ONLINE`. - -## Best Practices - -1. **Use meaningful names**: Names are sanitized automatically (spaces become underscores) -2. **Provide descriptions**: Helps users understand what the brick does -3. **Add instructions**: Guide the AI's behavior and tone -4. **Include sample questions**: Shows users how to interact with the brick -5. 
**Use the workflow**: Generate data first, then create the brick - -## Example: Multi-Modal Supervisor Agent - -```python -manage_mas( - action="create_or_update", - name="Enterprise Support Supervisor", - agents=[ - { - "name": "knowledge_base", - "ka_tile_id": "f32c5f73-466b-...", - "description": "Answers questions about company policies, procedures, and documentation from indexed files" - }, - { - "name": "analytics_engine", - "genie_space_id": "01abc123...", - "description": "Runs SQL analytics on usage metrics, performance stats, and operational data" - }, - { - "name": "ml_classifier", - "endpoint_name": "custom-classification-endpoint", - "description": "Classifies support tickets and predicts resolution time using custom ML model" - }, - { - "name": "data_enrichment", - "uc_function_name": "support.utils.enrich_ticket_data", - "description": "Enriches support ticket data with customer history and context" - }, - { - "name": "ticket_operations", - "connection_name": "ticket_system_mcp", - "description": "Creates, updates, assigns, and closes support tickets in external ticketing system" - } - ], - description="Comprehensive enterprise support agent with knowledge retrieval, analytics, ML, data enrichment, and ticketing operations", - instructions=""" - Route queries as follows: - 1. Policy/procedure questions โ†’ knowledge_base - 2. Data analysis requests โ†’ analytics_engine - 3. Ticket classification โ†’ ml_classifier - 4. Customer context lookups โ†’ data_enrichment - 5. Ticket creation/updates โ†’ ticket_operations - - If a query spans multiple domains, chain agents: - - First gather information (analytics_engine or knowledge_base) - - Then take action (ticket_operations) - """ -) -``` +| Field | Type | +|-------|------| +| `ka_tile_id` | Knowledge Assistant | +| `genie_space_id` | Genie Space | +| `endpoint_name` | Model serving endpoint | +| `uc_function_name` | UC function (`catalog.schema.func`) | +| `connection_name` | MCP server (UC HTTP Connection) | -## Related Skills +**Status:** `NOT_READY` (up to ~10 min after create/big update) โ†’ `ONLINE` โ†’ `OFFLINE` -- **[databricks-genie](../databricks-genie/SKILL.md)** - Comprehensive Genie Space creation, curation, and Conversation API guidance -- **[databricks-unstructured-pdf-generation](../databricks-unstructured-pdf-generation/SKILL.md)** - Generate synthetic PDFs to feed into Knowledge Assistants -- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Create raw data for Genie Space tables -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces -- **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - Deploy custom agent endpoints used as MAS agents -- **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - Build vector indexes for RAG applications paired with KAs +--- -## See Also +## Reference -- `1-knowledge-assistants.md` - Detailed KA patterns and examples -- `databricks-genie` skill - Detailed Genie patterns, curation, and examples -- `2-supervisor-agents.md` - Detailed MAS patterns and examples +| Topic | File | +|-------|------| +| KA source types, index, troubleshooting | [1-knowledge-assistants.md](1-knowledge-assistants.md) | +| UC functions, MCP servers, examples | [2-supervisor-agents.md](2-supervisor-agents.md) | diff --git a/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py 
b/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py new file mode 100644 index 00000000..bf1d92ee --- /dev/null +++ b/databricks-skills/databricks-agent-bricks/scripts/mas_manager.py @@ -0,0 +1,667 @@ +#!/usr/bin/env python3 +""" +Supervisor Agent (MAS) Manager - Self-contained CLI for MAS operations. + +Usage: + python mas_manager.py create_mas "Name" '{"agents": [...], "description": "...", "instructions": "..."}' + python mas_manager.py get_mas TILE_ID + python mas_manager.py find_mas "Name" + python mas_manager.py update_mas TILE_ID '{"name": ..., "agents": [...], ...}' + python mas_manager.py delete_mas TILE_ID + python mas_manager.py list_mas + python mas_manager.py add_examples TILE_ID '[{"question": "...", "guideline": "..."}]' [--wait] + python mas_manager.py list_examples TILE_ID + + --wait on add_examples blocks until the MAS endpoint reaches ONLINE state + (up to ~10 min after a create/update) before adding the examples. + +Requires: databricks-sdk, requests + pip install databricks-sdk requests +""" + +import json +import logging +import re +import sys +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from enum import Enum +from typing import Any, Dict, List, Optional + +import requests +from databricks.sdk import WorkspaceClient + +logger = logging.getLogger(__name__) + + +# ============================================================================ +# Models +# ============================================================================ + + +class TileType(Enum): + """Tile types.""" + UNSPECIFIED = 0 + KIE = 1 + T2T = 2 + KA = 3 + MAO = 4 + MAS = 5 + + +class EndpointStatus(Enum): + """Endpoint status values.""" + ONLINE = "ONLINE" + OFFLINE = "OFFLINE" + PROVISIONING = "PROVISIONING" + NOT_READY = "NOT_READY" + + +@dataclass(frozen=True) +class MASIds: + """Supervisor Agent identifiers.""" + tile_id: str + name: str + + +# ============================================================================ +# MAS Manager Class +# ============================================================================ + + +class MASManager: + """Manager for Supervisor Agent (MAS) operations. + + Uses raw HTTP API calls since there's no CLI for MAS operations. + Authentication is handled via databricks-sdk WorkspaceClient. + """ + + def __init__(self, client: Optional[WorkspaceClient] = None): + """Initialize the MAS Manager. 
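+
+        When no client is passed, a WorkspaceClient() is created with the SDK's
+        default configuration (environment variables or a configured profile).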
+ + Args: + client: Optional WorkspaceClient (creates new one if not provided) + """ + self.w: WorkspaceClient = client or WorkspaceClient() + + @staticmethod + def sanitize_name(name: str) -> str: + """Sanitize a name to ensure it's alphanumeric with only hyphens and underscores.""" + sanitized = name.replace(" ", "_") + sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", sanitized) + sanitized = re.sub(r"[_-]{2,}", "_", sanitized) + sanitized = sanitized.strip("_-") + if not sanitized: + sanitized = "supervisor_agent" + return sanitized + + # ======================================================================== + # MAS CRUD Operations + # ======================================================================== + + def create( + self, + name: str, + agents: List[Dict[str, Any]], + description: Optional[str] = None, + instructions: Optional[str] = None, + ) -> Dict[str, Any]: + """Create a Supervisor Agent with specified agents.""" + payload = {"name": self.sanitize_name(name), "agents": agents} + if description: + payload["description"] = description + if instructions: + payload["instructions"] = instructions + return self._post("/api/2.0/multi-agent-supervisors", payload) + + def get(self, tile_id: str) -> Optional[Dict[str, Any]]: + """Get MAS by tile_id.""" + try: + return self._get(f"/api/2.0/multi-agent-supervisors/{tile_id}") + except Exception as e: + if "does not exist" in str(e).lower() or "not found" in str(e).lower(): + return None + raise + + def update( + self, + tile_id: str, + name: Optional[str] = None, + description: Optional[str] = None, + instructions: Optional[str] = None, + agents: Optional[List[Dict[str, Any]]] = None, + ) -> Dict[str, Any]: + """Update a Supervisor Agent.""" + payload = {"tile_id": tile_id} + if name: + payload["name"] = self.sanitize_name(name) + if description: + payload["description"] = description + if instructions: + payload["instructions"] = instructions + if agents: + payload["agents"] = agents + return self._patch(f"/api/2.0/multi-agent-supervisors/{tile_id}", payload) + + def delete(self, tile_id: str) -> None: + """Delete a Supervisor Agent.""" + self._delete(f"/api/2.0/tiles/{tile_id}") + + def find_by_name(self, name: str) -> Optional[MASIds]: + """Find a MAS by exact display name.""" + sanitized_name = self.sanitize_name(name) + filter_q = f"name_contains={sanitized_name}&&tile_type=MAS" + page_token = None + while True: + params = {"filter": filter_q} + if page_token: + params["page_token"] = page_token + resp = self._get("/api/2.0/tiles", params=params) + for t in resp.get("tiles", []): + if t.get("name") == sanitized_name: + return MASIds(tile_id=t["tile_id"], name=sanitized_name) + page_token = resp.get("next_page_token") + if not page_token: + break + return None + + def list_all(self, page_size: int = 100) -> List[Dict[str, Any]]: + """List all Supervisor Agents.""" + all_tiles = [] + filter_q = "tile_type=MAS" + page_token = None + + while True: + params = {"page_size": page_size, "filter": filter_q} + if page_token: + params["page_token"] = page_token + + resp = self._get("/api/2.0/tiles", params=params) + for tile in resp.get("tiles", []): + tile_type = tile.get("tile_type") + if tile_type in ("MAS", "5"): + all_tiles.append(tile) + + page_token = resp.get("next_page_token") + if not page_token: + break + + return all_tiles + + def get_endpoint_status(self, tile_id: str) -> Optional[str]: + """Get the endpoint status of a MAS.""" + mas = self.get(tile_id) + if not mas: + return None + return mas.get("multi_agent_supervisor", 
{}).get("status", {}).get("endpoint_status") + + # ======================================================================== + # Examples Management + # ======================================================================== + + def create_example(self, tile_id: str, question: str, guidelines: Optional[List[str]] = None) -> Dict[str, Any]: + """Create an example question for the MAS.""" + payload = {"tile_id": tile_id, "question": question} + if guidelines: + payload["guidelines"] = guidelines + return self._post(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples", payload) + + def list_examples(self, tile_id: str, page_size: int = 100) -> Dict[str, Any]: + """List all examples for a MAS.""" + return self._get(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples", params={"page_size": page_size}) + + def delete_example(self, tile_id: str, example_id: str) -> None: + """Delete an example from the MAS.""" + self._delete(f"/api/2.0/multi-agent-supervisors/{tile_id}/examples/{example_id}") + + def add_examples_batch(self, tile_id: str, questions: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + """Add multiple example questions in parallel.""" + created_examples = [] + + def create_example(q: Dict[str, Any]) -> Optional[Dict[str, Any]]: + question_text = q.get("question", "") + guidelines = q.get("guideline") + if guidelines and isinstance(guidelines, str): + guidelines = [guidelines] + + if not question_text: + return None + try: + return self.create_example(tile_id, question_text, guidelines) + except Exception as e: + logger.error(f"Failed to add MAS example '{question_text[:50]}...': {e}") + return None + + max_workers = min(2, len(questions)) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + future_to_q = {executor.submit(create_example, q): q for q in questions} + for future in as_completed(future_to_q): + result = future.result() + if result: + created_examples.append(result) + + return created_examples + + # ======================================================================== + # HTTP Helpers + # ======================================================================== + + def _handle_response_error(self, response: requests.Response, method: str, path: str) -> None: + """Extract detailed error from response and raise.""" + if response.status_code >= 400: + try: + error_data = response.json() + error_msg = error_data.get("message", error_data.get("error", str(error_data))) + raise Exception(f"{method} {path} failed: {error_msg}") + except ValueError: + raise Exception(f"{method} {path} failed with status {response.status_code}: {response.text}") + + def _get(self, path: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + headers = self.w.config.authenticate() + url = f"{self.w.config.host}{path}" + response = requests.get(url, headers=headers, params=params or {}, timeout=20) + self._handle_response_error(response, "GET", path) + return response.json() + + def _post(self, path: str, body: Dict[str, Any], timeout: int = 300) -> Dict[str, Any]: + headers = self.w.config.authenticate() + headers["Content-Type"] = "application/json" + url = f"{self.w.config.host}{path}" + response = requests.post(url, headers=headers, json=body, timeout=timeout) + self._handle_response_error(response, "POST", path) + return response.json() + + def _patch(self, path: str, body: Dict[str, Any]) -> Dict[str, Any]: + headers = self.w.config.authenticate() + headers["Content-Type"] = "application/json" + url = f"{self.w.config.host}{path}" + response = requests.patch(url, 
headers=headers, json=body, timeout=20) + self._handle_response_error(response, "PATCH", path) + return response.json() + + def _delete(self, path: str) -> Dict[str, Any]: + headers = self.w.config.authenticate() + url = f"{self.w.config.host}{path}" + response = requests.delete(url, headers=headers, timeout=20) + self._handle_response_error(response, "DELETE", path) + return response.json() + + +# ============================================================================ +# CLI Functions +# ============================================================================ + + +def _get_manager() -> MASManager: + """Get MASManager instance.""" + return MASManager() + + +def _build_agent_list(agents: List[Dict[str, str]]) -> List[Dict[str, Any]]: + """Build agent list for API from simplified config.""" + agent_list = [] + for agent in agents: + agent_name = agent.get("name", "") + agent_description = agent.get("description", "") + + agent_config = { + "name": agent_name, + "description": agent_description, + } + + if agent.get("genie_space_id"): + agent_config["agent_type"] = "genie" + agent_config["genie_space"] = {"id": agent.get("genie_space_id")} + elif agent.get("ka_tile_id"): + ka_tile_id = agent.get("ka_tile_id") + tile_id_prefix = ka_tile_id.split("-")[0] + agent_config["agent_type"] = "serving_endpoint" + agent_config["serving_endpoint"] = {"name": f"ka-{tile_id_prefix}-endpoint"} + elif agent.get("uc_function_name"): + uc_function_name = agent.get("uc_function_name") + uc_parts = uc_function_name.split(".") + agent_config["agent_type"] = "unity_catalog_function" + agent_config["unity_catalog_function"] = { + "uc_path": { + "catalog": uc_parts[0], + "schema": uc_parts[1], + "name": uc_parts[2], + } + } + elif agent.get("connection_name"): + agent_config["agent_type"] = "external_mcp_server" + agent_config["external_mcp_server"] = {"connection_name": agent.get("connection_name")} + else: + agent_config["agent_type"] = "serving_endpoint" + agent_config["serving_endpoint"] = {"name": agent.get("endpoint_name")} + + agent_list.append(agent_config) + return agent_list + + +def create_mas( + name: str, + agents: List[Dict[str, str]], + description: str = None, + instructions: str = None, +) -> Dict[str, Any]: + """Create a new Supervisor Agent.""" + manager = _get_manager() + agent_list = _build_agent_list(agents) + + result = manager.create( + name=name, + agents=agent_list, + description=description, + instructions=instructions, + ) + + mas_data = result.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + + return { + "tile_id": tile_data.get("tile_id", ""), + "name": tile_data.get("name", name), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(agents), + } + + +def get_mas(tile_id: str) -> Dict[str, Any]: + """Get a Supervisor Agent by tile ID.""" + manager = _get_manager() + result = manager.get(tile_id) + + if not result: + return {"error": f"Supervisor Agent {tile_id} not found"} + + mas_data = result.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + + return { + "tile_id": tile_data.get("tile_id", tile_id), + "name": tile_data.get("name", ""), + "description": tile_data.get("description", ""), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents": mas_data.get("agents", []), + # instructions live on the tile, not on the mas_data root + "instructions": tile_data.get("instructions", ""), + 
} + + +def find_mas(name: str) -> Dict[str, Any]: + """Find a Supervisor Agent by name.""" + manager = _get_manager() + result = manager.find_by_name(name) + + if result is None: + return {"found": False, "name": name} + + full_details = manager.get(result.tile_id) + if full_details: + mas_data = full_details.get("multi_agent_supervisor", {}) + status_data = mas_data.get("status", {}) + return { + "found": True, + "tile_id": result.tile_id, + "name": result.name, + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(mas_data.get("agents", [])), + } + + return { + "found": True, + "tile_id": result.tile_id, + "name": result.name, + } + + +def update_mas( + tile_id: str, + name: str = None, + agents: List[Dict[str, str]] = None, + description: str = None, + instructions: str = None, +) -> Dict[str, Any]: + """Update an existing Supervisor Agent.""" + manager = _get_manager() + + existing = manager.get(tile_id) + if not existing: + return {"error": f"Supervisor Agent {tile_id} not found"} + + mas_data = existing.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + + final_name = name or tile_data.get("name", "") + final_description = description or tile_data.get("description", "") + # instructions live on the tile in GET responses, not on the mas_data root + final_instructions = instructions or tile_data.get("instructions", "") + + if agents: + agent_list = _build_agent_list(agents) + else: + agent_list = mas_data.get("agents", []) + + result = manager.update( + tile_id=tile_id, + name=final_name, + description=final_description, + instructions=final_instructions, + agents=agent_list, + ) + + updated_data = result.get("multi_agent_supervisor", {}) + updated_tile = updated_data.get("tile", {}) + updated_status = updated_data.get("status", {}) + + return { + "tile_id": updated_tile.get("tile_id", tile_id), + "name": updated_tile.get("name", final_name), + "endpoint_status": updated_status.get("endpoint_status", "UNKNOWN"), + } + + +def delete_mas(tile_id: str) -> Dict[str, Any]: + """Delete a Supervisor Agent.""" + manager = _get_manager() + try: + manager.delete(tile_id) + return {"success": True, "tile_id": tile_id} + except Exception as e: + return {"success": False, "tile_id": tile_id, "error": str(e)} + + +def list_mas() -> List[Dict[str, Any]]: + """List all Supervisor Agents.""" + manager = _get_manager() + results = [] + + tiles = manager.list_all() + for tile in tiles: + tile_id = tile.get("tile_id") + details = manager.get(tile_id) + if details: + mas_data = details.get("multi_agent_supervisor", {}) + tile_data = mas_data.get("tile", {}) + status_data = mas_data.get("status", {}) + results.append({ + "tile_id": tile_id, + "name": tile_data.get("name", ""), + "endpoint_status": status_data.get("endpoint_status", "UNKNOWN"), + "agents_count": len(mas_data.get("agents", [])), + }) + + return results + + +def add_examples( + tile_id: str, + examples: List[Dict[str, Any]], + wait_for_online: bool = False, + wait_timeout_seconds: int = 900, + poll_interval_seconds: float = 30.0, +) -> Dict[str, Any]: + """Add example questions to a Supervisor Agent. + + Examples can only be added once the MAS endpoint is ONLINE. Right after + create_mas or a big update_mas the endpoint is NOT_READY and takes up to + ~10 min to come ONLINE. + + If wait_for_online=True, this call BLOCKS synchronously until the endpoint + reaches ONLINE (polling every `poll_interval_seconds`, up to + `wait_timeout_seconds`). 
Default timeout 15 min covers the ~10 min + provisioning with some headroom. The caller's process MUST stay alive for + the whole duration โ€” there is no background queue. + + If wait_for_online=False (default) and the endpoint is not ONLINE, returns + an error immediately without adding anything. Retry later once ONLINE. + """ + manager = _get_manager() + + status = get_mas(tile_id) + if "error" in status: + return status + + current = status.get("endpoint_status") + + if current != "ONLINE": + if not wait_for_online: + return { + "error": f"MAS is not ONLINE (status: {current}). " + "Retry once it's ONLINE, or pass wait_for_online=True " + "(--wait on the CLI) to block until it comes up.", + "tile_id": tile_id, + "endpoint_status": current, + } + + # Block-and-poll until ONLINE or timeout. + deadline = time.monotonic() + wait_timeout_seconds + while time.monotonic() < deadline: + current = manager.get_endpoint_status(tile_id) + if current == "ONLINE": + break + time.sleep(poll_interval_seconds) + else: + return { + "error": f"Timed out after {wait_timeout_seconds}s waiting for MAS " + f"to reach ONLINE (last status: {current}).", + "tile_id": tile_id, + "endpoint_status": current, + } + + created = manager.add_examples_batch(tile_id, examples) + return { + "tile_id": tile_id, + "added_count": len(created), + "total_requested": len(examples), + "endpoint_status": current, + } + + +def list_examples(tile_id: str) -> Dict[str, Any]: + """List all examples for a Supervisor Agent.""" + manager = _get_manager() + result = manager.list_examples(tile_id) + return { + "tile_id": tile_id, + "examples": result.get("examples", []), + "count": len(result.get("examples", [])), + } + + +def _print_json(data: Any) -> None: + """Print data as formatted JSON.""" + print(json.dumps(data, indent=2)) + + +def main(): + """CLI entry point.""" + if len(sys.argv) < 2: + print(__doc__) + sys.exit(1) + + command = sys.argv[1] + + if command == "create_mas": + if len(sys.argv) < 4: + print("Usage: python mas_manager.py create_mas NAME '{\"agents\": [...], ...}'") + sys.exit(1) + name = sys.argv[2] + config = json.loads(sys.argv[3]) + result = create_mas( + name=name, + agents=config.get("agents", []), + description=config.get("description"), + instructions=config.get("instructions"), + ) + _print_json(result) + + elif command == "get_mas": + if len(sys.argv) < 3: + print("Usage: python mas_manager.py get_mas TILE_ID") + sys.exit(1) + result = get_mas(sys.argv[2]) + _print_json(result) + + elif command == "find_mas": + if len(sys.argv) < 3: + print("Usage: python mas_manager.py find_mas NAME") + sys.exit(1) + result = find_mas(sys.argv[2]) + _print_json(result) + + elif command == "update_mas": + if len(sys.argv) < 4: + print("Usage: python mas_manager.py update_mas TILE_ID '{\"name\": ..., \"agents\": [...], ...}'") + sys.exit(1) + tile_id = sys.argv[2] + config = json.loads(sys.argv[3]) + result = update_mas( + tile_id=tile_id, + name=config.get("name"), + agents=config.get("agents"), + description=config.get("description"), + instructions=config.get("instructions"), + ) + _print_json(result) + + elif command == "delete_mas": + if len(sys.argv) < 3: + print("Usage: python mas_manager.py delete_mas TILE_ID") + sys.exit(1) + result = delete_mas(sys.argv[2]) + _print_json(result) + + elif command == "list_mas": + result = list_mas() + _print_json(result) + + elif command == "add_examples": + if len(sys.argv) < 4: + print("Usage: python mas_manager.py add_examples TILE_ID '[{\"question\": \"...\", \"guideline\": 
\"...\"}]' [--wait]") + sys.exit(1) + tile_id = sys.argv[2] + examples = json.loads(sys.argv[3]) + wait = "--wait" in sys.argv[4:] + result = add_examples(tile_id, examples, wait_for_online=wait) + _print_json(result) + + elif command == "list_examples": + if len(sys.argv) < 3: + print("Usage: python mas_manager.py list_examples TILE_ID") + sys.exit(1) + result = list_examples(sys.argv[2]) + _print_json(result) + + else: + print(f"Unknown command: {command}") + print(__doc__) + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md index 8f23b166..d7851f00 100644 --- a/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md +++ b/databricks-skills/databricks-aibi-dashboards/1-widget-specifications.md @@ -198,7 +198,7 @@ Format types: `number`, `number-currency`, `number-percent` - `version`: **3** - `widgetType`: "line" or "bar" -- Use `x`, `y`, optional `color` encodings +- **`x` and `y` are both REQUIRED** (one categorical/temporal dimension + one quantitative measure). `color` is optional for splitting into series. - `scale.type`: `"temporal"` (dates), `"quantitative"` (numbers), `"categorical"` (strings) - Use `"disaggregated": true` with pre-aggregated dataset data @@ -297,7 +297,7 @@ Add `format` to any encoding to display values appropriately: ## Dataset Parameters -Use `:param` syntax in SQL for dynamic filtering: +Use `:param` syntax in SQL for dynamic filtering. Parameters can be bound to filter widgets (see [3-filters.md](3-filters.md)): ```json { @@ -323,19 +323,9 @@ Use `:param` syntax in SQL for dynamic filtering: Allowed in `query.fields` (no CAST or complex SQL): ```json -// Aggregations -{"name": "sum(revenue)", "expression": "SUM(`revenue`)"} -{"name": "avg(price)", "expression": "AVG(`price`)"} -{"name": "count(id)", "expression": "COUNT(`id`)"} -{"name": "countdistinct(id)", "expression": "COUNT(DISTINCT `id`)"} - -// Date truncation -{"name": "daily(date)", "expression": "DATE_TRUNC(\"DAY\", `date`)"} -{"name": "weekly(date)", "expression": "DATE_TRUNC(\"WEEK\", `date`)"} -{"name": "monthly(date)", "expression": "DATE_TRUNC(\"MONTH\", `date`)"} - -// Simple reference -{"name": "category", "expression": "`category`"} +{"name": "[sum|avg|count|countdistinct|min|max](col)", "expression": "[SUM|AVG|COUNT|COUNT(DISTINCT)|MIN|MAX](`col`)"} +{"name": "[daily|weekly|monthly](date)", "expression": "DATE_TRUNC(\"[DAY|WEEK|MONTH]\", `date`)"} +{"name": "field", "expression": "`field`"} ``` For conditional logic, compute in dataset SQL instead. diff --git a/databricks-skills/databricks-aibi-dashboards/3-examples.md b/databricks-skills/databricks-aibi-dashboards/3-examples.md deleted file mode 100644 index cb8791dd..00000000 --- a/databricks-skills/databricks-aibi-dashboards/3-examples.md +++ /dev/null @@ -1,308 +0,0 @@ -# Complete Dashboard Examples - -Production-ready templates you can adapt for your use case. 
- -## Basic Dashboard (NYC Taxi) - -```python -import json - -# Step 1: Check table schema -table_info = get_table_stats_and_schema(catalog="samples", schema="nyctaxi") - -# Step 2: Test queries -execute_sql("SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, AVG(trip_distance) as avg_distance FROM samples.nyctaxi.trips") -execute_sql(""" - SELECT pickup_zip, COUNT(*) as trip_count - FROM samples.nyctaxi.trips - GROUP BY pickup_zip - ORDER BY trip_count DESC - LIMIT 10 -""") - -# Step 3: Build dashboard JSON -dashboard = { - "datasets": [ - { - "name": "summary", - "displayName": "Summary Stats", - "queryLines": [ - "SELECT COUNT(*) as trips, AVG(fare_amount) as avg_fare, ", - "AVG(trip_distance) as avg_distance ", - "FROM samples.nyctaxi.trips " - ] - }, - { - "name": "by_zip", - "displayName": "Trips by ZIP", - "queryLines": [ - "SELECT pickup_zip, COUNT(*) as trip_count ", - "FROM samples.nyctaxi.trips ", - "GROUP BY pickup_zip ", - "ORDER BY trip_count DESC ", - "LIMIT 10 " - ] - } - ], - "pages": [{ - "name": "overview", - "displayName": "NYC Taxi Overview", - "pageType": "PAGE_TYPE_CANVAS", - "layoutVersion": "GRID_V1", - "layout": [ - # Text header - NO spec block! Use SEPARATE widgets for title and subtitle! - { - "widget": { - "name": "title", - "multilineTextboxSpec": { - "lines": ["## NYC Taxi Dashboard"] - } - }, - "position": {"x": 0, "y": 0, "width": 12, "height": 1} - }, - { - "widget": { - "name": "subtitle", - "multilineTextboxSpec": { - "lines": ["Trip statistics and analysis"] - } - }, - "position": {"x": 0, "y": 1, "width": 12, "height": 1} - }, - # Counter - version 2, width 4! - { - "widget": { - "name": "total-trips", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "trips", "expression": "`trips`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "trips", "displayName": "Total Trips"} - }, - "frame": {"title": "Total Trips", "showTitle": True} - } - }, - "position": {"x": 0, "y": 2, "width": 4, "height": 3} - }, - { - "widget": { - "name": "avg-fare", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "avg_fare", "expression": "`avg_fare`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "avg_fare", "displayName": "Avg Fare"} - }, - "frame": {"title": "Average Fare", "showTitle": True} - } - }, - "position": {"x": 4, "y": 2, "width": 4, "height": 3} - }, - { - "widget": { - "name": "total-distance", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "summary", - "fields": [{"name": "avg_distance", "expression": "`avg_distance`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "avg_distance", "displayName": "Avg Distance"} - }, - "frame": {"title": "Average Distance", "showTitle": True} - } - }, - "position": {"x": 8, "y": 2, "width": 4, "height": 3} - }, - # Bar chart - version 3 - { - "widget": { - "name": "trips-by-zip", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "by_zip", - "fields": [ - {"name": "pickup_zip", "expression": "`pickup_zip`"}, - {"name": "trip_count", "expression": "`trip_count`"} - ], - "disaggregated": True - } - }], - "spec": { - "version": 3, - "widgetType": "bar", - "encodings": { - "x": {"fieldName": "pickup_zip", "scale": {"type": 
"categorical"}, "displayName": "ZIP"}, - "y": {"fieldName": "trip_count", "scale": {"type": "quantitative"}, "displayName": "Trips"} - }, - "frame": {"title": "Trips by Pickup ZIP", "showTitle": True} - } - }, - "position": {"x": 0, "y": 5, "width": 12, "height": 5} - }, - # Table - version 2, minimal column props! - { - "widget": { - "name": "zip-table", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "by_zip", - "fields": [ - {"name": "pickup_zip", "expression": "`pickup_zip`"}, - {"name": "trip_count", "expression": "`trip_count`"} - ], - "disaggregated": True - } - }], - "spec": { - "version": 2, - "widgetType": "table", - "encodings": { - "columns": [ - {"fieldName": "pickup_zip", "displayName": "ZIP Code"}, - {"fieldName": "trip_count", "displayName": "Trip Count"} - ] - }, - "frame": {"title": "Top ZIP Codes", "showTitle": True} - } - }, - "position": {"x": 0, "y": 10, "width": 12, "height": 5} - } - ] - }] -} - -# Step 4: Deploy -result = manage_dashboard( - action="create_or_update", - display_name="NYC Taxi Dashboard", - parent_path="/Workspace/Users/me/dashboards", - serialized_dashboard=json.dumps(dashboard), - warehouse_id=manage_warehouse(action="get_best"), -) -print(result["url"]) -``` - -## Dashboard with Global Filters - -```python -import json - -# Dashboard with a global filter for region -dashboard_with_filters = { - "datasets": [ - { - "name": "sales", - "displayName": "Sales Data", - "queryLines": [ - "SELECT region, SUM(revenue) as total_revenue ", - "FROM catalog.schema.sales ", - "GROUP BY region" - ] - } - ], - "pages": [ - { - "name": "overview", - "displayName": "Sales Overview", - "pageType": "PAGE_TYPE_CANVAS", - "layoutVersion": "GRID_V1", - "layout": [ - { - "widget": { - "name": "total-revenue", - "queries": [{ - "name": "main_query", - "query": { - "datasetName": "sales", - "fields": [{"name": "total_revenue", "expression": "`total_revenue`"}], - "disaggregated": True - } - }], - "spec": { - "version": 2, # Version 2 for counters! - "widgetType": "counter", - "encodings": { - "value": {"fieldName": "total_revenue", "displayName": "Total Revenue"} - }, - "frame": {"title": "Total Revenue", "showTitle": True} - } - }, - "position": {"x": 0, "y": 0, "width": 12, "height": 3} - } - ] - }, - { - "name": "filters", - "displayName": "Filters", - "pageType": "PAGE_TYPE_GLOBAL_FILTERS", # Required for global filter page! - "layoutVersion": "GRID_V1", - "layout": [ - { - "widget": { - "name": "filter_region", - "queries": [{ - "name": "ds_sales_region", - "query": { - "datasetName": "sales", - "fields": [ - {"name": "region", "expression": "`region`"} - # DO NOT use associative_filter_predicate_group - causes SQL errors! - ], - "disaggregated": False # False for filters! - } - }], - "spec": { - "version": 2, # Version 2 for filters! - "widgetType": "filter-multi-select", # NOT "filter"! - "encodings": { - "fields": [{ - "fieldName": "region", - "displayName": "Region", - "queryName": "ds_sales_region" # Must match query name! - }] - }, - "frame": {"showTitle": True, "title": "Region"} # Always show title! 
- } - }, - "position": {"x": 0, "y": 0, "width": 4, "height": 2} - } - ] - } - ] -} - -# Deploy with filters -result = manage_dashboard( - action="create_or_update", - display_name="Sales Dashboard with Filters", - parent_path="/Workspace/Users/me/dashboards", - serialized_dashboard=json.dumps(dashboard_with_filters), - warehouse_id=manage_warehouse(action="get_best"), -) -print(result["url"]) -``` diff --git a/databricks-skills/databricks-aibi-dashboards/3-filters.md b/databricks-skills/databricks-aibi-dashboards/3-filters.md index 5a4ab497..0c98e494 100644 --- a/databricks-skills/databricks-aibi-dashboards/3-filters.md +++ b/databricks-skills/databricks-aibi-dashboards/3-filters.md @@ -153,7 +153,7 @@ Place filter widget directly on a `PAGE_TYPE_CANVAS` page (same widget structure { "name": "revenue_by_category", "queryLines": [ - "SELECT category, SUM(revenue) as revenue FROM catalog.schema.orders ", + "SELECT category, SUM(revenue) as revenue FROM orders ", "WHERE order_date BETWEEN :date_range.min AND :date_range.max ", "GROUP BY category" ], diff --git a/databricks-skills/databricks-aibi-dashboards/4-examples.md b/databricks-skills/databricks-aibi-dashboards/4-examples.md index 8fa49c5b..81129e2b 100644 --- a/databricks-skills/databricks-aibi-dashboards/4-examples.md +++ b/databricks-skills/databricks-aibi-dashboards/4-examples.md @@ -59,6 +59,8 @@ This example shows a complete dashboard with: - Data table for detailed records - Global filters (date range, region, category) +> **Note**: Queries reference bare table names only (no catalog, no schema). Catalog and schema are set via `--dataset-catalog "my_catalog" --dataset-schema "gold"` when creating the dashboard. These flags only apply when the query omits catalog/schema โ€” they will NOT override anything you hardcode in the `FROM` clause. + ```json { "datasets": [ @@ -67,7 +69,7 @@ This example shows a complete dashboard with: "displayName": "Daily Sales", "queryLines": [ "SELECT sale_date, region, department, total_orders, total_units, total_revenue, total_cost, profit_margin ", - "FROM catalog.schema.gold_daily_sales ", + "FROM daily_sales ", "ORDER BY sale_date" ] }, @@ -76,7 +78,7 @@ This example shows a complete dashboard with: "displayName": "Product Performance", "queryLines": [ "SELECT product_id, product_name, department, region, units_sold, revenue, cost, profit ", - "FROM catalog.schema.gold_product_performance" + "FROM product_performance" ] } ], diff --git a/databricks-skills/databricks-aibi-dashboards/SKILL.md b/databricks-skills/databricks-aibi-dashboards/SKILL.md index 426e6024..f5b6da29 100644 --- a/databricks-skills/databricks-aibi-dashboards/SKILL.md +++ b/databricks-skills/databricks-aibi-dashboards/SKILL.md @@ -1,82 +1,226 @@ --- name: databricks-aibi-dashboards -description: "Create Databricks AI/BI dashboards. Use when creating, updating, or deploying Lakeview dashboards. CRITICAL: You MUST test ALL SQL queries via execute_sql BEFORE deploying. Follow guidelines strictly." +description: "Create Databricks AI/BI dashboards. Must use when creating, updating, or deploying Lakeview dashboards as Databricks Dashboard have a unique json structure. CRITICAL: You MUST test ALL SQL queries via CLI BEFORE deploying. Follow guidelines strictly." --- # AI/BI Dashboard Skill -Create Databricks AI/BI dashboards (formerly Lakeview dashboards). **Follow these guidelines strictly.** +Create Databricks AI/BI dashboards (formerly Lakeview dashboards). 
+A dashboard should be showing something relevant for a human, typically some KPI on the top, and based on the story, some graph (often temporal), and we see "something happens". +**Follow these guidelines strictly.** -## CRITICAL: MANDATORY VALIDATION WORKFLOW +## Quick Reference -**You MUST follow this workflow exactly. Skipping validation causes broken dashboards.** +| Task | Command | +|------|---------| +| List warehouses | `databricks warehouses list` | +| List tables | `databricks experimental aitools tools query --warehouse WH "SHOW TABLES IN catalog.schema"` | +| Get schema | `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` | +| Test query | `databricks experimental aitools tools query --warehouse WH "SELECT..."` | +| Create dashboard | `databricks lakeview create --display-name "X" --warehouse-id "WH" --dataset-catalog CATALOG --dataset-schema SCHEMA --serialized-dashboard "$(cat file.json)" --json '{"parent_path": "/Workspace/Users//path"}'` โ€” `parent_path` is JSON-only (no flag); everything else stays as flags. Queries must use bare table names. | +| Update dashboard | `databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat file.json)"` | +| Publish | `databricks lakeview publish DASHBOARD_ID --warehouse-id WH` | +| Delete | `databricks lakeview trash DASHBOARD_ID` | +--- + +## CRITICAL: Widget Version Requirements + +> **Wrong version = broken widget!** This is the #1 cause of dashboard errors. + +| Widget Type | Version | Notes | +|-------------|---------|-------| +| `counter` | **2** | KPI cards | +| `table` | **2** | Data tables | +| `bar`, `line`, `area`, `pie`, `scatter` | **3** | Charts | +| `combo`, `choropleth-map` | **1** | Advanced charts | +| `filter-*` | **2** | All filter types | + +--- + +## NEW DASHBOARD CREATION WORKFLOW + +**You MUST test ALL SQL queries via CLI BEFORE deploying. Follow the overall logic in these steps for new dashboard - Skipping validation causes broken dashboards.** + +### Step 1: Get Warehouse ID if not already known + +```bash +# List warehouses to find one for SQL execution +databricks warehouses list ``` -โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” -โ”‚ STEP 1: Get table schemas via get_table_stats_and_schema(catalog, schema) โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ STEP 2: Write SQL queries for each dataset โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ STEP 3: TEST EVERY QUERY via execute_sql() โ† DO NOT SKIP! 
โ”‚ -โ”‚ - If query fails, FIX IT before proceeding โ”‚ -โ”‚ - Verify column names match what widgets will reference โ”‚ -โ”‚ - Verify data types are correct (dates, numbers, strings) โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ STEP 4: Build dashboard JSON using ONLY verified queries โ”‚ -โ”œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”ค -โ”‚ STEP 5: Deploy via manage_dashboard(action="create_or_update") โ”‚ -โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ + +### Step 2: Discover Table Schemas and existing data pattern + +A good dashboard comes from knowing the data first. Spend time here โ€” the exploration drives design decisions in Step 4 (which widgets, which filters, which groupings). + +Use `discover-schema` as the default โ€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. + +`databricks experimental aitools tools discover-schema catalog.schema.orders catalog.schema.customers` + +Sample rows alone don't tell you what to build. you can write aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` to probe typically: + +- **Cardinality** of candidate grouping columns โ†’ decides chart color-group vs. table (โ‰ค8 distinct values for charts, see Cardinality & Readability below). +- **Top categorical values** โ†’ populates filter options and chart legends meaningfully. +- **Numeric distribution** (min/max/avg/percentiles) โ†’ decides KPI with delta vs. trend chart (flat metrics shouldn't be line charts, see Data Variance Considerations below). +- **Trend viability** at daily/weekly/monthly grain โ†’ picks the right trend granularity. +- **Story confirmation** โ€” run the aggregations you plan to put in the dashboard and check they're not flat, empty, or uninteresting. Fix the query or adjust the story before moving on. + +Fan out independent probes (state โˆˆ `PENDING|RUNNING|SUCCEEDED|FAILED|CANCELED|CLOSED`): + +```bash +submit() { databricks api post /api/2.0/sql/statements --json "$(jq -nc --arg w "$1" --arg s "$2" '{warehouse_id:$w,statement:$s,wait_timeout:"0s",on_wait_timeout:"CONTINUE"}')" | jq -r .statement_id; } +SIDS=(); for q in "$@"; do SIDS+=( "$(submit "$WH" "$q")" ); done +for s in "${SIDS[@]}"; do databricks api get "/api/2.0/sql/statements/$s" | jq '{state:.status.state, rows:.result.data_array}'; done +# cancel: databricks api post "/api/2.0/sql/statements/$SID/cancel" ``` -**WARNING: If you deploy without testing queries, widgets WILL show "Invalid widget definition" errors!** - -## Available MCP Tools - -| Tool | Description | -|------|-------------| -| `get_table_stats_and_schema` | **STEP 1**: Get table schemas for designing queries | -| `execute_sql` | **STEP 3**: Test SQL queries - MANDATORY before deployment! 
|
-| `manage_warehouse` (action="get_best") | Get available warehouse ID |
-| `manage_dashboard` | **STEP 5**: Dashboard lifecycle management (see actions below) |
-
-### manage_dashboard Actions
-
-| Action | Description | Required Params |
-|--------|-------------|-----------------|
-| `create_or_update` | Deploy dashboard JSON (only after validation!) | display_name, parent_path, serialized_dashboard, warehouse_id |
-| `get` | Get dashboard details by ID | dashboard_id |
-| `list` | List all dashboards | (none) |
-| `delete` | Move dashboard to trash | dashboard_id |
-| `publish` | Publish a dashboard | dashboard_id, warehouse_id |
-| `unpublish` | Unpublish a dashboard | dashboard_id |
-
-**Example usage:**
-```python
-# Create/update dashboard
-manage_dashboard(
-    action="create_or_update",
-    display_name="Sales Dashboard",
-    parent_path="/Workspace/Users/me/dashboards",
-    serialized_dashboard=dashboard_json,
-    warehouse_id="abc123",
-    publish=True  # auto-publish after create
-)
-# Get dashboard details
-manage_dashboard(action="get", dashboard_id="dashboard_123")
+
+> **Dashboard queries are different** — inside the dashboard JSON, the `FROM` clause must reference ONLY the table name, with no catalog or schema prefix:
+> - ✅ Correct: `FROM trips`
+> - ❌ Wrong: `FROM nyctaxi.trips`
+> - ❌ Wrong: `FROM samples.nyctaxi.trips`
+>
+> The catalog and schema are supplied separately via the `--dataset-catalog` and `--dataset-schema` flags when you run `databricks lakeview create`. These flags do NOT rewrite the query — they only fill in the catalog/schema when the query omits them. If you hardcode a catalog or schema in the `FROM` clause, the flags are ignored for that query and the dashboard won't be portable across environments.
+
+### Step 3: Verify Data Matches Story
+Every dataset's `queryLines` SQL in the dashboard JSON (see example below) must be tested before you deploy.
+
+Before finalizing, run the SQL queries you intend to add to each dataset and confirm that they run properly and that the results are valid.
+This is crucial, as the widgets defined in the JSON use the query output to render their visualizations. The values should also make sense at a business level.
+Remember that for a filter to work, the dataset query must expose the filter field (so typically group by the filter field).
+
+If values don't match expectations, ensure the query is correct, fix the data if you can, or adjust the story before creating the dashboard.
+
+### Step 4: Plan Dashboard Structure
+
+Before writing JSON, plan your dashboard:
+
+1. You must know the expected JSON structure. For this, **read the reference files**: [1-widget-specifications.md](1-widget-specifications.md), [3-filters.md](3-filters.md), [4-examples.md](4-examples.md)
+
+2. Think: **What widgets?** Map each visualization to a dataset:
+   | Widget | Type | Dataset | Has filter field? |
+   |--------|------|---------|-------------------|
+   | Revenue KPI | counter | ds_sales | ✓ date, region |
+   | Trend Chart | line | ds_sales | ✓ date, region |
+   | Top Products | table | ds_products | ✗ no date |
+   ...
+
+3. **What filters?** For each filter, verify ALL datasets you want filtered contain the filter field.
+   > **Filters only affect datasets that have the filter field.** A pre-aggregated table without dates WON'T be date-filtered.
+
+4. **Write JSON locally** as a file.
+
+### Step 5: Dashboard Lifecycle
+Once the JSON file is written locally, create and manage the dashboard as follows:
+```bash
+# Create a dashboard — canonical form.
Everything is a flag EXCEPT parent_path +# (JSON-only, no flag โ€” without it, dashboard lands at /Users//). +# --dataset-catalog/--dataset-schema inject `catalog`/`schema` into each saved +# dataset; queries inside dashboard.json must use bare table names. +databricks lakeview create \ + --display-name "My Dashboard" \ + --warehouse-id "abc123def456" \ + --dataset-catalog "my_catalog" \ + --dataset-schema "my_schema" \ + --serialized-dashboard "$(cat dashboard.json)" \ + --json '{"parent_path": "/Workspace/Users/me@co.com/dashboards"}' # List all dashboards -manage_dashboard(action="list") +databricks lakeview list + +# Get dashboard details +databricks lakeview get DASHBOARD_ID + +# Update a dashboard +databricks lakeview update DASHBOARD_ID --serialized-dashboard "$(cat dashboard.json)" + +# Publish a dashboard +databricks lakeview publish DASHBOARD_ID --warehouse-id WAREHOUSE_ID + +# Unpublish a dashboard +databricks lakeview unpublish DASHBOARD_ID + +# Delete (trash) a dashboard +databricks lakeview trash DASHBOARD_ID + +# By default, after creation, tag dashboards to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + dashboards DASHBOARD_ID aidevkit_project --tag-value ai-dev-kit +``` + +--- + +## JSON Structure (Required Skeleton) + +Every dashboard's `serialized_dashboard` content must follow this exact structure: + +```json +{ + "datasets": [ + { + "name": "ds_x", + "displayName": "Dataset X", + "queryLines": ["SELECT col1, col2 ", "FROM my_table"] + } + ], + "pages": [ + { + "name": "main", + "displayName": "Main", + "pageType": "PAGE_TYPE_CANVAS", + "layout": [ + {"widget": {/* INLINE widget definition */}, "position": {"x":0,"y":0,"width":2,"height":3}} + ] + } + ] +} ``` +**Structural rules (violations cause "failed to parse serialized dashboard"):** +- `queryLines`: Array of strings, NOT `"query": "string"` +- Widgets: INLINE in `layout[].widget`, NOT a separate `"widgets"` array +- `pageType`: Required on every page (`PAGE_TYPE_CANVAS` or `PAGE_TYPE_GLOBAL_FILTERS`) +- Query binding: `query.fields[].name` must exactly match `encodings.*.fieldName` + +### Linking a Genie Space (Optional) + +To add an "Ask Genie" button to the dashboard, or to link a genie space/room with an ID, add `uiSettings.genieSpace` to the JSON: + +```json +{ + "datasets": [...], + "pages": [...], + "uiSettings": { + "genieSpace": { + "isEnabled": true, + "overrideId": "your-genie-space-id-here", + "enablementMode": "ENABLED" + } + } +} +``` + +> **Genie is NOT a widget.** Link via `uiSettings.genieSpace` only. There is no `"widgetType": "assistant"`. + +--- + +## Design Best Practices + +Apply unless user specifies otherwise: +- **Global date filter**: When data has temporal columns, add a date range filter. Most dashboards need time-based filtering. +- **KPI time bounds**: Use time-bounded metrics that enable period comparison (MoM, YoY). Unbounded "all-time" totals are less actionable. +- **Value formatting**: Format values based on their meaning โ€” currency with symbol, percentages with %, large numbers compacted (K/M/B). +- **Chart selection**: Match cardinality to chart type. Few distinct values โ†’ bar with color grouping (or pie if you really want a snapshot); many values โ†’ table. + ## Reference Files | What are you building? 
| Reference | |------------------------|-----------| | Any widget (text, counter, table, chart) | [1-widget-specifications.md](1-widget-specifications.md) | -| Dashboard with filters (global or page-level) | [2-filters.md](2-filters.md) | -| Need a complete working template to adapt | [3-examples.md](3-examples.md) | -| Debugging a broken dashboard | [4-troubleshooting.md](4-troubleshooting.md) | +| Advanced charts (area, scatter/Bubble, combo (Line+Bar), Choropleth map) | [2-advanced-widget-specifications.md](2-advanced-widget-specifications.md) | +| Dashboard with filters (global or page-level) | [3-filters.md](3-filters.md) | +| Need a complete working template to adapt | [4-examples.md](4-examples.md) | +| Debugging a broken dashboard | [5-troubleshooting.md](5-troubleshooting.md) | --- @@ -84,12 +228,16 @@ manage_dashboard(action="list") ### 1) DATASET ARCHITECTURE -- **One dataset per domain** (e.g., orders, customers, products) +- **One dataset per domain** (e.g., orders, customers, products). Datasets shared across widgets benefit from the same filters. - **Exactly ONE valid SQL query per dataset** (no multiple queries separated by `;`) -- Always use **fully-qualified table names**: `catalog.schema.table_name` +- **Queries must use bare table names only** โ€” no catalog, no schema prefix. Example: `FROM orders`, never `FROM gold.orders` or `FROM main.gold.orders`. The catalog and schema come from the `--dataset-catalog` and `--dataset-schema` flags at creation time. These flags only fill in missing parts โ€” they do NOT override any catalog/schema written in the query. - SELECT must include all dimensions needed by widgets and all derived columns via `AS` aliases - Put ALL business logic (CASE/WHEN, COALESCE, ratios) into the dataset SELECT with explicit aliases - **Contract rule**: Every widget `fieldName` must exactly match a dataset column or alias +- **Add ORDER BY** when visualization depends on data order: + - Time series: `ORDER BY date` for chronological display + - Rankings/Top-N: `ORDER BY metric DESC LIMIT 10` for "Top 10" charts + - Categorical charts: `ORDER BY metric DESC` to show largest values first ### 2) WIDGET FIELD EXPRESSIONS @@ -117,26 +265,10 @@ manage_dashboard(action="list") Allowed expressions in widget queries (you CANNOT use CAST or other SQL in expressions): -**For numbers:** -```json -{"name": "sum(revenue)", "expression": "SUM(`revenue`)"} -{"name": "avg(price)", "expression": "AVG(`price`)"} -{"name": "count(orders)", "expression": "COUNT(`order_id`)"} -{"name": "countdistinct(customers)", "expression": "COUNT(DISTINCT `customer_id`)"} -{"name": "min(date)", "expression": "MIN(`order_date`)"} -{"name": "max(date)", "expression": "MAX(`order_date`)"} -``` - -**For dates** (use daily for timeseries, weekly/monthly for grouped comparisons): ```json -{"name": "daily(date)", "expression": "DATE_TRUNC(\"DAY\", `date`)"} -{"name": "weekly(date)", "expression": "DATE_TRUNC(\"WEEK\", `date`)"} -{"name": "monthly(date)", "expression": "DATE_TRUNC(\"MONTH\", `date`)"} -``` - -**Simple field reference** (for pre-aggregated data): -```json -{"name": "category", "expression": "`category`"} +{"name": "[sum|avg|count|countdistinct|min|max](col)", "expression": "[SUM|AVG|COUNT|COUNT(DISTINCT)|MIN|MAX](`col`)"} +{"name": "[daily|weekly|monthly](date)", "expression": "DATE_TRUNC(\"[DAY|WEEK|MONTH]\", `date`)"} +{"name": "field", "expression": "`field`"} ``` If you need conditional logic or multi-field formulas, compute a derived column in the dataset SQL first. 
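
To make the contract concrete, here is a minimal sketch of one dataset plus one bar widget that follows the rules above. Every name in it (`ds_orders`, the `orders` table, and the `order_date`, `amount`, `discount`, and `region` columns) is an illustrative assumption rather than a real schema: the derived column `net_revenue` is computed once in the dataset SQL, the widget fields use only the allowed `monthly(...)` and `sum(...)` patterns, and each `fields[].name` matches its `encodings.*.fieldName` exactly.

```json
{
  "datasets": [
    {
      "name": "ds_orders",
      "displayName": "Orders",
      "queryLines": [
        "SELECT order_date, region, amount - discount AS net_revenue ",
        "FROM orders ",
        "ORDER BY order_date"
      ]
    }
  ],
  "pages": [
    {
      "name": "main",
      "displayName": "Main",
      "pageType": "PAGE_TYPE_CANVAS",
      "layoutVersion": "GRID_V1",
      "layout": [
        {
          "widget": {
            "name": "net-revenue-by-month",
            "queries": [
              {
                "name": "main_query",
                "query": {
                  "datasetName": "ds_orders",
                  "fields": [
                    {"name": "monthly(order_date)", "expression": "DATE_TRUNC(\"MONTH\", `order_date`)"},
                    {"name": "sum(net_revenue)", "expression": "SUM(`net_revenue`)"}
                  ],
                  "disaggregated": false
                }
              }
            ],
            "spec": {
              "version": 3,
              "widgetType": "bar",
              "encodings": {
                "x": {"fieldName": "monthly(order_date)", "scale": {"type": "temporal"}, "displayName": "Month"},
                "y": {"fieldName": "sum(net_revenue)", "scale": {"type": "quantitative"}, "displayName": "Net Revenue"}
              },
              "frame": {"title": "Net Revenue by Month", "showTitle": true}
            }
          },
          "position": {"x": 0, "y": 0, "width": 12, "height": 6}
        }
      ]
    }
  ]
}
```

Because the widget aggregates with `SUM` and `DATE_TRUNC`, `disaggregated` is `false`; a counter reading a single pre-aggregated row would instead use `disaggregated: true` (see the checklist below). The `region` column is selected even though this widget ignores it, so a region filter could later bind to the same dataset.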
@@ -165,13 +297,20 @@ Each widget has a position: `{"x": 0, "y": 0, "width": 4, "height": 4}` **CRITICAL**: Each row must fill width=12 exactly. No gaps allowed. +``` +CORRECT: WRONG: +y=0: [w=12] y=0: [w=8]____ โ† gap! +y=1: [w=4][w=4][w=4] โ† fills 12 y=1: [w=2][w=2][w=2][w=2]__ โ† gap! +y=4: [w=6][w=6] โ† fills 12 +``` + **Recommended widget sizes:** | Widget Type | Width | Height | Notes | |-------------|-------|--------|-------| | Text header | 12 | 1 | Full width; use SEPARATE widgets for title and subtitle | | Counter/KPI | 4 | **3-4** | **NEVER height=2** - too cramped! | -| Line/Bar chart | 6 | **5-6** | Pair side-by-side to fill row | +| Line/Bar/Area chart | 6 | **5-6** | Pair side-by-side to fill row | | Pie chart | 6 | **5-6** | Needs space for legend | | Full-width chart | 12 | 5-7 | For detailed time series | | Table | 12 | 5-8 | Full width for readability | @@ -194,11 +333,11 @@ y=12: Table (w=12, h=6) - Detailed data | Dimension Type | Max Values | Examples | |----------------|------------|----------| | Chart color/groups | **3-8** | 4 regions, 5 product lines, 3 tiers | -| Filters | 4-10 | 8 countries, 5 channels | +| Filters | 4-15 | 8 countries, 5 channels | | High cardinality | **Table only** | customer_id, order_id, SKU | **Before creating any chart with color/grouping:** -1. Check column cardinality (use `get_table_stats_and_schema` to see distinct values) +1. Check column cardinality via discover-schema or a COUNT DISTINCT query 2. If >10 distinct values, aggregate to higher level OR use TOP-N + "Other" bucket 3. For high-cardinality dimensions, use a table widget instead of a chart @@ -209,13 +348,29 @@ Before deploying, verify: 2. **Every page has `"layoutVersion": "GRID_V1"`** 3. All rows sum to width=12 with no gaps 4. KPIs use height 3-4, charts use height 5-6 -5. Chart dimensions have โ‰ค8 distinct values +5. Chart dimensions have reasonable cardinality (โ‰ค8 for colors/groups) 6. All widget fieldNames match dataset columns exactly 7. **Field `name` in query.fields matches `fieldName` in encodings exactly** (e.g., both `"sum(spend)"`) 8. Counter datasets: use `disaggregated: true` for 1-row datasets, `disaggregated: false` with aggregation for multi-row -9. Percent values are 0-1 (not 0-100) +9. **Percent values must be 0-1 for `number-percent` format** (0.865 displays as "86.5%", don't forget to set the format). If data is 0-100, either divide by 100 in SQL or use `number` format instead. 10. SQL uses Spark syntax (date_sub, not INTERVAL) -11. **All SQL queries tested via `execute_sql` and return expected data** +11. **All SQL queries tested via CLI and return expected data** +12. **Every dataset you want filtered MUST contain the filter field** โ€” filters only affect datasets with that column in their query + +--- + +## Data Variance Considerations + +Before creating trend charts, check if the metric has enough variance to visualize meaningfully: + +```sql +SELECT MIN(metric), MAX(metric), MAX(metric) - MIN(metric) as range FROM dataset +``` + +If the range is very small relative to the scale (e.g., 83-89% on a 0-100 scale), the chart will appear nearly flat. 
Consider:
+- Showing as KPI with delta/comparison instead of chart
+- Using a table to display exact values
+- Adjusting the visualization to focus on the variance

---
diff --git a/databricks-skills/databricks-app-python/4-deployment.md b/databricks-skills/databricks-app-python/4-deployment.md
index 0d0ab9f2..111f59b7 100644
--- a/databricks-skills/databricks-app-python/4-deployment.md
+++ b/databricks-skills/databricks-app-python/4-deployment.md
@@ -1,6 +1,6 @@
# Deploying Databricks Apps

-Three deployment options: Databricks CLI (simplest), Asset Bundles (multi-environment), or MCP tools (programmatic).
+Three deployment options: Databricks CLI (simplest), Asset Bundles (multi-environment), or CLI commands (programmatic).

**Cookbook deployment guide**: https://apps-cookbook.dev/docs/deploy

@@ -45,13 +45,15 @@ If you use `databricks workspace import-dir` directly, it does **not** apply the

### Step 2: Create and Deploy

+`--overwrite` on `workspace import-dir` is required for redeploys — without it the CLI **silently skips files that already exist**, so your updated code never makes it to the workspace and the app keeps running the old version. Harmless on the first deploy.
+
```bash
# Create the app
databricks apps create 

# Upload source code (make sure to exclude node_modules, venv, etc.)
databricks workspace mkdirs /Workspace/Users//apps/
-databricks workspace import-dir . /Workspace/Users//apps/
+databricks workspace import-dir . /Workspace/Users//apps/ --overwrite

# Deploy
databricks apps deploy \
  --source-code-path /Workspace/Users//apps/
@@ -65,9 +67,11 @@ databricks apps get

### Redeployment

+Use this recipe after the initial deploy, when you want a clean upload (stale files removed). On a first-ever deploy the `workspace delete` line errors because the directory doesn't exist yet — either run Step 2 first, or append ` 2>/dev/null || true` to the delete command if you want this recipe to double as a first deploy.
+
```bash
databricks workspace delete /Workspace/Users//apps/ --recursive
-databricks workspace import-dir . /Workspace/Users//apps/
+databricks workspace import-dir . /Workspace/Users//apps/ --overwrite
databricks apps deploy \
  --source-code-path /Workspace/Users//apps/
```
@@ -115,9 +119,9 @@ For complete DABs guidance, use the **databricks-bundles** skill.

---

-## Option 3: MCP Tools
+## Option 3: CLI Commands

-For programmatic app lifecycle management, see [6-mcp-approach.md](6-mcp-approach.md).
+For CLI-based app lifecycle management, see [6-cli-approach.md](6-cli-approach.md).

---

diff --git a/databricks-skills/databricks-app-python/6-cli-approach.md b/databricks-skills/databricks-app-python/6-cli-approach.md
new file mode 100644
index 00000000..0636511e
--- /dev/null
+++ b/databricks-skills/databricks-app-python/6-cli-approach.md
@@ -0,0 +1,93 @@
+# CLI Commands for App Lifecycle
+
+Use the Databricks CLI to create, deploy, and manage Databricks Apps.
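
A minimal end-to-end sketch of that lifecycle is shown below; it folds the upload-and-redeploy guidance from [4-deployment.md](4-deployment.md) into one idempotent script. The app name, workspace path, and local source directory are assumptions to adjust for your environment.

```bash
#!/usr/bin/env bash
# Idempotent upload + deploy: safe for both the first deploy and later redeploys.
# Assumes the app was already created once with `databricks apps create` (see Step 3 below).
set -euo pipefail

APP_NAME="my-dashboard"                                        # assumed app name
SRC_DIR="."                                                    # assumed local source directory
WS_PATH="/Workspace/Users/user@example.com/apps/my-dashboard"  # assumed workspace path

# Remove any stale copy; ignore the error on a first-ever deploy (path does not exist yet).
databricks workspace delete "$WS_PATH" --recursive 2>/dev/null || true

# Re-upload the source; --overwrite prevents silently skipping files that already exist.
databricks workspace import-dir "$SRC_DIR" "$WS_PATH" --overwrite

# Deploy from the uploaded workspace source, then check status and logs.
databricks apps deploy "$APP_NAME" --source-code-path "$WS_PATH"
databricks apps get "$APP_NAME"
databricks apps logs "$APP_NAME"
```

Because the delete tolerates a missing path and the import always overwrites, the same script can be re-run on every pass of the iteration loop in Step 5 below.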
+ +--- + +## databricks apps - App Lifecycle Management + +```bash +# List all apps +databricks apps list + +# Create an app +databricks apps create --name my-dashboard --json '{"description": "Customer analytics dashboard"}' + +# Get app details +databricks apps get my-dashboard + +# Deploy an app (from workspace source code) +databricks apps deploy my-dashboard --source-code-path /Workspace/Users/user@example.com/my_app + +# Get app logs +databricks apps logs my-dashboard + +# Delete an app +databricks apps delete my-dashboard + +# By default, after creation, tag apps to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + apps my-dashboard aidevkit_project --tag-value ai-dev-kit +``` + +--- + +## Workflow + +### Step 1: Write App Files Locally + +Create your app files in a local folder: + +``` +my_app/ +โ”œโ”€โ”€ app.py # Main application +โ”œโ”€โ”€ models.py # Pydantic models +โ”œโ”€โ”€ backend.py # Data access layer +โ”œโ”€โ”€ requirements.txt # Additional dependencies +โ””โ”€โ”€ app.yaml # Databricks Apps configuration +``` + +### Step 2: Upload to Workspace + +`--overwrite` is required for redeploys โ€” without it the CLI **silently skips files that already exist**, so your updated code never makes it to the workspace and the app keeps running the old version. Harmless on the first deploy. + +```bash +# Upload local folder to workspace +databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app --overwrite +``` + +### Step 3: Create and Deploy App + +```bash +# Create the app +databricks apps create --name my-dashboard --json '{"description": "Customer analytics dashboard"}' + +# Deploy from workspace source +databricks apps deploy my-dashboard --source-code-path /Workspace/Users/user@example.com/my_app +``` + +### Step 4: Verify + +```bash +# Check app status +databricks apps get my-dashboard + +# Check logs for errors +databricks apps logs my-dashboard +``` + +### Step 5: Iterate + +1. Fix issues in local files +2. Re-upload with `databricks workspace import-dir /path/to/my_app /Workspace/Users/user@example.com/my_app --overwrite` +3. Re-deploy with `databricks apps deploy my-dashboard --source-code-path ...` +4. Check `databricks apps logs my-dashboard` for errors +5. Repeat until app is healthy + +--- + +## Notes + +- Add resources (SQL warehouse, Lakebase, etc.) via the Databricks Apps UI after creating the app +- CLI uses your configured profile's credentials โ€” ensure you have access to required resources +- For DABs deployment, see [4-deployment.md](4-deployment.md) diff --git a/databricks-skills/databricks-app-python/6-mcp-approach.md b/databricks-skills/databricks-app-python/6-mcp-approach.md deleted file mode 100644 index 943c49ba..00000000 --- a/databricks-skills/databricks-app-python/6-mcp-approach.md +++ /dev/null @@ -1,79 +0,0 @@ -# MCP Tools for App Lifecycle - -Use MCP tools to create, deploy, and manage Databricks Apps programmatically. This mirrors the CLI workflow but can be invoked by AI agents. 
- ---- - -## manage_app - App Lifecycle Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Idempotent create, deploys if source_code_path provided | name | -| `get` | Get app details (with optional logs) | name | -| `list` | List all apps | (none, optional name_contains filter) | -| `delete` | Delete an app | name | - ---- - -## Workflow - -### Step 1: Write App Files Locally - -Create your app files in a local folder: - -``` -my_app/ -โ”œโ”€โ”€ app.py # Main application -โ”œโ”€โ”€ models.py # Pydantic models -โ”œโ”€โ”€ backend.py # Data access layer -โ”œโ”€โ”€ requirements.txt # Additional dependencies -โ””โ”€โ”€ app.yaml # Databricks Apps configuration -``` - -### Step 2: Upload to Workspace - -```python -# MCP Tool: manage_workspace_files -manage_workspace_files( - action="upload", - local_path="/path/to/my_app", - workspace_path="/Workspace/Users/user@example.com/my_app" -) -``` - -### Step 3: Create and Deploy App - -```python -# MCP Tool: manage_app (creates if needed + deploys) -result = manage_app( - action="create_or_update", - name="my-dashboard", - description="Customer analytics dashboard", - source_code_path="/Workspace/Users/user@example.com/my_app" -) -# Returns: {"name": "my-dashboard", "url": "...", "created": True, "deployment": {...}} -``` - -### Step 4: Verify - -```python -# MCP Tool: manage_app (get with logs) -app = manage_app(action="get", name="my-dashboard", include_logs=True) -# Returns: {"name": "...", "url": "...", "status": "RUNNING", "logs": "...", ...} -``` - -### Step 5: Iterate - -1. Fix issues in local files -2. Re-upload with `manage_workspace_files(action="upload", ...)` -3. Re-deploy with `manage_app(action="create_or_update", ...)` (will update existing + deploy) -4. Check `manage_app(action="get", name=..., include_logs=True)` for errors -5. Repeat until app is healthy - ---- - -## Notes - -- Add resources (SQL warehouse, Lakebase, etc.) via the Databricks Apps UI after creating the app -- MCP tools use the service principal's permissions โ€” ensure it has access to required resources -- For manual deployment, see [4-deployment.md](4-deployment.md) diff --git a/databricks-skills/databricks-app-python/SKILL.md b/databricks-skills/databricks-app-python/SKILL.md index 777d3377..c5a595a5 100644 --- a/databricks-skills/databricks-app-python/SKILL.md +++ b/databricks-skills/databricks-app-python/SKILL.md @@ -72,7 +72,7 @@ Copy this checklist and verify each item: **Lakebase**: Use [5-lakebase.md](5-lakebase.md) when using Lakebase (PostgreSQL) as your app's data layer โ€” covers auto-injected env vars, psycopg2/asyncpg patterns, and when to choose Lakebase vs SQL warehouse. (Keywords: Lakebase, PostgreSQL, psycopg2, asyncpg, transactional, PGHOST) -**MCP tools**: Use [6-mcp-approach.md](6-mcp-approach.md) for managing app lifecycle via MCP tools โ€” covers creating, deploying, monitoring, and deleting apps programmatically. (Keywords: MCP, create app, deploy app, app logs) +**CLI commands**: Use [6-cli-approach.md](6-cli-approach.md) for managing app lifecycle via CLI โ€” covers creating, deploying, monitoring, and deleting apps. (Keywords: CLI, create app, deploy app, app logs) **Foundation Models**: See [examples/llm_config.py](examples/llm_config.py) for calling Databricks foundation model APIs โ€” covers OAuth M2M auth, OpenAI-compatible client wiring, and token caching. 
(Keywords: foundation model, LLM, OpenAI client, chat completions) @@ -87,7 +87,7 @@ Copy this checklist and verify each item: **Connecting to data/resources?** โ†’ Read [2-app-resources.md](2-app-resources.md) **Using Lakebase (PostgreSQL)?** โ†’ Read [5-lakebase.md](5-lakebase.md) **Deploying to Databricks?** โ†’ Read [4-deployment.md](4-deployment.md) - **Using MCP tools?** โ†’ Read [6-mcp-approach.md](6-mcp-approach.md) + **Using CLI for app lifecycle?** โ†’ Read [6-cli-approach.md](6-cli-approach.md) **Calling foundation model/LLM APIs?** โ†’ See [examples/llm_config.py](examples/llm_config.py) 2. Follow the instructions in the relevant guide @@ -207,5 +207,5 @@ class EntityIn(BaseModel): - **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps with FastAPI + React - **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps via DABs - **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - backend SDK integration -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - adding persistent PostgreSQL state +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - adding persistent PostgreSQL state (autoscaling managed PG with branching) - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving ML models for app integration diff --git a/databricks-skills/databricks-config/SKILL.md b/databricks-skills/databricks-config/SKILL.md index 118713d1..21728f19 100644 --- a/databricks-skills/databricks-config/SKILL.md +++ b/databricks-skills/databricks-config/SKILL.md @@ -3,20 +3,144 @@ name: databricks-config description: "Manage Databricks workspace connections: check current workspace, switch profiles, list available workspaces, or authenticate to a new workspace. Use when the user mentions \"switch workspace\", \"which workspace\", \"current profile\", \"databrickscfg\", \"connect to workspace\", or \"databricks auth\"." --- -Use the `manage_workspace` MCP tool for all workspace operations. Do NOT edit `~/.databrickscfg`, use Bash, or use the Databricks CLI. +Use the Databricks CLI for all workspace operations. -## Steps +## CLI Commands -1. Call `ToolSearch` with query `select:mcp__databricks__manage_workspace` to load the tool. +### Check Current Workspace -2. Map user intent to action: - - status / which workspace / current โ†’ `action="status"` - - list / available workspaces โ†’ `action="list"` - - switch to X โ†’ call `list` first to find the profile name, then `action="switch", profile=""` (or `host=""` if a URL was given) - - login / connect / authenticate โ†’ `action="login", host=""` +```bash +# Show current configuration status +databricks auth describe -3. Call `mcp__databricks__manage_workspace` with the action and any parameters. +# Show current workspace URL +databricks config get --key host -4. Present the result. For `status`/`switch`/`login`: show host, profile, username. For `list`: formatted table with the active profile marked. +# Show current profile +databricks config get --key profile +``` -> **Note:** The switch is session-scoped โ€” it resets on MCP server restart. For permanent profile setup, use `databricks auth login -p ` and update `~/.databrickscfg` with `cluster_id` or `serverless_compute_id = auto`. 
+### List Available Profiles + +```bash +# List all configured profiles from ~/.databrickscfg +cat ~/.databrickscfg | grep '^\[' | tr -d '[]' +``` + +### Switch Workspace/Profile + +```bash +# Use a different profile for subsequent commands +databricks --profile auth describe + +# Or set environment variable for the session +export DATABRICKS_CONFIG_PROFILE= +``` + +### Authenticate to New Workspace + +```bash +# OAuth login (opens browser) +databricks auth login --host https://your-workspace.cloud.databricks.com + +# OAuth login with profile name +databricks auth login --host https://your-workspace.cloud.databricks.com --profile my-profile + +# Configure with PAT +databricks configure --profile my-profile +``` + +### Verify Authentication + +```bash +# Check auth status +databricks auth describe + +# Test by listing clusters +databricks clusters list +``` + +## ~/.databrickscfg Format + +```ini +[DEFAULT] +host = https://your-workspace.cloud.databricks.com +cluster_id = 0123-456789-abc123 +# or +serverless_compute_id = auto + +[production] +host = https://prod-workspace.cloud.databricks.com +token = dapi... + +[development] +host = https://dev-workspace.cloud.databricks.com +``` + +## Python SDK + +```python +from databricks.sdk import WorkspaceClient + +# Use default profile +w = WorkspaceClient() + +# Use specific profile +w = WorkspaceClient(profile="production") + +# Use specific host +w = WorkspaceClient(host="https://your-workspace.cloud.databricks.com") + +# Check current user +print(w.current_user.me().user_name) +``` + +> **Note:** Profile changes via environment variables or CLI flags are session-scoped. For permanent profile setup, use `databricks auth login -p ` and update `~/.databrickscfg` with `cluster_id` or `serverless_compute_id = auto`. + +## CLI Syntax Patterns + +**IMPORTANT**: Use `--json` for creating Unity Catalog objects. This is the most reliable syntax. 
+ +```bash +# โœ… CORRECT - use --json for create operations +databricks catalogs create --json '{"name": "my_catalog"}' +databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' +``` + +### Common CLI Patterns + +```bash +# Get help for any command +databricks --help +databricks schemas create --help + +# List operations +databricks catalogs list +databricks schemas list CATALOG_NAME +databricks volumes list CATALOG_NAME.SCHEMA_NAME +databricks clusters list +databricks warehouses list + +# Create operations (use --json) +databricks catalogs create --json '{"name": "my_catalog"}' +databricks schemas create --json '{"name": "my_schema", "catalog_name": "my_catalog"}' +databricks volumes create --json '{"name": "my_volume", "catalog_name": "my_catalog", "schema_name": "my_schema", "volume_type": "MANAGED"}' + +# Delete operations (use full name) +databricks catalogs delete CATALOG_NAME +databricks schemas delete CATALOG_NAME.SCHEMA_NAME +databricks volumes delete CATALOG_NAME.SCHEMA_NAME.VOLUME_NAME +``` + +### SQL Execution via CLI + +```bash +# Run SQL query +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "SELECT * FROM catalog.schema.table LIMIT 10" + +# Create objects via SQL (alternative approach) +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE CATALOG my_catalog" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE SCHEMA my_catalog.my_schema" +databricks experimental aitools tools query --warehouse WAREHOUSE_ID "CREATE VOLUME my_catalog.my_schema.my_volume" +``` diff --git a/databricks-skills/databricks-dbsql/SKILL.md b/databricks-skills/databricks-dbsql/SKILL.md index 24bf2694..043228b9 100644 --- a/databricks-skills/databricks-dbsql/SKILL.md +++ b/databricks-skills/databricks-dbsql/SKILL.md @@ -297,4 +297,4 @@ Load these for detailed syntax, full parameter lists, and advanced patterns: - **Star schema in Gold layer** for BI; OBT acceptable in Silver - **Define PK/FK constraints** on dimensional models for query optimization - **Use `COLLATE UTF8_LCASE`** for user-facing string columns that need case-insensitive search -- **Use MCP tools** (`execute_sql`, `execute_sql_multi`) to test and validate all SQL before deploying +- **Test SQL via CLI** (`databricks experimental aitools tools query`) or notebooks before deploying diff --git a/databricks-skills/databricks-docs/SKILL.md b/databricks-skills/databricks-docs/SKILL.md index ceca11e0..8e9d68d5 100644 --- a/databricks-skills/databricks-docs/SKILL.md +++ b/databricks-skills/databricks-docs/SKILL.md @@ -5,7 +5,7 @@ description: "Databricks documentation reference via llms.txt index. Use when ot # Databricks Documentation Reference -This skill provides access to the complete Databricks documentation index via llms.txt - use it as a **reference resource** to supplement other skills and inform your use of MCP tools. +This skill provides access to the complete Databricks documentation index via llms.txt - use it as a **reference resource** to supplement other skills. ## Role of This Skill @@ -13,10 +13,10 @@ This is a **reference skill**, not an action skill. 
 Use it to:
 - Look up documentation when other skills don't cover a topic
 - Get authoritative guidance on Databricks concepts and APIs
-- Find detailed information to inform how you use MCP tools
+- Find detailed information to inform CLI commands and SDK usage
 - Discover features and capabilities you may not know about
 
-**Always prefer using MCP tools for actions** (execute_sql, manage_pipeline, etc.) and **load specific skills for workflows** (databricks-python-sdk, databricks-spark-declarative-pipelines, etc.). Use this skill when you need reference documentation.
+**Always prefer using CLI/SDK for actions** and **load specific skills for workflows** (databricks-python-sdk, databricks-spark-declarative-pipelines, etc.). Use this skill when you need reference documentation.
 
 ## How to Use
 
@@ -28,7 +28,7 @@ Use WebFetch to retrieve this index, then:
 1. Search for relevant sections/links
 2. Fetch specific documentation pages for detailed guidance
-3. Apply what you learn using the appropriate MCP tools
+3. Apply what you learn using the appropriate CLI commands or SDK
 
 ## Documentation Structure
 
@@ -47,7 +47,7 @@ The llms.txt file is organized by category:
 1. Load `databricks-spark-declarative-pipelines` skill for workflow patterns
 2. Use this skill to fetch docs if you need clarification on specific DLT features
-3. Use `manage_pipeline(action="create_or_update")` MCP tool to actually create the pipeline
+3. Use `databricks pipelines create` CLI command to create the pipeline
 
 **Scenario:** User asks about an unfamiliar Databricks feature
 
diff --git a/databricks-skills/databricks-execution-compute/SKILL.md b/databricks-skills/databricks-execution-compute/SKILL.md
index c3518385..74cc43d9 100644
--- a/databricks-skills/databricks-execution-compute/SKILL.md
+++ b/databricks-skills/databricks-execution-compute/SKILL.md
@@ -15,6 +15,8 @@ description: >-
 
 Run code on Databricks. Three execution modes—choose based on workload.
 
+> **Path convention:** `` in examples below = the directory containing this SKILL.md. Resolve it to the absolute path in your install (e.g. `~/.claude/skills/databricks-execution-compute`). Commands like `python /scripts/compute.py ...` work from any cwd.
+
 ## Execution Mode Decision Matrix
 
 | Aspect | [Databricks Connect](references/1-databricks-connect.md) ⭐ | [Serverless Job](references/2-serverless-job.md) | [Interactive Cluster](references/3-interactive-cluster.md) |
@@ -27,6 +29,7 @@ Run code on Databricks. Three execution modes—choose based on workload.
 
 ### Decision Flow
 
+Prefer Databricks Connect for all Spark-based workloads, then serverless.
 ```
 Spark-based code? → Databricks Connect (fastest)
 └─ Python 3.12 missing? → Install it + databricks-connect
@@ -42,38 +45,124 @@ Scala/R? → Interactive Cluster (list and ask which one to use)
 
 **Read the reference file for your chosen mode before proceeding.**
 
-### Databricks Connect (no MCP tool, run locally) → [reference](references/1-databricks-connect.md)
+### Databricks Connect (run locally; prefer it for pure Spark code) → [reference](references/1-databricks-connect.md)
 
 ```bash
+from databricks.connect import DatabricksSession
+...
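+# Note: the lines above and below are the contents of my_spark_script.py;
+# the plain "python my_spark_script.py" call at the end is the shell command that runs it.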
+spark = DatabricksSession.builder.profile("my-local-profile").serverless(True).getOrCreate() + + python my_spark_script.py ``` ### Serverless Job โ†’ [reference](references/2-serverless-job.md) -```python -execute_code(file_path="/path/to/script.py") +Pure CLI flow: upload a local file as a workspace notebook, fire a one-time run with `databricks jobs submit` (create + run in one call, ephemeral โ€” no Jobs UI entry, no retry), then poll + fetch the result. The local file must be a Databricks source notebook โ€” top line `# Databricks notebook source` (Python) or `-- Databricks notebook source` (SQL). + +**1. Upload the local file as a workspace notebook.** `TARGET_PATH` is positional; `--file` is the local path. + +`databricks workspace import /Workspace/Users//.ai_dev_kit/train --file /local/path/to/train.py --format SOURCE --language PYTHON --overwrite` + +**2. Submit the run.** Use `--no-wait` to get `{"run_id": N}` back immediately; drop it to block until terminated. **`"client": "4"` is required** for `dependencies` to install (`"1"` silently ignores them). + +`databricks jobs submit --no-wait --json @submit.json` + +```json +{ + "run_name": "train-run", + "tasks": [{ + "task_key": "main", + "notebook_task": {"notebook_path": "/Workspace/Users//.ai_dev_kit/train"}, + "environment_key": "ml_env" + }], + "environments": [{ + "environment_key": "ml_env", + "spec": {"client": "4", "dependencies": ["scikit-learn==1.5.2", "mlflow==2.22.0"]} + }] +} ``` +**3. Check state / wait for completion.** Life-cycle: `PENDING` โ†’ `RUNNING` โ†’ `TERMINATED` (or `SKIPPED` / `INTERNAL_ERROR`). Only read `.state.result_state` (`SUCCESS` / `FAILED` / `CANCELED`) once life-cycle is `TERMINATED`. + +`databricks jobs get-run | jq '{state: .state.life_cycle_state, result: .state.result_state, duration_ms: .execution_duration, url: .run_page_url, task_run_id: .tasks[0].run_id}'` + +**4. Fetch the output / error.** **Gotcha:** `get-run-output` takes the **task** run_id (`.tasks[0].run_id`), NOT the parent `run_id` from submit. `notebook_output.result` is the string passed to `dbutils.notebook.exit()`. + +`databricks jobs get-run-output | jq '{result: .notebook_output.result, error, error_trace}'` + +Always use `dbutils.notebook.exit()` in the notebook โ€” `print()` is not captured by `get-run-output`. For JSON results: `dbutils.notebook.exit(json.dumps({...}))` then parse `.notebook_output.result` client-side. + +**Convenience wrapper.** `scripts/compute.py execute-code` does upload + submit + wait + cleanup in one command and returns a single tidy JSON: + +`python /scripts/compute.py execute-code --file /local/path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0"]}}]' | jq '{success, state, output, error, run_id, run_page_url, execution_duration_ms}'` + ### Interactive Cluster โ†’ [reference](references/3-interactive-cluster.md) -```python -# Check for running clusters first (or use the one instructed) -list_compute(resource="clusters") -# Ask the customer which one to use +**Avoid by default โ€” prefer Serverless Job.** Only use an interactive cluster when: +- you have an existing classic cluster already running and available, or +- you need live, stateful execution across multiple calls (debugging via an execution context), or +- the user explicitly asks for it. 
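+
+Before reaching for an interactive cluster, check what is already running. A minimal sketch using the bundled script (here `$SKILL_DIR` is a stand-in for the directory containing this SKILL.md):
+
+```bash
+# List interactive clusters and keep only the ones currently RUNNING.
+SKILL_DIR=~/.claude/skills/databricks-execution-compute   # adjust to your install
+python "$SKILL_DIR/scripts/compute.py" list-compute --resource clusters \
+  | jq '.clusters[] | select(.state == "RUNNING") | {cluster_id, cluster_name}'
+```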
-# Run code, reuse context_id for follow-up MCP call -result = execute_code(code="...", compute_type="cluster", cluster_id="...") -execute_code(code="...", context_id=result["context_id"], cluster_id=result["cluster_id"]) -``` +Interactive clusters are **slow to start (3-8 min)** and cost money while running. Don't start one implicitly. + +## CLI Commands -## MCP Tools +| Command | Purpose | +|---------|---------| +| `python /scripts/compute.py execute-code` | Run code on serverless or an existing cluster | +| `python /scripts/compute.py list-compute` | List clusters, node types, Spark versions | +| `python /scripts/compute.py manage-cluster` | Create/start/terminate/delete clusters (see [3-interactive-cluster.md](references/3-interactive-cluster.md)) | +| `databricks warehouses create/list` | Manage SQL warehouses | + +### SQL Warehouses + +All `ID`-taking commands use positional arg (no `--id` flag). Use `databricks warehouses list` to find an ID. + +```bash +# Create a serverless SQL warehouse. min_num_clusters + max_num_clusters are REQUIRED +# (the server rejects the default 0). Keep the aidevkit_project tag for resource tracking. +databricks warehouses create --json '{ + "name": "my-warehouse", + "cluster_size": "Small", + "enable_serverless_compute": true, + "auto_stop_mins": 10, + "min_num_clusters": 1, + "max_num_clusters": 1, + "tags": {"custom_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}]} +}' + +# List / find โ€” trim to id, name, state with jq +databricks warehouses list -o json | jq '.[] | {id, name, state, size: .cluster_size}' + +# Find by name +databricks warehouses list -o json | jq '.[] | select(.name == "my-warehouse")' + +# Get one warehouse's full config +databricks warehouses get + +# Start / stop (both are LROs; add --no-wait to return immediately) +databricks warehouses start +databricks warehouses stop + +# Resize / reconfigure โ€” pass the FULL desired config (omitted fields revert to defaults, +# so always re-state min_num_clusters/max_num_clusters). Use --no-wait if the warehouse +# is STOPPED, otherwise edit blocks trying to reach RUNNING and errors out (the mutation +# itself still applies). When the warehouse is already RUNNING, --no-wait is optional. +databricks warehouses edit --no-wait --json '{ + "name": "my-warehouse", + "cluster_size": "Medium", + "enable_serverless_compute": true, + "auto_stop_mins": 15, + "min_num_clusters": 1, + "max_num_clusters": 1 +}' + +# Delete (irreversible) +databricks warehouses delete +``` -| Tool | For | Purpose | -|------|-----|---------| -| `execute_code` | Serverless, Interactive | Run code remotely | -| `list_compute` | Interactive | List clusters, check status, auto-select running cluster | -| `manage_cluster` | Interactive | Create, start, terminate, delete. **COSTLY:** `start` takes 3-8 minโ€”ask user | -| `manage_sql_warehouse` | SQL | Create, modify, delete SQL warehouses | +**Sizes:** `2X-Small`, `X-Small`, `Small`, `Medium`, `Large`, `X-Large`, `2X-Large`, `3X-Large`, `4X-Large`. **Types:** set `"warehouse_type": "PRO"` (default) or `"CLASSIC"` in the JSON body. 
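+
+If a later step needs the warehouse to be up, a minimal start-and-wait sketch (the warehouse ID is a placeholder; the 10-second interval is arbitrary):
+
+```bash
+# Start the warehouse without blocking, then poll .state until it reports RUNNING.
+WH_ID="<warehouse-id>"   # placeholder - find it via `databricks warehouses list`
+databricks warehouses start "$WH_ID" --no-wait
+while true; do
+  STATE=$(databricks warehouses get "$WH_ID" -o json | jq -r '.state')
+  echo "state=$STATE"
+  [ "$STATE" = "RUNNING" ] && break
+  sleep 10
+done
+```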
## Related Skills diff --git a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md index 838d2a7d..39be79a4 100644 --- a/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md +++ b/databricks-skills/databricks-execution-compute/references/1-databricks-connect.md @@ -30,16 +30,12 @@ auth_type = databricks-cli ## Usage Pattern ```python -from databricks.connect import DatabricksSession, DatabricksEnv - -# Declare dependencies installed on serverless compute -# CRITICAL: Include ALL packages used inside UDFs (pandas/numpy are there by default) -env = DatabricksEnv().withDependencies("faker", "holidays") +from databricks.connect import DatabricksSession +# Install dependencies locally first: uv pip install faker holidays spark = ( DatabricksSession.builder - .profile("my-workspace") # optional: run on a specific profile from ~/.databrickscfg instead of default - .withEnvironment(env) + .profile("my-workspace") # optional: use a specific profile from ~/.databrickscfg .serverless(True) .getOrCreate() ) @@ -54,9 +50,8 @@ df.write.mode('overwrite').saveAsTable("catalog.schema.table") | Issue | Solution | |-------|----------| | `Python 3.12 required` | create venv with correct python version | -| `DatabricksEnv not found` | Upgrade to databricks-connect >= 16.4 | | `serverless_compute_id` error | Add `serverless_compute_id = auto` to ~/.databrickscfg | -| `ModuleNotFoundError` inside UDF | Add the package to `withDependencies()` | +| `ModuleNotFoundError` inside UDF | Install the package locally: `uv pip install ` | | `PERSIST TABLE not supported` | Don't use `.cache()` or `.persist()` with serverless | | `broadcast` is used | Don't broadcast small DF using spark connect, have a small python list instead or join small DF | @@ -68,5 +63,5 @@ Switch to **[Serverless Job](2-serverless-job.md)** when: - Non-Spark Python code (pure sklearn, pytorch, etc.) Switch to **[Interactive Cluster](3-interactive-cluster.md)** when: -- Need state across multiple separate MCP tool calls +- Need state across multiple separate tool calls - Need Scala or R support diff --git a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md index 4be8801c..d80c31e9 100644 --- a/databricks-skills/databricks-execution-compute/references/2-serverless-job.md +++ b/databricks-skills/databricks-execution-compute/references/2-serverless-job.md @@ -1,6 +1,8 @@ # Serverless Job Execution -**Use when:** Running intensive Python code remotely (ML training, heavy processing) that doesn't need Spark, or when code shouldn't depend on local machine staying connected. +**Use when:** Running intensive Python code remotely (ML training, heavy processing) that doesn't need Spark, or when code shouldn't depend on the local machine staying connected. + +> `` in examples = the directory containing the parent SKILL.md โ€” substitute the absolute install path (e.g. `~/.claude/skills/databricks-execution-compute`). 
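+
+When you submit with `--no-wait` (step 2 of the flow below), pair it with your own polling loop. A minimal sketch (the run ID is a placeholder; the 15-second interval is arbitrary):
+
+```bash
+# Poll the run until life_cycle_state reaches a terminal value, then print the result state.
+RUN_ID="<run-id>"   # placeholder - the value returned by `databricks jobs submit --no-wait`
+while true; do
+  LIFE=$(databricks jobs get-run "$RUN_ID" | jq -r '.state.life_cycle_state')
+  echo "life_cycle_state=$LIFE"
+  case "$LIFE" in TERMINATED|SKIPPED|INTERNAL_ERROR) break ;; esac
+  sleep 15
+done
+databricks jobs get-run "$RUN_ID" | jq -r '.state.result_state'
+```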
## When to Choose Serverless Job @@ -15,55 +17,99 @@ |-----|-----| | No cluster to manage | ~25-50s cold start each invocation | | Up to 30 min timeout | No state preserved between calls | -| Independent execution | print() unreliableโ€”use `dbutils.notebook.exit()` | +| Independent execution | `print()` unreliable โ€” use `dbutils.notebook.exit()` | -## Executing code -### Prefer running from a Local File (edit the local file then run it) +## Pure CLI flow -```python -execute_code( - file_path="/local/path/to/train_model.py", - compute_type="serverless" -) -``` +`databricks jobs submit` is the "create + run" primitive for ephemeral runs (no Jobs UI entry, no retry). The local file must be a Databricks source notebook โ€” first line `# Databricks notebook source` (Python) or `-- Databricks notebook source` (SQL). -## Jobs with Custom Dependencies +### 1. Upload the local file as a workspace notebook -Use `job_extra_params` to install pip packages: +`TARGET_PATH` is positional; `--file` is the local path; `--language` is required when `--format SOURCE`. -```python -execute_code( - file_path="/path/to/train.py", - job_extra_params={ - "environments": [{ - "environment_key": "ml_env", - "spec": {"client": "4", "dependencies": ["scikit-learn", "pandas", "mlflow"]} - }] - } -) +`databricks workspace import /Workspace/Users//.ai_dev_kit/train --file /local/path/to/train.py --format SOURCE --language PYTHON --overwrite` + +### 2. Submit the run + +`--no-wait` returns `{"run_id": N}` immediately. Drop it to block until terminated. **`"client": "4"` is required** for `dependencies` to install โ€” `"1"` silently ignores them. + +`databricks jobs submit --no-wait --json @submit.json` + +Where `submit.json`: + +```json +{ + "run_name": "train-run", + "tasks": [{ + "task_key": "main", + "notebook_task": {"notebook_path": "/Workspace/Users//.ai_dev_kit/train"}, + "environment_key": "ml_env" + }], + "environments": [{ + "environment_key": "ml_env", + "spec": {"client": "4", "dependencies": ["scikit-learn==1.5.2", "mlflow==2.22.0"]} + }] +} ``` -**CRITICAL:** Use `"client": "4"` in the spec. `"client": "1"` won't install dependencies. +### 3. Check status + +One-shot trim to the fields that matter: + +`databricks jobs get-run | jq '{state: .state.life_cycle_state, result: .state.result_state, duration_ms: .execution_duration, url: .run_page_url}'` + +Life-cycle states: `PENDING` โ†’ `RUNNING` โ†’ `TERMINATED` (or `SKIPPED` / `INTERNAL_ERROR`). Only read `.state.result_state` (`SUCCESS` / `FAILED` / `CANCELED`) once `life_cycle_state == TERMINATED`. + +### 4. Fetch the output / error + +**Gotcha:** `get-run-output` takes the **task** run_id (`.tasks[0].run_id`), not the parent `run_id` from submit. -## Output Handling +`databricks jobs get-run-output | jq '{result: .notebook_output.result, error, error_trace}'` + +`notebook_output.result` is whatever `dbutils.notebook.exit()` passed. `error` / `error_trace` populate on failure. + +### 5. 
(Optional) Delete the temp notebook + +`databricks workspace delete /Workspace/Users//.ai_dev_kit/train` + +## Output handling in the notebook ```python -# โŒ BAD - print() may not be captured +# BAD โ€” print() output isn't returned by get-run-output print("Training complete!") -# โœ… GOOD - Use dbutils.notebook.exit() +# GOOD โ€” dbutils.notebook.exit() populates notebook_output.result import json -results = {"accuracy": 0.95, "model_path": "/Volumes/..."} -dbutils.notebook.exit(json.dumps(results)) +dbutils.notebook.exit(json.dumps({"accuracy": 0.95, "model_path": "/Volumes/..."})) ``` +Max output size is 5 MB. Larger results should be written to a Volume/object store and referenced by path. + +## Convenience wrapper + +`scripts/compute.py execute-code` does upload + submit + wait + cleanup in one command and returns a single JSON with `success`, `state`, `output` (the `dbutils.notebook.exit` payload), `error`, `run_id`, `run_page_url`, `execution_duration_ms`. + +Minimal: + +`python /scripts/compute.py execute-code --file train.py --compute-type serverless` + +With dependencies: + +`python /scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --timeout 1500 --environments '[{"environment_key":"ml_env","spec":{"client":"4","dependencies":["scikit-learn==1.5.2","mlflow==2.22.0","xgboost==2.1.3"]}}]'` + +Long dependency list from a file: + +`python /scripts/compute.py execute-code --file /path/to/train.py --compute-type serverless --environments @env.json` + ## Common Issues | Issue | Solution | |-------|----------| -| print() output missing | Use `dbutils.notebook.exit()` | -| `ModuleNotFoundError` | Add to environments spec with `"client": "4"` | -| Job times out | Max is 1800s; split into smaller tasks | +| `print()` output missing | Use `dbutils.notebook.exit()` โ€” `print` isn't captured by `get-run-output` | +| `ModuleNotFoundError` | Add the package to the environments spec with `"client": "4"` | +| Dependencies listed but not installed | `"client": "1"` silently drops `dependencies`; use `"client": "4"` | +| `get-run-output` returns empty `notebook_output` | You passed the parent run_id, not `.tasks[0].run_id` | +| Job times out | Default 1800 s on the script wrapper; raise `--timeout` or use `jobs submit --no-wait` + your own polling | ## When NOT to Use @@ -72,5 +118,5 @@ Switch to **[Databricks Connect](1-databricks-connect.md)** when: - Need local debugging with breakpoints Switch to **[Interactive Cluster](3-interactive-cluster.md)** when: -- Need state across multiple MCP tool calls +- Need state across multiple tool calls - Need Scala or R support diff --git a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md index aa73ea90..7197334a 100644 --- a/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md +++ b/databricks-skills/databricks-execution-compute/references/3-interactive-cluster.md @@ -1,6 +1,8 @@ # Interactive Cluster Execution -**Use when:** You have an existing running cluster and need to preserve state across multiple MCP tool calls, or need Scala/R support. +**Use when:** You have an existing running cluster and need to preserve state across multiple tool calls, or need Scala/R support. + +> `` in examples = the directory containing the parent SKILL.md โ€” substitute the absolute install path (e.g. `~/.claude/skills/databricks-execution-compute`). 
## When to Choose Interactive Cluster @@ -20,8 +22,8 @@ **Starting a cluster takes 3-8 minutes and costs money.** Always check first: -```python -list_compute(resource="clusters") +```bash +python /scripts/compute.py list-compute --resource clusters ``` If no cluster is running, ask the user: @@ -34,58 +36,80 @@ If no cluster is running, ask the user: ### First Command: Creates Context -```python -result = execute_code( - code="import pandas as pd\ndf = pd.DataFrame({'a': [1, 2, 3]})", - compute_type="cluster", - cluster_id="1234-567890-abcdef" -) -# result contains context_id for reuse +```bash +python /scripts/compute.py execute-code \ + --code "import pandas as pd; df = pd.DataFrame({'a': [1, 2, 3]}); print(df)" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" +``` + +Response includes `context_id` for reuse: +```json +{ + "success": true, + "output": " a\n0 1\n1 2\n2 3", + "context_id": "ctx_abc123", + "cluster_id": "1234-567890-abcdef" +} ``` ### Follow-up Commands: Reuse Context -```python +```bash # Variables from first command still available -execute_code( - code="print(df.shape)", # df exists - context_id=result["context_id"], - cluster_id=result["cluster_id"] -) +python /scripts/compute.py execute-code \ + --code "print(df.shape)" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" \ + --context-id "ctx_abc123" ``` ### Auto-Select Best Running Cluster -```python -best_cluster = list_compute(resource="clusters", auto_select=True) -execute_code( - code="spark.range(100).show()", - compute_type="cluster", - cluster_id=best_cluster["cluster_id"] -) +```bash +# Get best running cluster +python /scripts/compute.py list-compute --auto-select +# Returns: {"cluster_id": "1234-567890-abcdef"} + +# Then execute on it +python /scripts/compute.py execute-code \ + --code "spark.range(100).show()" \ + --compute-type cluster \ + --cluster-id "1234-567890-abcdef" ``` ## Language Support -```python -execute_code(code='println("Hello")', compute_type="cluster", language="scala") -execute_code(code="SELECT * FROM table LIMIT 10", compute_type="cluster", language="sql") -execute_code(code='print("Hello")', compute_type="cluster", language="r") +```bash +# Scala +python /scripts/compute.py execute-code --code 'println("Hello")' --compute-type cluster --language scala --cluster-id ... + +# SQL +python /scripts/compute.py execute-code --code "SELECT * FROM table LIMIT 10" --compute-type cluster --language sql --cluster-id ... + +# R +python /scripts/compute.py execute-code --code 'print("Hello")' --compute-type cluster --language r --cluster-id ... ``` ## Installing Libraries -Install pip packages directly in the execution context (pandas/numpy are there by default): - -```python -# Install library -execute_code( - code="""%pip install faker - dbutils.library.restartPython()""", # Restart Python to pick up new packages (if needed) - compute_type="cluster", - cluster_id="...", - context_id="..." -) +Install pip packages directly in the execution context: + +```bash +python /scripts/compute.py execute-code \ + --code "%pip install faker" \ + --compute-type cluster \ + --cluster-id "..." \ + --context-id "..." +``` + +If needed, restart Python to pick up new packages: +```bash +python /scripts/compute.py execute-code \ + --code "dbutils.library.restartPython()" \ + --compute-type cluster \ + --cluster-id "..." \ + --context-id "..." ``` ## Context Lifecycle @@ -93,32 +117,57 @@ execute_code( **Keep alive (default):** Context persists until cluster terminates. 
**Destroy when done:** -```python -execute_code( - code="print('Done!')", - compute_type="cluster", - destroy_context_on_completion=True -) +```bash +python /scripts/compute.py execute-code \ + --code "print('Done!')" \ + --compute-type cluster \ + --cluster-id "..." \ + --destroy-context ``` -## Handling No Running Cluster +## Managing Clusters -When no cluster is running, `execute_code` returns: -```json -{ - "success": false, - "error": "No running cluster available", - "startable_clusters": [{"cluster_id": "...", "cluster_name": "...", "state": "TERMINATED"}], - "suggestions": ["Start a terminated cluster", "Use serverless instead"] -} +Two equivalent paths: the standalone script (convenience wrapper) or the raw `databricks` CLI (more fields exposed). Prefer the script for the common operations listed here. + +```bash +# List all clusters +python /scripts/compute.py list-compute --resource clusters + +# Get specific cluster status +python /scripts/compute.py list-compute --cluster-id "1234-567890-abcdef" + +# Start a cluster (WITH USER APPROVAL ONLY - costs money, 3-8min startup) +python /scripts/compute.py manage-cluster --action start --cluster-id "1234-567890-abcdef" + +# Terminate a cluster (reversible) +python /scripts/compute.py manage-cluster --action terminate --cluster-id "1234-567890-abcdef" + +# Create a new cluster +python /scripts/compute.py manage-cluster --action create --name "my-cluster" --num-workers 2 +``` + +### Filter running interactive clusters only (raw CLI) + +Useful before asking the user which cluster to reuse. `--cluster-sources UI,API` excludes job clusters (which would otherwise dominate the list on busy workspaces): + +```bash +databricks clusters list --cluster-sources UI,API --output json \ + | jq '.[] | select(.state == "RUNNING")' ``` -### Starting a Cluster (With User Approval Only) +### Create with a full spec (raw CLI) + +The script's `manage-cluster --action create` is fine for quick defaults; for full control (DBR version, instance type, tags) use the raw CLI: -```python -manage_cluster(action="start", cluster_id="1234-567890-abcdef") -# Poll until running (wait 20sec) -list_compute(resource="clusters", cluster_id="1234-567890-abcdef") +```bash +# SPARK_VERSION is positional; custom_tags recommended for resource tracking +databricks clusters create 15.4.x-scala2.12 --json '{ + "cluster_name": "my-cluster", + "node_type_id": "i3.xlarge", + "num_workers": 2, + "autotermination_minutes": 60, + "custom_tags": {"aidevkit_project": "ai-dev-kit"} +}' ``` ## Common Issues @@ -127,7 +176,7 @@ list_compute(resource="clusters", cluster_id="1234-567890-abcdef") |-------|----------| | "No running cluster" | Ask user to start or use serverless | | Context not found | Context expired; create new one | -| Library not found | `%pip install ` then if needed `dbutils.library.restartPython()` | +| Library not found | `%pip install ` then restart Python if needed | ## When NOT to Use diff --git a/databricks-skills/databricks-execution-compute/scripts/compute.py b/databricks-skills/databricks-execution-compute/scripts/compute.py new file mode 100644 index 00000000..e90a4ac7 --- /dev/null +++ b/databricks-skills/databricks-execution-compute/scripts/compute.py @@ -0,0 +1,743 @@ +#!/usr/bin/env python3 +"""Compute CLI - Execute code and manage compute resources on Databricks. + +Standalone script with no external dependencies beyond databricks-sdk. 
+ +Commands: +- execute-code: Run code on serverless or cluster compute +- list-compute: List clusters, node types, or spark versions +- manage-cluster: Create, start, terminate, or delete clusters + +Requires: pip install databricks-sdk +""" + +import argparse +import base64 +import json +import uuid +from dataclasses import dataclass +from datetime import timedelta +from typing import Any, Dict, List, Optional + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.compute import ( + ClusterSource, + CommandStatus, + ContextStatus, + Environment, + Language, + ListClustersFilterBy, + ResultType, + State, +) +from databricks.sdk.service.jobs import ( + JobEnvironment, + NotebookTask, + RunResultState, + Source, + SubmitTask, +) +from databricks.sdk.service.workspace import ImportFormat, Language as WsLang + + +# --------------------------------------------------------------------------- +# Authentication +# --------------------------------------------------------------------------- + +def get_workspace_client() -> WorkspaceClient: + """Get authenticated WorkspaceClient using standard auth chain.""" + return WorkspaceClient() + + +def get_current_username() -> str: + """Get the current user's username.""" + w = get_workspace_client() + return w.current_user.me().user_name + + +# --------------------------------------------------------------------------- +# Exceptions +# --------------------------------------------------------------------------- + +class NoRunningClusterError(Exception): + """Raised when no running cluster is available.""" + + def __init__(self, message: str, suggestions: List[str] = None, startable_clusters: List[Dict] = None): + super().__init__(message) + self.suggestions = suggestions or [] + self.startable_clusters = startable_clusters or [] + + +# --------------------------------------------------------------------------- +# Result Classes +# --------------------------------------------------------------------------- + +@dataclass +class ExecutionResult: + """Result from cluster command execution.""" + success: bool + output: str = "" + error: str = "" + cluster_id: str = "" + context_id: str = "" + status: str = "" + result_type: str = "" + + def to_dict(self) -> Dict[str, Any]: + return { + "success": self.success, + "output": self.output, + "error": self.error, + "cluster_id": self.cluster_id, + "context_id": self.context_id, + "status": self.status, + "result_type": self.result_type, + } + + +@dataclass +class ServerlessRunResult: + """Result from serverless code execution.""" + success: bool + output: str = "" + error: str = "" + run_id: int = 0 + run_page_url: str = "" + state: str = "" + execution_duration_ms: int = 0 + + def to_dict(self) -> Dict[str, Any]: + return { + "success": self.success, + "output": self.output, + "error": self.error, + "run_id": self.run_id, + "run_page_url": self.run_page_url, + "state": self.state, + "execution_duration_ms": self.execution_duration_ms, + } + + +# --------------------------------------------------------------------------- +# Cluster Execution +# --------------------------------------------------------------------------- + +def list_clusters() -> List[Dict[str, Any]]: + """List interactive clusters created by humans (UI/API, not jobs).""" + w = get_workspace_client() + clusters = [] + # Filter to only UI and API created clusters (interactive, human-created) + # Excludes JOB clusters (created by jobs) and other system clusters + filter_by = ListClustersFilterBy( + cluster_sources=[ClusterSource.UI, 
ClusterSource.API] + ) + for c in w.clusters.list(filter_by=filter_by, page_size=100): + clusters.append({ + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + "creator_user_name": c.creator_user_name, + "spark_version": c.spark_version, + "node_type_id": c.node_type_id, + "num_workers": c.num_workers, + }) + return clusters + + +def get_best_cluster() -> str: + """Get the best running interactive cluster ID, or raise NoRunningClusterError.""" + w = get_workspace_client() + running = [] + startable = [] + + # Filter to only interactive clusters (UI/API created) + filter_by = ListClustersFilterBy( + cluster_sources=[ClusterSource.UI, ClusterSource.API] + ) + for c in w.clusters.list(filter_by=filter_by, page_size=100): + info = { + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + } + if c.state == State.RUNNING: + running.append(info) + elif c.state in (State.TERMINATED, State.PENDING): + startable.append(info) + + if running: + return running[0]["cluster_id"] + + raise NoRunningClusterError( + "No running cluster available.", + suggestions=[ + "Start an existing cluster with: python compute.py manage-cluster --action start --cluster-id ", + "Use serverless compute: python compute.py execute-code --compute-type serverless --code '...'", + ], + startable_clusters=startable, + ) + + +def start_cluster(cluster_id: str) -> Dict[str, Any]: + """Start a cluster and wait for it to be running.""" + w = get_workspace_client() + w.clusters.start(cluster_id=cluster_id) + # Don't wait - just return immediately + return {"success": True, "cluster_id": cluster_id, "message": "Cluster start initiated"} + + +def get_cluster_status(cluster_id: str) -> Dict[str, Any]: + """Get the status of a specific cluster.""" + w = get_workspace_client() + c = w.clusters.get(cluster_id=cluster_id) + return { + "cluster_id": c.cluster_id, + "cluster_name": c.cluster_name, + "state": c.state.value if c.state else "UNKNOWN", + "state_message": c.state_message, + "creator_user_name": c.creator_user_name, + "spark_version": c.spark_version, + "node_type_id": c.node_type_id, + "num_workers": c.num_workers, + } + + +def _get_or_create_context(w: WorkspaceClient, cluster_id: str, context_id: Optional[str], language: str) -> str: + """Get existing context or create a new one.""" + lang_map = {"python": Language.PYTHON, "scala": Language.SCALA, "sql": Language.SQL, "r": Language.R} + lang = lang_map.get(language.lower(), Language.PYTHON) + + if context_id: + # Verify context exists + try: + status = w.command_execution.context_status(cluster_id=cluster_id, context_id=context_id) + if status.status == ContextStatus.RUNNING: + return context_id + except Exception: + pass # Context doesn't exist, create new one + + # Create new context + ctx = w.command_execution.create(cluster_id=cluster_id, language=lang).result() + return ctx.id + + +def execute_databricks_command( + code: str, + cluster_id: Optional[str] = None, + context_id: Optional[str] = None, + language: str = "python", + timeout: int = 120, + destroy_context_on_completion: bool = False, +) -> ExecutionResult: + """Execute code on a Databricks cluster using Command Execution API.""" + w = get_workspace_client() + + # Get cluster ID if not provided + if not cluster_id: + cluster_id = get_best_cluster() + + # Get or create context + ctx_id = _get_or_create_context(w, cluster_id, context_id, language) + + # Execute command + lang_map = {"python": 
Language.PYTHON, "scala": Language.SCALA, "sql": Language.SQL, "r": Language.R} + lang = lang_map.get(language.lower(), Language.PYTHON) + + try: + cmd = w.command_execution.execute( + cluster_id=cluster_id, + context_id=ctx_id, + language=lang, + command=code, + ).result(timeout=timedelta(seconds=timeout)) + + # Parse results + output = "" + error = "" + result_type = cmd.results.result_type.value if cmd.results and cmd.results.result_type else "" + + if cmd.results: + if cmd.results.result_type == ResultType.TEXT: + output = cmd.results.data or "" + elif cmd.results.result_type == ResultType.TABLE: + output = json.dumps(cmd.results.data) if cmd.results.data else "" + elif cmd.results.result_type == ResultType.ERROR: + error = cmd.results.cause or str(cmd.results.data) or "Unknown error" + + success = cmd.status == CommandStatus.FINISHED and cmd.results.result_type != ResultType.ERROR + + return ExecutionResult( + success=success, + output=output, + error=error, + cluster_id=cluster_id, + context_id=ctx_id, + status=cmd.status.value if cmd.status else "", + result_type=result_type, + ) + + finally: + if destroy_context_on_completion and ctx_id: + try: + w.command_execution.destroy(cluster_id=cluster_id, context_id=ctx_id) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Serverless Execution +# --------------------------------------------------------------------------- + +def run_code_on_serverless( + code: str, + language: str = "python", + timeout: int = 1800, + environments: Optional[List[Any]] = None, +) -> ServerlessRunResult: + """Run code on serverless compute using Jobs API runs/submit. + + Args: + code: Source to execute. + language: "python" or "sql". + timeout: Max wait time in seconds. + environments: Optional list of environments to install dependencies. + Each entry may be a dict (documented shape) or a typed + ``JobEnvironment``. Dict shape: + {"environment_key": "my_env", + "spec": {"client": "4", "dependencies": ["pandas", "mlflow"]}} + ``client`` must be ``"4"`` (or higher) for dependencies to install; + ``"1"`` is the default but does NOT install ``dependencies``. + """ + w = get_workspace_client() + + # Create temp notebook + username = get_current_username() + notebook_name = f"_tmp_serverless_{uuid.uuid4().hex[:8]}" + notebook_path = f"/Workspace/Users/{username}/.tmp/{notebook_name}" + + # Ensure directory exists + try: + w.workspace.mkdirs(f"/Workspace/Users/{username}/.tmp") + except Exception: + pass + + # Upload notebook content + if language.lower() == "sql": + notebook_content = f"-- Databricks notebook source\n{code}" + else: + notebook_content = f"# Databricks notebook source\n{code}" + + content_b64 = base64.b64encode(notebook_content.encode()).decode() + + ws_lang_map = {"python": WsLang.PYTHON, "sql": WsLang.SQL} + ws_lang = ws_lang_map.get(language.lower(), WsLang.PYTHON) + + w.workspace.import_( + path=notebook_path, + content=content_b64, + format=ImportFormat.SOURCE, + language=ws_lang, + overwrite=True, + ) + + # Normalize environments (accept dicts or typed JobEnvironment). + # The SDK serializes each list item via .as_dict(), so raw dicts fail there; + # typed objects also lack .get(), so we need to canonicalize before reading + # environment_key for the task binding. 
+ if environments: + normalized = [] + for e in environments: + if isinstance(e, JobEnvironment): + normalized.append(e) + elif isinstance(e, dict): + spec = e.get("spec", {}) + if isinstance(spec, dict): + spec = Environment(**spec) + elif not isinstance(spec, Environment): + raise TypeError( + f"environments[].spec must be a dict or Environment, got {type(spec).__name__}" + ) + normalized.append( + JobEnvironment( + environment_key=e.get("environment_key", "default"), + spec=spec, + ) + ) + else: + raise TypeError( + f"environments[] entries must be dict or JobEnvironment, got {type(e).__name__}" + ) + job_envs = normalized + env_key = job_envs[0].environment_key or "default" + else: + job_envs = [JobEnvironment(environment_key="default", spec=Environment(client="1"))] + env_key = "default" + + try: + # Submit run + run = w.jobs.submit( + run_name=f"serverless-run-{uuid.uuid4().hex[:8]}", + tasks=[ + SubmitTask( + task_key="main", + notebook_task=NotebookTask( + notebook_path=notebook_path, + source=Source.WORKSPACE, + ), + environment_key=env_key, + ) + ], + environments=job_envs, + ).result(timeout=timedelta(seconds=timeout)) + + # Get run output + run_output = w.jobs.get_run_output(run_id=run.tasks[0].run_id) + + output = "" + error = "" + success = run.state.result_state == RunResultState.SUCCESS + + if run_output.notebook_output and run_output.notebook_output.result: + output = run_output.notebook_output.result + if run_output.error: + error = run_output.error + + return ServerlessRunResult( + success=success, + output=output, + error=error, + run_id=run.run_id, + run_page_url=run.run_page_url or "", + state=run.state.result_state.value if run.state and run.state.result_state else "", + execution_duration_ms=run.execution_duration or 0, + ) + + finally: + # Cleanup temp notebook + try: + w.workspace.delete(notebook_path) + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Cluster Management +# --------------------------------------------------------------------------- + +def create_cluster( + name: str, + num_workers: int = 1, + autotermination_minutes: int = 120, + spark_version: Optional[str] = None, + node_type_id: Optional[str] = None, +) -> Dict[str, Any]: + """Create a new cluster.""" + w = get_workspace_client() + + # Get defaults if not provided + if not spark_version: + versions = list(w.clusters.spark_versions()) + # Pick latest LTS + for v in versions: + if "LTS" in v.name and "ML" not in v.name: + spark_version = v.key + break + if not spark_version and versions: + spark_version = versions[0].key + + if not node_type_id: + node_types = list(w.clusters.list_node_types().node_types) + # Pick smallest available + for nt in sorted(node_types, key=lambda x: x.memory_mb or 0): + if nt.is_deprecated is not True: + node_type_id = nt.node_type_id + break + + cluster = w.clusters.create( + cluster_name=name, + spark_version=spark_version, + node_type_id=node_type_id, + num_workers=num_workers, + autotermination_minutes=autotermination_minutes, + ).result() + + return { + "success": True, + "cluster_id": cluster.cluster_id, + "cluster_name": name, + "message": "Cluster created", + } + + +def terminate_cluster(cluster_id: str) -> Dict[str, Any]: + """Terminate a cluster (can be restarted).""" + w = get_workspace_client() + w.clusters.delete(cluster_id=cluster_id) + return {"success": True, "cluster_id": cluster_id, "message": "Cluster terminated"} + + +def delete_cluster(cluster_id: str) -> Dict[str, Any]: + """Permanently 
delete a cluster.""" + w = get_workspace_client() + w.clusters.permanent_delete(cluster_id=cluster_id) + return {"success": True, "cluster_id": cluster_id, "message": "Cluster permanently deleted"} + + +def list_node_types() -> List[Dict[str, Any]]: + """List available node types.""" + w = get_workspace_client() + result = [] + for nt in w.clusters.list_node_types().node_types: + result.append({ + "node_type_id": nt.node_type_id, + "memory_mb": nt.memory_mb, + "num_cores": nt.num_cores, + "description": nt.description, + "is_deprecated": nt.is_deprecated, + }) + return result + + +def list_spark_versions() -> List[Dict[str, Any]]: + """List available Spark versions.""" + w = get_workspace_client() + result = [] + response = w.clusters.spark_versions() + for v in response.versions or []: + result.append({ + "key": v.key, + "name": v.name, + }) + return result + + +# --------------------------------------------------------------------------- +# CLI Commands +# --------------------------------------------------------------------------- + +def _none_if_empty(value): + """Convert empty strings to None.""" + return None if value == "" else value + + +def _no_cluster_error_response(e: NoRunningClusterError) -> Dict[str, Any]: + """Build a structured error response when no running cluster is available.""" + return { + "success": False, + "error": str(e), + "suggestions": e.suggestions, + "startable_clusters": e.startable_clusters, + } + + +def cmd_execute_code(args): + """Execute code on Databricks via serverless or cluster compute.""" + code = _none_if_empty(args.code) + file_path = _none_if_empty(args.file) + cluster_id = _none_if_empty(args.cluster_id) + context_id = _none_if_empty(args.context_id) + language = _none_if_empty(args.language) or "python" + compute_type = args.compute_type + timeout = args.timeout + destroy_context = args.destroy_context + + # Parse --environments (JSON string or @path/to/file.json) for serverless + environments = None + env_arg = _none_if_empty(getattr(args, "environments", None)) + if env_arg: + try: + if env_arg.startswith("@"): + with open(env_arg[1:], "r", encoding="utf-8") as fh: + environments = json.load(fh) + else: + environments = json.loads(env_arg) + except (OSError, json.JSONDecodeError) as e: + return {"success": False, "error": f"Invalid --environments: {e}"} + if not isinstance(environments, list): + return {"success": False, + "error": "--environments must be a JSON array of environment objects"} + + if not code and not file_path: + return {"success": False, "error": "Either --code or --file must be provided."} + + # Read code from file if provided + if file_path and not code: + try: + with open(file_path, "r", encoding="utf-8") as f: + code = f.read() + except FileNotFoundError: + return {"success": False, "error": f"File not found: {file_path}"} + + # Resolve "auto" compute type + if compute_type == "auto": + if cluster_id or context_id: + compute_type = "cluster" + elif language.lower() in ("scala", "r"): + compute_type = "cluster" + else: + compute_type = "serverless" + + # Serverless execution + if compute_type == "serverless": + default_timeout = timeout if timeout else 1800 + try: + result = run_code_on_serverless( + code=code, + language=language, + timeout=default_timeout, + environments=environments, + ) + except TypeError as e: + return {"success": False, "error": str(e)} + return result.to_dict() + + if environments: + return {"success": False, + "error": "--environments is only supported with --compute-type serverless"} + + # Cluster 
execution + default_timeout = timeout if timeout else 120 + try: + result = execute_databricks_command( + code=code, + cluster_id=cluster_id, + context_id=context_id, + language=language, + timeout=default_timeout, + destroy_context_on_completion=destroy_context, + ) + return result.to_dict() + except NoRunningClusterError as e: + return _no_cluster_error_response(e) + + +def cmd_list_compute(args): + """List compute resources: clusters, node types, or spark versions.""" + resource = args.resource.lower() + cluster_id = _none_if_empty(args.cluster_id) + auto_select = args.auto_select + + if resource == "clusters": + if cluster_id: + return get_cluster_status(cluster_id) + if auto_select: + try: + best = get_best_cluster() + return {"cluster_id": best} + except NoRunningClusterError as e: + return _no_cluster_error_response(e) + return {"clusters": list_clusters()} + + elif resource == "node_types": + return {"node_types": list_node_types()} + + elif resource == "spark_versions": + return {"spark_versions": list_spark_versions()} + + else: + return {"success": False, "error": f"Unknown resource: {resource}. Use: clusters, node_types, spark_versions"} + + +def cmd_manage_cluster(args): + """Create, start, terminate, or delete a cluster.""" + action = args.action.lower() + cluster_id = _none_if_empty(args.cluster_id) + name = _none_if_empty(args.name) + + if action == "create": + if not name: + return {"success": False, "error": "name is required for create action."} + return create_cluster( + name=name, + num_workers=args.num_workers or 1, + autotermination_minutes=args.autotermination_minutes or 120, + ) + + elif action == "start": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for start action."} + return start_cluster(cluster_id) + + elif action == "terminate": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for terminate action."} + return terminate_cluster(cluster_id) + + elif action == "delete": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for delete action."} + return delete_cluster(cluster_id) + + elif action == "get": + if not cluster_id: + return {"success": False, "error": "cluster_id is required for get action."} + try: + return get_cluster_status(cluster_id) + except Exception as e: + if "does not exist" in str(e).lower(): + return {"success": True, "cluster_id": cluster_id, "state": "DELETED", "exists": False} + return {"success": False, "error": str(e)} + + else: + return {"success": False, "error": f"Unknown action: {action}. 
Use: create, start, terminate, delete, get"} + + +# --------------------------------------------------------------------------- +# CLI Setup +# --------------------------------------------------------------------------- + +def main(): + parser = argparse.ArgumentParser( + description="Execute code and manage compute on Databricks", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + subparsers = parser.add_subparsers(dest="command", required=True) + + # execute-code + exec_parser = subparsers.add_parser("execute-code", help="Run code on Databricks") + exec_parser.add_argument("--code", help="Code to execute") + exec_parser.add_argument("--file", help="File to execute") + exec_parser.add_argument("--compute-type", default="auto", choices=["auto", "serverless", "cluster"], + help="Compute type (default: auto)") + exec_parser.add_argument("--cluster-id", help="Cluster ID (for cluster compute)") + exec_parser.add_argument("--context-id", help="Context ID (reuse existing context)") + exec_parser.add_argument("--language", default="python", choices=["python", "scala", "sql", "r"], + help="Language (default: python)") + exec_parser.add_argument("--timeout", type=int, help="Timeout in seconds") + exec_parser.add_argument("--destroy-context", action="store_true", help="Destroy context after execution") + exec_parser.add_argument( + "--environments", + help=( + "Serverless only. JSON array of environments (or @path/to/file.json). " + 'Example: \'[{"environment_key":"ml_env","spec":{"client":"4",' + '"dependencies":["mlflow","scikit-learn"]}}]\'. ' + 'IMPORTANT: "client":"4" installs dependencies; "1" does not.' + ), + ) + exec_parser.set_defaults(func=cmd_execute_code) + + # list-compute + list_parser = subparsers.add_parser("list-compute", help="List compute resources") + list_parser.add_argument("--resource", default="clusters", choices=["clusters", "node_types", "spark_versions"], + help="Resource to list (default: clusters)") + list_parser.add_argument("--cluster-id", help="Get specific cluster status") + list_parser.add_argument("--auto-select", action="store_true", help="Return best running cluster") + list_parser.set_defaults(func=cmd_list_compute) + + # manage-cluster + manage_parser = subparsers.add_parser("manage-cluster", help="Manage clusters") + manage_parser.add_argument("--action", required=True, choices=["create", "start", "terminate", "delete", "get"], + help="Action to perform") + manage_parser.add_argument("--cluster-id", help="Cluster ID") + manage_parser.add_argument("--name", help="Cluster name (for create)") + manage_parser.add_argument("--num-workers", type=int, help="Number of workers (for create)") + manage_parser.add_argument("--autotermination-minutes", type=int, help="Auto-termination minutes (for create)") + manage_parser.set_defaults(func=cmd_manage_cluster) + + args = parser.parse_args() + result = args.func(args) + print(json.dumps(result, indent=2, default=str)) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-genie/SKILL.md b/databricks-skills/databricks-genie/SKILL.md index 82332476..381d7c11 100644 --- a/databricks-skills/databricks-genie/SKILL.md +++ b/databricks-skills/databricks-genie/SKILL.md @@ -5,196 +5,220 @@ description: "Create and query Databricks Genie Spaces for natural language SQL # Databricks Genie -Create, manage, and query Databricks Genie Spaces - natural language interfaces for SQL-based data exploration. 
+Create, manage, and query Genie Spaces - natural language interfaces for SQL-based data exploration. ## Overview Genie Spaces allow users to ask natural language questions about structured data in Unity Catalog. The system translates questions into SQL queries, executes them on a SQL warehouse, and presents results conversationally. -## When to Use This Skill - -Use this skill when: -- Creating a new Genie Space for data exploration -- Adding sample questions to guide users -- Connecting Unity Catalog tables to a conversational interface -- Asking questions to a Genie Space programmatically (Conversation API) -- Exporting a Genie Space configuration (serialized_space) for backup or migration -- Importing / cloning a Genie Space from a serialized payload -- Migrating a Genie Space between workspaces or environments (dev โ†’ staging โ†’ prod) - - Only supports catalog remapping where catalog names differ across environments - - Not supported for schema and/or table names that differ across environments - - Not including migration of tables between environments (only migration of Genie Spaces) - -## MCP Tools - -| Tool | Purpose | -|------|---------| -| `manage_genie` | Create, get, list, delete, export, and import Genie Spaces | -| `ask_genie` | Ask natural language questions to a Genie Space | -| `get_table_stats_and_schema` | Inspect table schemas before creating a space | -| `execute_sql` | Test SQL queries directly | - -### manage_genie - Space Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Idempotent create/update a space | display_name, table_identifiers (or serialized_space) | -| `get` | Get space details | space_id | -| `list` | List all spaces | (none) | -| `delete` | Delete a space | space_id | -| `export` | Export space config for migration/backup | space_id | -| `import` | Import space from serialized config | warehouse_id, serialized_space | - -**Example tool calls:** -``` -# MCP Tool: manage_genie -# Create a new space -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=["catalog.schema.customers", "catalog.schema.orders"], - description="Explore sales data with natural language", - sample_questions=["What were total sales last month?"] -) - -# MCP Tool: manage_genie -# Get space details with full config -manage_genie(action="get", space_id="space_123", include_serialized_space=True) - -# MCP Tool: manage_genie -# List all spaces -manage_genie(action="list") - -# MCP Tool: manage_genie -# Export for migration -exported = manage_genie(action="export", space_id="space_123") - -# MCP Tool: manage_genie -# Import to new workspace -manage_genie( - action="import", - warehouse_id="warehouse_456", - serialized_space=exported["serialized_space"], - title="Sales Analytics (Prod)" -) -``` +## Creating a Genie Space -### ask_genie - Conversation API (Query) +### Step 1: Understand the Data -Ask natural language questions to a Genie Space. Pass `conversation_id` for follow-up questions. +Before creating a Genie Space, explore the available tables to: +- **Select relevant tables** โ€” typically gold layer (aggregated KPIs) and sometimes silver layer (cleaned facts) or metric views +- **Understand the story** โ€” what business questions can this data answer? What insights can users discover? 
+- **Design meaningful sample questions** โ€” questions should reflect real use cases and lead to actionable insights in the data +Use `discover-schema` as the default โ€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. + +`databricks experimental aitools tools discover-schema catalog.schema.gold_sales catalog.schema.gold_customers` + +For Genie, knowing column distribution shapes the sample questions and text instructions. If you don't already know the data, probe cardinality, ranges, and top categorical values with aggregate SQL through `databricks experimental aitools tools query --warehouse "..."` so your sample questions reflect what's actually in the data. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. + +Fan out independent probes (state โˆˆ `PENDING|RUNNING|SUCCEEDED|FAILED|CANCELED|CLOSED`): + +```bash +submit() { databricks api post /api/2.0/sql/statements --json "$(jq -nc --arg w "$1" --arg s "$2" '{warehouse_id:$w,statement:$s,wait_timeout:"0s",on_wait_timeout:"CONTINUE"}')" | jq -r .statement_id; } +SIDS=(); for q in "$@"; do SIDS+=( "$(submit "$WH" "$q")" ); done +for s in "${SIDS[@]}"; do databricks api get "/api/2.0/sql/statements/$s" | jq '{state:.status.state, rows:.result.data_array}'; done +# cancel: databricks api post "/api/2.0/sql/statements/$SID/cancel" ``` -# MCP Tool: ask_genie -# Start a new conversation -result = ask_genie( - space_id="space_123", - question="What were total sales last month?" -) -# Returns: {question, conversation_id, message_id, status, sql, columns, data, row_count} - -# MCP Tool: ask_genie -# Follow-up question in same conversation -result = ask_genie( - space_id="space_123", - question="Break that down by region", - conversation_id=result["conversation_id"] -) -``` -## Quick Start +### Step 2: Create the Space + +Define your space in a local JSON file (e.g., `genie_space.json`) for version control and easy iteration. See "serialized_space Format" below for the full structure. + +```bash +# List all Genie Spaces +databricks genie list-spaces + +# Create a Genie Space from a local file +# IMPORTANT: sample_questions require a 32-char hex "id" and "question" must be an array +databricks genie create-space --json "{ + \"warehouse_id\": \"WAREHOUSE_ID\", + \"title\": \"Sales Analytics\", + \"description\": \"Explore sales data\", + \"parent_path\": \"/Workspace/Users/you@company.com/genie_spaces\", + \"serialized_space\": $(cat genie_space.json | jq -c '.' | jq -Rs '.') +}" -### 1. Inspect Your Tables +# Get space details (with full config) +databricks genie get-space SPACE_ID --include-serialized-space -Before creating a Genie Space, understand your data: +# By default, after creation, tag the Genie Space to track resources created with this skill +databricks workspace-entity-tag-assignments create-tag-assignment \ + geniespaces SPACE_ID aidevkit_project --tag-value ai-dev-kit +# Delete a Genie Space +databricks genie trash-space SPACE_ID ``` -# MCP Tool: get_table_stats_and_schema -get_table_stats_and_schema( - catalog="my_catalog", - schema="sales", - table_stat_level="SIMPLE" -) + +### Step 3: Test and Iterate + +Use the Conversation API (section below) to ask questions and verify answers. If answers are inaccurate or incomplete, improve the space โ€” see "Improving a Genie Space" below. 
+ +### Export & Import + +**Convention:** `genie_space.json` always holds the **parsed** space object (not a JSON-string-encoded blob), so it's readable and editable. At each use site we stringify it with `jq -c '.' | jq -Rs '.'` โ€” same pattern as Step 2 Create and "Improving a Genie Space" below. `jq -r '.serialized_space | fromjson'` on export strips the outer quoting so the file is already a parsed object. + +```bash +# Export: extract serialized_space AND unwrap it to a parsed object on disk +databricks genie get-space SPACE_ID --include-serialized-space -o json \ + | jq '.serialized_space | fromjson' > genie_space.json + +# Import: same stringify pattern as Step 2 (Create) +databricks genie create-space --json "{ + \"warehouse_id\": \"WAREHOUSE_ID\", + \"title\": \"Sales Analytics\", + \"description\": \"Migrated space\", + \"parent_path\": \"/Workspace/Users/you@company.com/genie_spaces\", + \"serialized_space\": $(cat genie_space.json | jq -c '.' | jq -Rs '.') +}" ``` -### 2. Create the Genie Space +### Improving a Genie Space + +When Genie answers are inaccurate or incomplete, improve the space by updating questions, SQL examples, or instructions: + +```bash +# 1. Edit your local genie_space.json (add questions, fix SQL examples, improve instructions) +# 2. Push updates back to the space +databricks genie update-space SPACE_ID --json "{\"serialized_space\": $(cat genie_space.json | jq -c '.' | jq -Rs '.')}" ``` -# MCP Tool: manage_genie -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=[ - "my_catalog.sales.customers", - "my_catalog.sales.orders" + +## serialized_space Format + +The `serialized_space` field is a JSON string containing the full space configuration. + +### Field Format Requirements + +**IMPORTANT:** All items in `sample_questions`, `example_question_sqls`, and `text_instructions` require a unique `id` field. + +| Field | Format | +|-------|--------| +| `config.sample_questions[]` | `{"id": "32hexchars", "question": ["..."]}` | +| `instructions.example_question_sqls[]` | `{"id": "32hexchars", "question": ["..."], "sql": ["..."]}` | +| `instructions.text_instructions[]` | `{"id": "32hexchars", "content": ["..."]}` | + +- **ID format:** 32-character lowercase hex, unique across **all three lists combined** (a duplicate between e.g. `text_instructions` and `example_question_sqls` is rejected). +- **Text fields are arrays:** `question`, `sql`, and `content` are arrays of strings, not plain strings. +- **Sort order matters:** `data_sources.tables` must be sorted by `identifier`; `example_question_sqls` and `text_instructions` must be sorted by `id`. (`sample_questions` is silently re-sorted server-side.) +- **Simple ID scheme that satisfies both rules:** prefix per list + monotonic counter, total 32 hex chars โ€” `1โ€ฆ0001`, `1โ€ฆ0002` for `sample_questions`; `2โ€ฆ0001`, `2โ€ฆ0002` for `example_question_sqls`; `3โ€ฆ0001` for `text_instructions`. Authoring order = sort order, no collisions. + +### Text Instructions + +`text_instructions` make the Genie Space more reliable by explaining: +- **Where to find information** โ€” which tables contain which metrics +- **How to answer specific questions** โ€” when a user asks X, use table Y with filter Z +- **Business context** โ€” definitions, thresholds, and domain knowledge + +Well-crafted instructions significantly improve answer accuracy. + +### Example + +Top-level keys are `version`, `config`, `data_sources`, `instructions`. 
Every item in `sample_questions`, `example_question_sqls`, and `text_instructions` needs a unique 32-char hex `id` and all text fields are arrays: + +```json +{ + "version": 2, + "config": { + "sample_questions": [ + {"id": "a1b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4", "question": ["What is our current on-time performance?"]},... + ] + }, + "data_sources": { + "tables": [ + {"identifier": "catalog.ops.gold_otp_summary"},... + ] + }, + "instructions": { + "example_question_sqls": [ + { + "id": "b2c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5", + "question": ["What is our on-time performance?"], + "sql": ["SELECT flight_date, ROUND(SUM(on_time_count) * 100.0 / SUM(total_flights), 1) AS otp_pct\n", "FROM catalog.ops.gold_otp_summary\n", "WHERE flight_date >= date_sub(current_date(), 7)\n", "GROUP BY flight_date ORDER BY flight_date"] + } ], - description="Explore sales data with natural language", - sample_questions=[ - "What were total sales last month?", - "Who are our top 10 customers?" + "text_instructions": [ + { + "id": "c3d4e5f6a1b2c3d4e5f6a1b2c3d4e5f6", + "content": [ + "On-time performance (OTP) questions: Use gold_otp_summary table. OTP target is 85%.\n", + "Delay analysis questions: Use gold_delay_analysis table. Filter by delay_code for specific delay types.\n", + "When asked about 'this week' or 'recent': Use flight_date >= date_sub(current_date(), 7).\n", + "When comparing aircraft: Join with gold_aircraft_reliability on tail_number." + ] + } ] -) + } +} ``` -### 3. Ask Questions (Conversation API) - -``` -# MCP Tool: ask_genie -ask_genie( - space_id="your_space_id", - question="What were total sales last month?" -) -# Returns: SQL, columns, data, row_count -``` -### 4. Export & Import (Clone / Migrate) +## Cross-Workspace Migration -Export a space (preserves all tables, instructions, SQL examples, and layout): +When migrating between workspaces, catalog names often differ. Export the space, remap with `sed`, then import: -``` -# MCP Tool: manage_genie -exported = manage_genie(action="export", space_id="your_space_id") -# exported["serialized_space"] contains the full config +```bash +sed -i '' 's/source_catalog/target_catalog/g' genie_space.json ``` -Clone to a new space (same catalog): +Use `DATABRICKS_CONFIG_PROFILE=profile_name` to target different workspaces. -``` -# MCP Tool: manage_genie -manage_genie( - action="import", - warehouse_id=exported["warehouse_id"], - serialized_space=exported["serialized_space"], - title=exported["title"], # override title; omit to keep original - description=exported["description"], -) -``` +## Conversation API -> **Cross-workspace migration:** Each MCP server is workspace-scoped. Configure one server entry per workspace profile in your IDE's MCP config, then `manage_genie(action="export")` from the source server and `manage_genie(action="import")` via the target server. See [spaces.md ยงMigration](spaces.md#migrating-across-workspaces-with-catalog-remapping) for the full workflow. +Ask questions via three CLI primitives: `start-conversation`, `create-message` (follow-ups), and `get-message` (state + SQL + text). `--no-wait` on `start-conversation` / `create-message` returns immediately with `{conversation_id, message_id}`; poll `get-message` until `.status` is `COMPLETED`, `FAILED`, or `CANCELLED`. Intermediate states you'll see: `SUBMITTED`, `FILTERING_CONTEXT`, `ASKING_AI`, `EXECUTING_QUERY`. 
-## Reference Files +```bash +# Start a new conversation (async โ€” get IDs back immediately) +databricks genie start-conversation --no-wait SPACE_ID "What were total sales last month?" +# โ†’ {"conversation_id": "...", "message_id": "..."} -- [spaces.md](spaces.md) - Creating and managing Genie Spaces -- [conversation.md](conversation.md) - Asking questions via the Conversation API +# Poll state +databricks genie get-message SPACE_ID CONV_ID MSG_ID | jq '{status, error}' -## Prerequisites +# When COMPLETED, pull the generated SQL and any text reply +databricks genie get-message SPACE_ID CONV_ID MSG_ID \ + | jq '.attachments[] | {sql: .query.query, description: .query.description, text: .text.content}' -Before creating a Genie Space: +# Fetch the query result rows (columns + data_array) +databricks genie get-message-attachment-query-result SPACE_ID CONV_ID MSG_ID ATTACHMENT_ID \ + | jq '{columns: .statement_response.manifest.schema.columns | map({name, type: .type_name}), + rows: .statement_response.result.data_array}' + +# Follow-up in the same conversation (Genie remembers context) +databricks genie create-message --no-wait SPACE_ID CONV_ID "Break that down by region" +``` -1. **Tables in Unity Catalog** - Bronze/silver/gold tables with the data -2. **SQL Warehouse** - A warehouse to execute queries (auto-detected if not specified) +Start a new conversation for unrelated topics. Use `create-message` (same `CONV_ID`) only for follow-ups on the same topic. -### Creating Tables +On `FAILED`, `get-message` populates `.error.error` with the underlying error string (e.g. `[INSUFFICIENT_PERMISSIONS] ...`) and `.error.type` (e.g. `SQL_EXECUTION_EXCEPTION`). Attachments may still include `suggested_questions` even when the primary query failed. -Use these skills in sequence: -1. `databricks-synthetic-data-gen` - Generate raw parquet files -2. `databricks-spark-declarative-pipelines` - Create bronze/silver/gold tables +## Troubleshooting -## Common Issues +| Issue | Solution | +|-------|----------| +| `sample_question.id must be provided` | Add 32-char hex UUID `id` to each sample question | +| `Expected an array for question` | Use `"question": ["text"]` not `"question": "text"` | +| No warehouse available | Create a SQL warehouse or provide `warehouse_id` | +| Empty `serialized_space` on export | Requires CAN EDIT permission on the space | +| Tables not found after migration | Remap catalog name in `serialized_space` before import | +| Slow answers / query timeouts | Size up the warehouse attached to the space; simplify or pre-aggregate tall source tables | +| Wrong or empty answers | Add `example_question_sqls` and `text_instructions` โ€” see "Improving a Genie Space" | -See [spaces.md ยงTroubleshooting](spaces.md#troubleshooting) for a full list of issues and solutions. 
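+
+Before pushing an edited `genie_space.json` with `create-space` or `update-space`, a quick local check catches the most common rejections above. A minimal sketch: it assumes the parsed-object convention described earlier and only covers the ID, array-field, and table-sort rules, not every server-side validation:
+
+```bash
+jq -e '
+  def hexid: tostring | test("^[0-9a-f]{32}$");
+  ([.config.sample_questions[]?, .instructions.example_question_sqls[]?, .instructions.text_instructions[]?]
+    | map(.id) | all(hexid) and (length == (unique | length)))
+  and ([.config.sample_questions[]?, .instructions.example_question_sqls[]?] | all(.question | type == "array"))
+  and ([.instructions.text_instructions[]?] | all(.content | type == "array"))
+  and (.data_sources.tables | map(.identifier) | . == sort)
+' genie_space.json && echo "format looks OK" || echo "check ids, array fields, and table sort order"
+```
+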
## Related Skills -- **[databricks-agent-bricks](../databricks-agent-bricks/SKILL.md)** - Use Genie Spaces as agents inside Supervisor Agents -- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate raw parquet data to populate tables for Genie -- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables consumed by Genie Spaces -- **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - Manage the catalogs, schemas, and tables Genie queries +- **[databricks-synthetic-data-gen](../databricks-synthetic-data-gen/SKILL.md)** - Generate data for Genie tables +- **[databricks-spark-declarative-pipelines](../databricks-spark-declarative-pipelines/SKILL.md)** - Build bronze/silver/gold tables diff --git a/databricks-skills/databricks-genie/conversation.md b/databricks-skills/databricks-genie/conversation.md deleted file mode 100644 index e4320e8b..00000000 --- a/databricks-skills/databricks-genie/conversation.md +++ /dev/null @@ -1,239 +0,0 @@ -# Genie Conversations - -Use the Genie Conversation API to ask natural language questions to a curated Genie Space. - -## Overview - -The `ask_genie` tool allows you to programmatically send questions to a Genie Space and receive SQL-generated answers. Instead of writing SQL directly, you delegate the query generation to Genie, which has been curated with business logic, instructions, and certified queries. - -## When to Use `ask_genie` - -### Use `ask_genie` When: - -| Scenario | Why | -|----------|-----| -| Genie Space has curated business logic | Genie knows rules like "active customer = ordered in 90 days" | -| User explicitly says "ask Genie" or "use my Genie Space" | User intent to use their curated space | -| Complex business metrics with specific definitions | Genie has certified queries for official metrics | -| Testing a Genie Space after creating it | Validate the space works correctly | -| User wants conversational data exploration | Genie handles context for follow-up questions | - -### Use Direct SQL (`execute_sql`) Instead When: - -| Scenario | Why | -|----------|-----| -| Simple ad-hoc query | Direct SQL is faster, no curation needed | -| You already have the exact SQL | No need for Genie to regenerate | -| Genie Space doesn't exist for this data | Can't use Genie without a space | -| Need precise control over the query | Direct SQL gives exact control | - -## MCP Tools - -| Tool | Purpose | -|------|---------| -| `ask_genie` | Ask a question or follow-up (`conversation_id` optional) | - -## Basic Usage - -### Ask a Question - -```python -ask_genie( - space_id="01abc123...", - question="What were total sales last month?" -) -``` - -**Response:** -```python -{ - "question": "What were total sales last month?", - "conversation_id": "conv_xyz789", - "message_id": "msg_123", - "status": "COMPLETED", - "sql": "SELECT SUM(total_amount) AS total_sales FROM orders WHERE order_date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL 1 MONTH) AND order_date < DATE_TRUNC('month', CURRENT_DATE)", - "columns": ["total_sales"], - "data": [[125430.50]], - "row_count": 1 -} -``` - -### Ask Follow-up Questions - -Use the `conversation_id` from the first response to ask follow-up questions with context: - -```python -# First question -result = ask_genie( - space_id="01abc123...", - question="What were total sales last month?" 
-) - -# Follow-up (uses context from first question) -ask_genie( - space_id="01abc123...", - question="Break that down by region", - conversation_id=result["conversation_id"] -) -``` - -Genie remembers the context, so "that" refers to "total sales last month". - -## Response Fields - -| Field | Description | -|-------|-------------| -| `question` | The original question asked | -| `conversation_id` | ID for follow-up questions | -| `message_id` | Unique message identifier | -| `status` | `COMPLETED`, `FAILED`, `CANCELLED`, `TIMEOUT` | -| `sql` | The SQL query Genie generated | -| `columns` | List of column names in result | -| `data` | Query results as list of rows | -| `row_count` | Number of rows returned | -| `text_response` | Text explanation (if Genie asks for clarification) | -| `error` | Error message (if status is not COMPLETED) | - -## Handling Responses - -### Successful Response - -```python -result = ask_genie(space_id, "Who are our top 10 customers?") - -if result["status"] == "COMPLETED": - print(f"SQL: {result['sql']}") - print(f"Rows: {result['row_count']}") - for row in result["data"]: - print(row) -``` - -### Failed Response - -```python -result = ask_genie(space_id, "What is the meaning of life?") - -if result["status"] == "FAILED": - print(f"Error: {result['error']}") - # Genie couldn't answer - may need to rephrase or use direct SQL -``` - -### Timeout - -```python -result = ask_genie(space_id, question, timeout_seconds=60) - -if result["status"] == "TIMEOUT": - print("Query took too long - try a simpler question or increase timeout") -``` - -## Example Workflows - -### Workflow 1: User Asks to Use Genie - -``` -User: "Ask my Sales Genie what the churn rate is" - -Claude: -1. Identifies user wants to use Genie (explicit request) -2. Calls ask_genie(space_id="sales_genie_id", question="What is the churn rate?") -3. Returns: "Based on your Sales Genie, the churn rate is 4.2%. - Genie used this SQL: SELECT ..." -``` - -### Workflow 2: Testing a New Genie Space - -``` -User: "I just created a Genie Space for HR data. Can you test it?" - -Claude: -1. Gets the space_id from the user or recent manage_genie(action="create_or_update") result -2. Calls ask_genie with test questions: - - "How many employees do we have?" - - "What is the average salary by department?" -3. Reports results: "Your HR Genie is working. It correctly answered..." -``` - -### Workflow 3: Data Exploration with Follow-ups - -``` -User: "Use my analytics Genie to explore sales trends" - -Claude: -1. ask_genie(space_id, "What were total sales by month this year?") -2. User: "Which month had the highest growth?" -3. ask_genie(space_id, "Which month had the highest growth?", conversation_id=conv_id) -4. User: "What products drove that growth?" -5. 
ask_genie(space_id, "What products drove that growth?", conversation_id=conv_id) -``` - -## Best Practices - -### Start New Conversations for New Topics - -Don't reuse conversations across unrelated questions: - -```python -# Good: New conversation for new topic -result1 = ask_genie(space_id, "What were sales last month?") # New conversation -result2 = ask_genie(space_id, "How many employees do we have?") # New conversation - -# Good: Follow-up for related question -result1 = ask_genie(space_id, "What were sales last month?") -result2 = ask_genie(space_id, "Break that down by product", - conversation_id=result1["conversation_id"]) # Related follow-up -``` - -### Handle Clarification Requests - -Genie may ask for clarification instead of returning results: - -```python -result = ask_genie(space_id, "Show me the data") - -if result.get("text_response"): - # Genie is asking for clarification - print(f"Genie asks: {result['text_response']}") - # Rephrase with more specifics -``` - -### Set Appropriate Timeouts - -- Simple aggregations: 30-60 seconds -- Complex joins: 60-120 seconds -- Large data scans: 120+ seconds - -```python -# Quick question -ask_genie(space_id, "How many orders today?", timeout_seconds=30) - -# Complex analysis -ask_genie(space_id, "Calculate customer lifetime value for all customers", - timeout_seconds=180) -``` - -## Troubleshooting - -### "Genie Space not found" - -- Verify the `space_id` is correct -- Check you have access to the space -- Use `manage_genie(action="get", space_id=...)` to verify it exists - -### "Query timed out" - -- Increase `timeout_seconds` -- Simplify the question -- Check if the SQL warehouse is running - -### "Failed to generate SQL" - -- Rephrase the question more clearly -- Check if the question is answerable with the available tables -- Add more instructions/curation to the Genie Space - -### Unexpected Results - -- Review the generated SQL in the response -- Add SQL instructions to the Genie Space via the Databricks UI -- Add sample questions that demonstrate correct patterns diff --git a/databricks-skills/databricks-genie/spaces.md b/databricks-skills/databricks-genie/spaces.md deleted file mode 100644 index ff8acb60..00000000 --- a/databricks-skills/databricks-genie/spaces.md +++ /dev/null @@ -1,395 +0,0 @@ -# Creating Genie Spaces - -This guide covers creating and managing Genie Spaces for SQL-based data exploration. - -## What is a Genie Space? - -A Genie Space connects to Unity Catalog tables and translates natural language questions into SQL โ€” understanding schemas, generating queries, executing them on a SQL warehouse, and presenting results conversationally. - -## Creation Workflow - -### Step 1: Inspect Table Schemas (Required) - -**Before creating a Genie Space, you MUST inspect the table schemas** to understand what data is available: - -```python -get_table_stats_and_schema( - catalog="my_catalog", - schema="sales", - table_stat_level="SIMPLE" -) -``` - -This returns: -- Table names and row counts -- Column names and data types -- Sample values and cardinality -- Null counts and statistics - -### Step 2: Analyze and Plan - -Based on the schema information: - -1. **Select relevant tables** - Choose tables that support the user's use case -2. **Identify key columns** - Note date columns, metrics, dimensions, and foreign keys -3. **Understand relationships** - How do tables join together? -4. **Plan sample questions** - What questions can this data answer? 
- -### Step 3: Create the Genie Space - -Create the space with content tailored to the actual data: - -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - table_identifiers=[ - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products" - ], - description="""Explore retail sales data with three related tables: -- customers: Customer demographics including region, segment, and signup date -- orders: Transaction history with order_date, total_amount, and status -- products: Product catalog with category, price, and inventory - -Tables join on customer_id and product_id.""", - sample_questions=[ - "What were total sales last month?", - "Who are our top 10 customers by total_amount?", - "How many orders were placed in Q4 by region?", - "What's the average order value by customer segment?", - "Which product categories have the highest revenue?", - "Show me customers who haven't ordered in 90 days" - ] -) -``` - -## Why This Workflow Matters - -**Sample questions that reference actual column names** help Genie: -- Learn the vocabulary of your data -- Generate more accurate SQL queries -- Provide better autocomplete suggestions - -**A description that explains table relationships** helps Genie: -- Understand how to join tables correctly -- Know which table contains which information -- Provide more relevant answers - -## Auto-Detection of Warehouse - -When `warehouse_id` is not specified, the tool: - -1. Lists all SQL warehouses in the workspace -2. Prioritizes by: - - **Running** warehouses first (already available) - - **Starting** warehouses second - - **Smaller sizes** preferred (cost-efficient) -3. Returns an error if no warehouses exist - -To use a specific warehouse, provide the `warehouse_id` explicitly. - -## Table Selection - -Choose tables carefully for best results: - -| Layer | Recommended | Why | -|-------|-------------|-----| -| Bronze | No | Raw data, may have quality issues | -| Silver | Yes | Cleaned and validated | -| Gold | Yes | Aggregated, optimized for analytics | - -### Tips for Table Selection - -- **Include related tables**: If users ask about customers and orders, include both -- **Use descriptive column names**: `customer_name` is better than `cust_nm` -- **Add table comments**: Genie uses metadata to understand the data - -## Sample Questions - -Sample questions help users understand what they can ask: - -**Good sample questions:** -- "What were total sales last month?" -- "Who are our top 10 customers by revenue?" -- "How many orders were placed in Q4?" -- "What's the average order value by region?" - -These appear in the Genie UI to guide users. - -## Best Practices - -### Table Design for Genie - -1. **Descriptive names**: Use `customer_lifetime_value` not `clv` -2. **Add comments**: `COMMENT ON TABLE sales.customers IS 'Customer master data'` -3. **Primary keys**: Define relationships clearly -4. **Date columns**: Include proper date/timestamp columns for time-based queries - -### Description and Context - -Provide context in the description: - -``` -Explore retail sales data from our e-commerce platform. 
Includes: -- Customers: demographics, segments, and account status -- Orders: transaction history with amounts and dates -- Products: catalog with categories and pricing - -Time range: Last 6 months of data -``` - -### Sample Questions - -Write sample questions that: -- Cover common use cases -- Demonstrate the data's capabilities -- Use natural language (not SQL terms) - -## Updating a Genie Space - -`manage_genie(action="create_or_update")` handles both create and update automatically. There are two ways it locates an existing space to update: - -- **By `space_id`** (explicit, preferred): pass `space_id=` to target a specific space. -- **By `display_name`** (implicit fallback): if `space_id` is omitted, the tool searches for a space with a matching name and updates it if found; otherwise it creates a new one. - -### Simple field updates (tables, questions, warehouse) - -To update metadata without a serialized config: - -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", - space_id="01abc123...", # omit to match by name instead - table_identifiers=[ # updated table list - "my_catalog.sales.customers", - "my_catalog.sales.orders", - "my_catalog.sales.products", - ], - sample_questions=[ # updated sample questions - "What were total sales last month?", - "Who are our top 10 customers by revenue?", - ], - warehouse_id="abc123def456", # omit to keep current / auto-detect - description="Updated description.", -) -``` - -### Full config update via `serialized_space` - -To push a complete serialized configuration to an existing space (the dict contains all regular table metadata, plus it preserves all instructions, SQL examples, join specs, etc.): - -```python -manage_genie( - action="create_or_update", - display_name="Sales Analytics", # overrides title embedded in serialized_space - table_identifiers=[], # ignored when serialized_space is provided - space_id="01abc123...", # target space to overwrite - warehouse_id="abc123def456", # overrides warehouse embedded in serialized_space - description="Updated description.", # overrides description embedded in serialized_space; omit to keep the one in the payload - serialized_space=remapped_config, # JSON string from manage_genie(action="export") (after catalog remap if needed) -) -``` - -> **Note:** When `serialized_space` is provided, `table_identifiers` and `sample_questions` are ignored โ€” the full config comes from the serialized payload. However, `display_name`, `warehouse_id`, and `description` are still applied as top-level overrides on top of the serialized payload. Omit any of them to keep the values embedded in `serialized_space`. - -## Export, Import & Migration - -`manage_genie(action="export")` returns a dictionary with four top-level keys: - -| Key | Description | -|-----|-------------| -| `space_id` | ID of the exported space | -| `title` | Display name of the space | -| `description` | Description of the space | -| `warehouse_id` | SQL warehouse associated with the space (workspace-specific โ€” do **not** reuse across workspaces) | -| `serialized_space` | JSON-encoded string with the full space configuration (see below) | - -This envelope enables cloning, backup, and cross-workspace migration. Use `manage_genie(action="export")` and `manage_genie(action="import")` for all export/import operations โ€” no direct REST calls needed. - -### What is `serialized_space`? - -`serialized_space` is a JSON string (version 2) embedded inside the export envelope. 
Its top-level keys are: - -| Key | Contents | -|-----|----------| -| `version` | Schema version (currently `2`) | -| `config` | Space-level config: `sample_questions` shown in the UI | -| `data_sources` | `tables` array โ€” each entry has a fully-qualified `identifier` (`catalog.schema.table`) and optional `column_configs` (format assistance, entity matching per column) | -| `instructions` | `example_question_sqls` (certified Q&A pairs), `join_specs` (join relationships between tables), `sql_snippets` (`filters` and `measures` with display names and usage instructions) | -| `benchmarks` | Evaluation Q&A pairs used to measure space quality | - -Catalog names appear **everywhere** inside `serialized_space` โ€” in `data_sources.tables[].identifier`, SQL strings in `example_question_sqls`, `join_specs`, and `sql_snippets`. A single `.replace(src_catalog, tgt_catalog)` on the whole string is sufficient for catalog remapping. - -Minimum structure: -```json -{"version": 2, "data_sources": {"tables": [{"identifier": "catalog.schema.table"}]}} -``` - -### Exporting a Space - -Use `manage_genie(action="export")` to export the full configuration (requires CAN EDIT permission): - -```python -exported = manage_genie(action="export", space_id="01abc123...") -# Returns: -# { -# "space_id": "01abc123...", -# "title": "Sales Analytics", -# "description": "Explore sales data...", -# "warehouse_id": "abc123def456", -# "serialized_space": "{\"version\":2,\"data_sources\":{...},\"instructions\":{...}}" -# } -``` - -You can also get `serialized_space` inline via `manage_genie(action="get")`: - -```python -details = manage_genie(action="get", space_id="01abc123...", include_serialized_space=True) -serialized = details["serialized_space"] -``` - -### Cloning a Space (Same Workspace) - -```python -# Step 1: Export the source space -source = manage_genie(action="export", space_id="01abc123...") - -# Step 2: Import as a new space -manage_genie( - action="import", - warehouse_id=source["warehouse_id"], - serialized_space=source["serialized_space"], - title=source["title"], # override title; omit to keep original - description=source["description"], -) -# Returns: {"space_id": "01def456...", "title": "Sales Analytics (Dev Copy)", "operation": "imported"} -``` - -### Migrating Across Workspaces with Catalog Remapping - -When migrating between environments (e.g. prod โ†’ dev), Unity Catalog names are often different. The `serialized_space` string contains the source catalog name **everywhere** โ€” in table identifiers, SQL queries, join specs, and filter snippets. You must remap it before importing. - -**Agent workflow (3 steps):** - -**Step 1 โ€” Export from source workspace:** -```python -exported = manage_genie(action="export", space_id="01f106e1239d14b28d6ab46f9c15e540") -# exported keys: warehouse_id, title, description, serialized_space -# exported["serialized_space"] contains all references to source catalog -``` - -**Step 2 โ€” Remap catalog name in `serialized_space`:** - -The agent does this as an inline string substitution between the two MCP calls: -```python -modified_serialized = exported["serialized_space"].replace( - "source_catalog_name", # e.g. "healthverity_claims_sample_patient_dataset" - "target_catalog_name" # e.g. "healthverity_claims_sample_patient_dataset_dev" -) -``` -This replaces all occurrences โ€” table identifiers, SQL FROM clauses, join specs, and filter snippets. 
- -**Step 3 โ€” Import to target workspace:** -```python -manage_genie( - action="import", - warehouse_id="", # from manage_warehouse(action="list") on target - serialized_space=modified_serialized, - title=exported["title"], - description=exported["description"] -) -``` - -### Batch Migration of Multiple Spaces - -To migrate several spaces at once, loop through space IDs. The agent exports, remaps the catalog, then imports each: - -``` -For each space_id in [id1, id2, id3]: - 1. exported = manage_genie(action="export", space_id=space_id) - 2. modified = exported["serialized_space"].replace(src_catalog, tgt_catalog) - 3. result = manage_genie(action="import", warehouse_id=wh_id, serialized_space=modified, title=exported["title"], description=exported["description"]) - 4. record result["space_id"] for updating databricks.yml -``` - -After migration, update `databricks.yml` with the new dev `space_id` values under the `dev` target's `genie_space_ids` variable. - -### Updating an Existing Space with New Config - -To push a serialized config to an already-existing space (rather than creating a new one), use `manage_genie(action="create_or_update")` with `space_id=` and `serialized_space=`. The export โ†’ remap โ†’ push pattern is identical to the migration steps above; just replace `manage_genie(action="import")` with `manage_genie(action="create_or_update", space_id=TARGET_SPACE_ID, ...)` as the final call. - -### Permissions Required - -| Operation | Required Permission | -|-----------|-------------------| -| `manage_genie(action="export")` / `manage_genie(action="get", include_serialized_space=True)` | CAN EDIT on source space | -| `manage_genie(action="import")` | Can create items in target workspace folder | -| `manage_genie(action="create_or_update")` with `serialized_space` (update) | CAN EDIT on target space | - -## Example End-to-End Workflow - -1. **Generate synthetic data** using `databricks-synthetic-data-gen` skill: - - Creates parquet files in `/Volumes/catalog/schema/raw_data/` - -2. **Create tables** using `databricks-spark-declarative-pipelines` skill: - - Creates `catalog.schema.bronze_*` โ†’ `catalog.schema.silver_*` โ†’ `catalog.schema.gold_*` - -3. **Inspect the tables**: - ```python - get_table_stats_and_schema(catalog="catalog", schema="schema") - ``` - -4. **Create the Genie Space**: - - `display_name`: "My Data Explorer" - - `table_identifiers`: `["catalog.schema.silver_customers", "catalog.schema.silver_orders"]` - -5. **Add sample questions** based on actual column names - -6. **Test** in the Databricks UI - -## Troubleshooting - -### No warehouse available - -- Create a SQL warehouse in the Databricks workspace -- Or provide a specific `warehouse_id` - -### Queries are slow - -- Ensure the warehouse is running (not stopped) -- Consider using a larger warehouse size -- Check if tables are optimized (OPTIMIZE, Z-ORDER) - -### Poor query generation - -- Use descriptive column names -- Add table and column comments -- Include sample questions that demonstrate the vocabulary -- Add instructions via the Databricks Genie UI - -### `manage_genie(action="export")` returns empty `serialized_space` - -Requires at least **CAN EDIT** permission on the space. - -### `manage_genie(action="import")` fails with permission error - -Ensure you have CREATE privileges in the target workspace folder. - -### Tables not found after migration - -Catalog name was not remapped โ€” replace the source catalog name in `serialized_space` before calling `manage_genie(action="import")`. 
The catalog appears in table identifiers, SQL FROM clauses, join specs, and filter snippets; a single `.replace(src_catalog, tgt_catalog)` on the whole string covers all occurrences. - -### `manage_genie` lands in the wrong workspace - -Each MCP server is workspace-scoped. Set up two named MCP server entries (one per profile) in your IDE's MCP config instead of switching a single server's profile mid-session. - -### MCP server doesn't pick up profile change - -The MCP process reads `DATABRICKS_CONFIG_PROFILE` once at startup โ€” editing the config file requires an IDE reload to take effect. - -### `manage_genie(action="import")` fails with JSON parse error - -The `serialized_space` string may contain multi-line SQL arrays with `\n` escape sequences. Flatten SQL arrays to single-line strings before passing to avoid double-escaping issues. diff --git a/databricks-skills/databricks-iceberg/4-snowflake-interop.md b/databricks-skills/databricks-iceberg/4-snowflake-interop.md index 2f9d9536..ad6fae3a 100644 --- a/databricks-skills/databricks-iceberg/4-snowflake-interop.md +++ b/databricks-skills/databricks-iceberg/4-snowflake-interop.md @@ -196,7 +196,7 @@ Create a storage credential for the cloud storage where Snowflake stores its Ice ```bash # In Databricks CLI (AWS example) databricks storage-credentials create snowflake_storage_cred \ - --aws-iam-role-arn "arn:aws:iam::123456789012:role/snowflake-data-access" + --json '{"aws_iam_role": {"role_arn": "arn:aws:iam::123456789012:role/snowflake-data-access"}}' ``` ### Step 3: Create an External Location diff --git a/databricks-skills/databricks-jobs/SKILL.md b/databricks-skills/databricks-jobs/SKILL.md index 0f60a241..d66f87e5 100644 --- a/databricks-skills/databricks-jobs/SKILL.md +++ b/databricks-skills/databricks-jobs/SKILL.md @@ -46,8 +46,10 @@ print(f"Created job: {job.job_id}") ### CLI ```bash +# By default, add tags to track resources created with this skill databricks jobs create --json '{ "name": "my-etl-job", + "tags": {"aidevkit_project": "ai-dev-kit"}, "tasks": [{ "task_key": "extract", "notebook_task": { @@ -58,6 +60,8 @@ databricks jobs create --json '{ }' ``` +> **Tagging**: By default, add `"tags": {"aidevkit_project": "ai-dev-kit"}` to track resources created with this skill. + ### Asset Bundles (DABs) ```yaml @@ -261,8 +265,8 @@ databricks jobs get 12345 # Run job databricks jobs run-now 12345 -# Run with parameters -databricks jobs run-now 12345 --job-params '{"env": "prod"}' +# Run with parameters (must use --json with job_id inside) +databricks jobs run-now --json '{"job_id": 12345, "job_parameters": {"env": "prod"}}' # Cancel run databricks jobs cancel-run 67890 diff --git a/databricks-skills/databricks-jobs/task-types.md b/databricks-skills/databricks-jobs/task-types.md index c5b06fbe..f7c3e043 100644 --- a/databricks-skills/databricks-jobs/task-types.md +++ b/databricks-skills/databricks-jobs/task-types.md @@ -618,7 +618,6 @@ Define reusable Python environments for serverless tasks with custom pip depende > **IMPORTANT:** The `client` field is **required** in the environment `spec`. It specifies the > base serverless environment version. Use `"4"` as the value. Without it, the API returns: > `"Either base environment or version must be provided for environment"`. -> The MCP `manage_jobs` tool (action="create") auto-injects `client: "4"` if omitted, but CLI/SDK calls require it explicitly. 
### DABs YAML diff --git a/databricks-skills/databricks-jobs/triggers-schedules.md b/databricks-skills/databricks-jobs/triggers-schedules.md index 9022c715..a0c0fd48 100644 --- a/databricks-skills/databricks-jobs/triggers-schedules.md +++ b/databricks-skills/databricks-jobs/triggers-schedules.md @@ -431,8 +431,8 @@ run_result = w.jobs.run_now_and_wait(job_id=12345) # Run job databricks jobs run-now 12345 -# Run with parameters -databricks jobs run-now 12345 --job-params '{"env": "prod"}' +# Run with parameters (must use --json with job_id inside) +databricks jobs run-now --json '{"job_id": 12345, "job_parameters": {"env": "prod"}}' ``` **DABs:** diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index f471765c..82fcbcdd 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -5,330 +5,228 @@ description: "Patterns and best practices for Lakebase Autoscaling (next-gen man # Lakebase Autoscaling -Patterns and best practices for using Lakebase Autoscaling, the next-generation managed PostgreSQL on Databricks with autoscaling compute, branching, scale-to-zero, and instant restore. +Next-generation managed PostgreSQL on Databricks โ€” autoscaling compute (0.5-112 CU, ~2 GB/CU), Git-like branching, scale-to-zero, and point-in-time restore (up to 35 days). -## When to Use +**Interface: Databricks CLI (`databricks postgres ...`).** Every admin command below uses the CLI. The only place SDK is preferred is inside an application that needs to refresh 1-hour OAuth tokens โ€” see [connection-patterns.md](references/connection-patterns.md). -Use this skill when: -- Building applications that need a PostgreSQL database with autoscaling compute -- Working with database branching for dev/test/staging workflows -- Adding persistent state to applications with scale-to-zero cost savings -- Implementing reverse ETL from Delta Lake to an operational database via synced tables -- Managing Lakebase Autoscaling projects, branches, computes, or credentials +## Hierarchy -## Overview +``` +Project โ†’ Branch(es) โ†’ Endpoint(s) (compute) + Database(s) (Postgres DBs) +``` -Lakebase Autoscaling is Databricks' next-generation managed PostgreSQL service for OLTP workloads. It provides autoscaling compute, Git-like branching, scale-to-zero, and instant point-in-time restore. +A new project includes by default: a `production` branch, a primary R/W endpoint named `primary` (1 CU min/max, autoscaling on, scale-to-zero off), a `databricks_postgres` database, and a Postgres role for the creating user. -| Feature | Description | -|---------|-------------| -| **Autoscaling Compute** | 0.5-112 CU with 2 GB RAM per CU; scales dynamically based on load | -| **Scale-to-Zero** | Compute suspends after configurable inactivity timeout | -| **Branching** | Create isolated database environments (like Git branches) for dev/test | -| **Instant Restore** | Point-in-time restore from any moment within the configured window (up to 35 days) | -| **OAuth Authentication** | Token-based auth via Databricks SDK (1-hour expiry) | -| **Reverse ETL** | Sync data from Delta tables to PostgreSQL via synced tables | +Resource names are hierarchical paths: `projects/{id}/branches/{id}/endpoints/{id}`. IDs are 1-63 chars, lowercase/digits/hyphens, no leading or trailing hyphen, immutable after creation. 
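+
+A new project's default branch and endpoint can be confirmed with the list commands covered below (a sketch reusing the `my-app` project ID from the examples that follow):
+
+```bash
+databricks postgres list-branches projects/my-app                        # default: production
+databricks postgres list-endpoints projects/my-app/branches/production   # default: primary
+```
+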
-**Available Regions (AWS):** us-east-1, us-east-2, eu-central-1, eu-west-1, eu-west-2, ap-south-1, ap-southeast-1, ap-southeast-2 +**Regions (AWS):** us-east-1, us-east-2, eu-central-1, eu-west-1, eu-west-2, ap-south-1, ap-southeast-1, ap-southeast-2. **Azure (Beta):** eastus2, westeurope, westus. -**Available Regions (Azure Beta):** eastus2, westeurope, westus +All CLI create/update/delete operations are long-running โ€” the CLI blocks until the LRO completes and returns the final resource. -## Project Hierarchy +--- -Understanding the hierarchy is essential for working with Lakebase Autoscaling: +## Projects -``` -Project (top-level container) - โ””โ”€โ”€ Branch(es) (isolated database environments) - โ”œโ”€โ”€ Compute (primary R/W endpoint) - โ”œโ”€โ”€ Read Replica(s) (optional, read-only) - โ”œโ”€โ”€ Role(s) (Postgres roles) - โ””โ”€โ”€ Database(s) (Postgres databases) - โ””โ”€โ”€ Schema(s) -``` +Top-level container. One per application or environment grouping. -| Object | Description | -|--------|-------------| -| **Project** | Top-level container. Created via `w.postgres.create_project()`. | -| **Branch** | Isolated database environment with copy-on-write storage. Default branch is `production`. | -| **Compute** | Postgres server powering a branch. Configurable CU sizing and autoscaling. | -| **Database** | Standard Postgres database within a branch. Default is `databricks_postgres`. | +```bash +# Create โ€” variants: pg_version 16 | 17. PROJECT_ID is positional. +databricks postgres create-project my-app \ + --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' -## Quick Start +# Get / list +databricks postgres get-project projects/my-app +databricks postgres list-projects -Create a project and connect: +# Update โ€” positional arg is the field mask +databricks postgres update-project projects/my-app spec.display_name \ + --json '{"spec": {"display_name": "Renamed App"}}' -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Project, ProjectSpec +# Delete (irreversible โ€” wipes all branches/data; drop UC catalogs/synced tables first) +databricks postgres delete-project projects/my-app +``` -w = WorkspaceClient() +โ†’ Defaults, limits table, LRO mechanics, SDK equivalents: [projects.md](references/projects.md). -# Create a project (long-running operation) -operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" -) -result = operation.wait() -print(f"Created project: {result.name}") -``` +--- -## Common Patterns +## Branches -### Generate OAuth Token +Isolated DB environments sharing storage with their parent via copy-on-write. The default branch is `production` (cannot be deleted). Branches can be TTL-expiring or permanent. 
-```python -from databricks.sdk import WorkspaceClient +```bash +# Create โ€” variants: "ttl": "604800s" (TTL in seconds) | "no_expiry": true (permanent) +databricks postgres create-branch projects/my-app development \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", "ttl": "604800s"}}' -w = WorkspaceClient() +# Get / list +databricks postgres get-branch projects/my-app/branches/development +databricks postgres list-branches projects/my-app -# Generate database credential for connecting (optionally scoped to an endpoint) -cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" -) -token = cred.token # Use as password in connection string -# Token expires after 1 hour -``` +# Protect (protected branches can't be deleted/reset/archived) +databricks postgres update-branch projects/my-app/branches/production \ + spec.is_protected --json '{"spec": {"is_protected": true}}' -### Connect from Notebook +# Reset to parent's latest state (destroys local changes; not for root/protected/parent branches) +databricks postgres reset-branch projects/my-app/branches/development -```python -import psycopg -from databricks.sdk import WorkspaceClient +# Delete (children must be deleted first; protection must be removed first) +databricks postgres delete-branch projects/my-app/branches/development +``` -w = WorkspaceClient() +โ†’ Copy-on-write internals, TTL rules (max 30 days), reset constraints, SDK equivalents: [branches.md](references/branches.md). -# Get endpoint details -endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary" -) -host = endpoint.status.hosts.host +--- -# Generate token (scoped to endpoint) -cred = w.postgres.generate_database_credential( - endpoint="projects/my-app/branches/production/endpoints/ep-primary" -) +## Endpoints (Compute) -# Connect using psycopg3 -conn_string = ( - f"host={host} " - f"dbname=databricks_postgres " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" -) -with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) +A compute runs Postgres for one branch. One R/W endpoint per branch (plus optional read replicas). Autoscale range: 0.5-32 CU with max-min โ‰ค 16 CU. Large fixed sizes: 36-112 CU. + +```bash +# Create an R/W endpoint โ€” replace RW with ENDPOINT_TYPE_READ_ONLY for read replicas +databricks postgres create-endpoint \ + projects/my-app/branches/production my-compute \ + --json '{"spec": {"endpoint_type": "ENDPOINT_TYPE_READ_WRITE", + "autoscaling_limit_min_cu": 0.5, + "autoscaling_limit_max_cu": 4.0}}' + +# Get host, state, CU range +databricks postgres get-endpoint projects/my-app/branches/production/endpoints/primary + +# List all endpoints on a branch +databricks postgres list-endpoints projects/my-app/branches/production + +# Resize โ€” mask is a comma-separated positional; JSON holds new values +databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' + +# Delete +databricks postgres delete-endpoint projects/my-app/branches/production/endpoints/my-compute ``` -### Create a Branch for Development +**Scale-to-zero:** off on `production` by default, configurable elsewhere (min 60s, default 5min). 
Reactivation takes ~100ms; session context (temp tables, prepared statements, in-memory cache) is **reset** on wake. -```python -from databricks.sdk.service.postgres import Branch, BranchSpec, Duration - -# Create a dev branch with 7-day expiration -branch = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" -).wait() -print(f"Branch created: {branch.name}") -``` +โ†’ CU sizing table, autoscaling math, scale-to-zero internals, SDK equivalents: [computes.md](references/computes.md). -### Resize Compute (Autoscaling) +--- -```python -from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask - -# Update compute to autoscale between 2-8 CU -w.postgres.update_endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - endpoint=Endpoint( - name="projects/my-app/branches/production/endpoints/ep-primary", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - autoscaling_limit_max_cu=8.0 - ) - ), - update_mask=FieldMask(field_mask=[ - "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) -).wait() -``` +## Credentials & Connecting -## MCP Tools +OAuth tokens are 1-hour TTL and used as the Postgres password with `sslmode=require`. -The following MCP tools are available for managing Lakebase infrastructure. Use `type="autoscale"` for Lakebase Autoscaling. +```bash +# Generate a scoped OAuth token (use as PGPASSWORD). ENDPOINT is positional. +databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary +``` -### manage_lakebase_database - Project Management +Full connection-string recipe (use with `psql`, psycopg, or any Postgres client): -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create or update a project | name | -| `get` | Get project details (includes branches/endpoints) | name | -| `list` | List all projects | (none, optional type filter) | -| `delete` | Delete project and all branches/computes/data | name | +```bash +ENDPOINT="projects/my-app/branches/production/endpoints/primary" +HOST=$(databricks postgres get-endpoint "$ENDPOINT" | jq -r '.status.hosts.host') +USER=$(databricks current-user me | jq -r '.userName') +TOKEN=$(databricks postgres generate-database-credential "$ENDPOINT" | jq -r '.token') -**Example usage:** -```python -# Create an autoscale project -manage_lakebase_database( - action="create_or_update", - name="my-app", - type="autoscale", - display_name="My Application", - pg_version="17" -) +# psycopg keyword form: +echo "host=$HOST dbname=databricks_postgres user=$USER password=$TOKEN sslmode=require" -# Get project with branches -manage_lakebase_database(action="get", name="my-app", type="autoscale") +# Postgres URI form (the user is an email, so URL-encode @ as %40): +echo "postgresql://${USER/@/%40}:$TOKEN@$HOST:5432/databricks_postgres?sslmode=require" -# Delete project -manage_lakebase_database(action="delete", name="my-app", type="autoscale") +# Connect with psql: +PGPASSWORD="$TOKEN" psql "host=$HOST dbname=databricks_postgres user=$USER sslmode=require" ``` -### manage_lakebase_branch - Branch Management +Token TTL is ~1 hour. For app deployment, store **only the endpoint path** as config and generate the token at startup (and every 45 min thereafter) โ€” never bake the token into env files. 
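+
+For interactive work, the recipe above can be wrapped so every call fetches a fresh token instead of reusing one near the end of its TTL. A minimal sketch; `lakebase_psql` is a hypothetical helper name:
+
+```bash
+lakebase_psql() {
+  local ep="$1"; shift
+  local host user token
+  host=$(databricks postgres get-endpoint "$ep" | jq -r '.status.hosts.host')
+  user=$(databricks current-user me | jq -r '.userName')
+  token=$(databricks postgres generate-database-credential "$ep" | jq -r '.token')
+  PGPASSWORD="$token" psql "host=$host dbname=databricks_postgres user=$user sslmode=require" "$@"
+}
+# Usage: lakebase_psql projects/my-app/branches/production/endpoints/primary -c 'SELECT version()'
+```
+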
-| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create/update branch with compute endpoint | project_name, branch_id | -| `delete` | Delete branch and endpoints | name (full branch name) | +Application code is the one place to use the SDK โ€” tokens expire hourly and must be refreshed in-process. -**Example usage:** ```python -# Create a dev branch with 7-day TTL -manage_lakebase_branch( - action="create_or_update", - project_name="my-app", - branch_id="development", - source_branch="production", - ttl_seconds=604800, # 7 days - autoscaling_limit_min_cu=0.5, - autoscaling_limit_max_cu=4.0, - scale_to_zero_seconds=300 -) +# Application code โ€” refresh token every 45 min: +import psycopg +from databricks.sdk import WorkspaceClient -# Delete branch -manage_lakebase_branch(action="delete", name="projects/my-app/branches/development") +w = WorkspaceClient() +ep = "projects/my-app/branches/production/endpoints/primary" +host = w.postgres.get_endpoint(name=ep).status.hosts.host +token = w.postgres.generate_database_credential(endpoint=ep).token +conn = psycopg.connect( + f"host={host} dbname=databricks_postgres " + f"user={w.current_user.me().user_name} password={token} sslmode=require" +) ``` -### generate_lakebase_credential - OAuth Tokens +โ†’ Runtime connection patterns (minimal SDK snippet, SQLAlchemy pooling, async refresh loop, macOS DNS workaround, static-URL local dev): [connection-patterns.md](references/connection-patterns.md). -Generate OAuth token (~1hr) for PostgreSQL connections. Use as password with `sslmode=require`. +--- -```python -# For autoscale endpoints -generate_lakebase_credential(endpoint="projects/my-app/branches/production/endpoints/ep-primary") -``` +## Reverse ETL (Synced Tables) -## Reference Files +Syncs Unity Catalog Delta tables into Lakebase as Postgres tables via managed Lakeflow pipelines. Modes: `SNAPSHOT` (one-shot) | `TRIGGERED` (scheduled, needs CDF) | `CONTINUOUS` (~15s latency, needs CDF). -- [projects.md](projects.md) - Project management patterns and settings -- [branches.md](branches.md) - Branching workflows, protection, and expiration -- [computes.md](computes.md) - Compute sizing, autoscaling, and scale-to-zero -- [connection-patterns.md](connection-patterns.md) - Connection patterns for different use cases -- [reverse-etl.md](reverse-etl.md) - Synced tables from Delta Lake to Lakebase +```bash +# Create a synced table โ€” swap scheduling_policy to SNAPSHOT | TRIGGERED | CONTINUOUS +databricks postgres create-synced-table lakebase_catalog.schema.synced_table \ + --json '{"spec": {"source_table_full_name": "analytics.gold.user_profiles", + "primary_key_columns": ["user_id"], + "scheduling_policy": "TRIGGERED", + "new_pipeline_spec": {"storage_catalog": "lakebase_catalog", + "storage_schema": "staging"}}}' + +# Status (detailed_state shows sync progress) +databricks postgres get-synced-table synced_tables/lakebase_catalog.schema.synced_table + +# Delete (also drop the Postgres-side table separately) +databricks postgres delete-synced-table synced_tables/lakebase_catalog.schema.synced_table +``` -## CLI Quick Reference +Enable CDF on the source for TRIGGERED/CONTINUOUS: `ALTER TABLE ... SET TBLPROPERTIES (delta.enableChangeDataFeed = true)`. Each synced table uses up to 16 connections and counts against per-branch limits. 
-```bash -# Create a project -databricks postgres create-project \ - --project-id my-app \ - --json '{"spec": {"display_name": "My App", "pg_version": "17"}}' +โ†’ Mode comparison, type mapping (UC โ†’ Postgres), capacity planning, schema evolution rules, SDK equivalents: [reverse-etl.md](references/reverse-etl.md). -# List projects -databricks postgres list-projects +--- -# Get project details -databricks postgres get-project projects/my-app +## Common Issues -# Create a branch -databricks postgres create-branch projects/my-app development \ - --json '{"spec": {"source_branch": "projects/my-app/branches/production", "no_expiry": true}}' +| Issue | Solution | +|-------|----------| +| Token expired during long query | Refresh tokens every 45 min (1h TTL) | +| Connection refused after scale-to-zero | Compute wakes on connect (~100ms); add retry logic | +| DNS resolution fails on macOS | Pass `hostaddr` (resolved via `dig`) alongside `host` to psycopg | +| Branch delete blocked | Delete child branches first; remove protection first | +| Autoscaling range rejected | max-min must be โ‰ค 16 CU (e.g., 4-20 valid; 0.5-32 invalid) | +| SSL required error | Always `sslmode=require` | +| Update mask required | CLI `update-*` commands take the mask as a positional arg | +| Connection closed after 24h idle | 24h idle timeout, 3-day max lifetime โ€” add retry | -# List branches -databricks postgres list-branches projects/my-app +## Databricks Apps Integration -# Get endpoint details -databricks postgres get-endpoint projects/my-app/branches/production/endpoints/ep-primary +Scaffold an app connected to Lakebase at creation time: -# Delete a project -databricks postgres delete-project projects/my-app +```bash +databricks apps init --name my-app \ + --features lakebase \ + --set "lakebase.postgres.branch=production" \ + --set "lakebase.postgres.database=databricks_postgres" ``` -## Key Differences from Lakebase Provisioned +## High Availability -| Aspect | Provisioned | Autoscaling | -|--------|-------------|-------------| -| SDK module | `w.database` | `w.postgres` | -| Top-level resource | Instance | Project | -| Capacity | CU_1, CU_2, CU_4, CU_8 (16 GB/CU) | 0.5-112 CU (2 GB/CU) | -| Branching | Not supported | Full branching support | -| Scale-to-zero | Not supported | Configurable timeout | -| Operations | Synchronous | Long-running operations (LRO) | -| Read replicas | Readable secondaries | Dedicated read-only endpoints | +HA adds 1โ€“3 read secondaries across availability zones with automatic failover. Secondaries are accessible via a `-ro` suffix on the host and independently autoscale (but won't drop below the primary's current CU). HA is incompatible with scale-to-zero. See [computes.md](references/computes.md) for sizing constraints. 
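+
+Several rows in the Common Issues table above end with "add retry logic". For scripts, a few retries around the `lakebase_psql` helper sketched under "Credentials & Connecting" are usually enough (a sketch, not tuned for production):
+
+```bash
+# Covers scale-to-zero wake-up, idle disconnects, and token expiry (the helper fetches a fresh token per call)
+for attempt in 1 2 3; do
+  lakebase_psql projects/my-app/branches/production/endpoints/primary -c 'SELECT 1' && break
+  sleep $((attempt * 5))   # give a suspended compute a moment to wake before retrying
+done
+```
+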
-## Common Issues +## Lakehouse Sync (Beta โ€” AWS only) -| Issue | Solution | -|-------|----------| -| **Token expired during long query** | Implement token refresh loop; tokens expire after 1 hour | -| **Connection refused after scale-to-zero** | Compute wakes automatically on connection; reactivation takes a few hundred ms; implement retry logic | -| **DNS resolution fails on macOS** | Use `dig` command to resolve hostname, pass `hostaddr` to psycopg | -| **Branch deletion blocked** | Delete child branches first; cannot delete branches with children | -| **Autoscaling range too wide** | Max - min cannot exceed 8 CU (e.g., 8-16 CU is valid, 0.5-32 CU is not) | -| **SSL required error** | Always use `sslmode=require` in connection string | -| **Update mask required** | All update operations require an `update_mask` specifying fields to modify | -| **Connection closed after 24h idle** | All connections have a 24-hour idle timeout and 3-day max lifetime; implement retry logic | - -## Current Limitations - -These features are NOT yet supported in Lakebase Autoscaling: -- High availability with readable secondaries (use read replicas instead) -- Databricks Apps UI integration (Apps can connect manually via credentials) -- Feature Store integration -- Stateful AI agents (LangChain memory) -- Postgres-to-Delta sync (only Delta-to-Postgres reverse ETL) -- Custom billing tags and serverless budget policies -- Direct migration from Lakebase Provisioned (use pg_dump/pg_restore or reverse ETL) - -## SDK Version Requirements - -- **Databricks SDK for Python**: >= 0.81.0 (for `w.postgres` module) -- **psycopg**: 3.x (supports `hostaddr` parameter for DNS workaround) -- **SQLAlchemy**: 2.x with `postgresql+psycopg` driver - -```python -%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy -``` +Reverse direction: continuously streams Postgres changes from Lakebase into Unity Catalog Delta tables via CDC. Azure support TBD. Enable via the project UI. -## Notes +## Not Yet Supported -- **Compute Units** in Autoscaling provide ~2 GB RAM each (vs 16 GB in Provisioned). -- **Resource naming** follows hierarchical paths: `projects/{id}/branches/{id}/endpoints/{id}`. -- All create/update/delete operations are **long-running** -- use `.wait()` in the SDK. -- Tokens are short-lived (1 hour) -- production apps MUST implement token refresh. -- **Postgres versions** 16 and 17 are supported. +Custom billing tags / serverless budget policies. 
## Related Skills -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - fixed-capacity managed PostgreSQL (predecessor) -- **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence -- **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend -- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for project management and token generation -- **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps with Lakebase resources -- **[databricks-jobs](../databricks-jobs/SKILL.md)** - scheduling reverse ETL sync jobs +- [databricks-app-apx](../databricks-app-apx/SKILL.md), [databricks-app-python](../databricks-app-python/SKILL.md) โ€” apps using Lakebase +- [databricks-bundles](../databricks-bundles/SKILL.md) โ€” bundle deploys with Lakebase resources +- [databricks-jobs](../databricks-jobs/SKILL.md) โ€” scheduling reverse ETL pipelines diff --git a/databricks-skills/databricks-lakebase-autoscale/branches.md b/databricks-skills/databricks-lakebase-autoscale/branches.md deleted file mode 100644 index f44f7234..00000000 --- a/databricks-skills/databricks-lakebase-autoscale/branches.md +++ /dev/null @@ -1,212 +0,0 @@ -# Lakebase Autoscaling Branches - -## Overview - -Branches in Lakebase Autoscaling are isolated database environments that share storage with their parent through copy-on-write. They enable Git-like workflows for databases: create isolated dev/test environments, test schema changes safely, and recover from mistakes. - -## Branch Types - -| Option | Description | Use Case | -|--------|-------------|----------| -| **Current data** | Branch from latest state of parent | Development, testing with current data | -| **Past data** | Branch from a specific point in time | Point-in-time recovery, historical analysis | - -## Creating a Branch - -### With Expiration (TTL) - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Branch, BranchSpec, Duration - -w = WorkspaceClient() - -# Create branch with 7-day expiration -result = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - ttl=Duration(seconds=604800) # 7 days - ) - ), - branch_id="development" -).wait() - -print(f"Branch created: {result.name}") -print(f"Expires: {result.status.expire_time}") -``` - -### Permanent Branch (No Expiration) - -```python -result = w.postgres.create_branch( - parent="projects/my-app", - branch=Branch( - spec=BranchSpec( - source_branch="projects/my-app/branches/production", - no_expiry=True - ) - ), - branch_id="staging" -).wait() -``` - -### CLI - -```bash -# With TTL -databricks postgres create-branch projects/my-app development \ - --json '{ - "spec": { - "source_branch": "projects/my-app/branches/production", - "ttl": "604800s" - } - }' - -# Permanent -databricks postgres create-branch projects/my-app staging \ - --json '{ - "spec": { - "source_branch": "projects/my-app/branches/production", - "no_expiry": true - } - }' -``` - -## Getting Branch Details - -```python -branch = w.postgres.get_branch( - name="projects/my-app/branches/development" -) - -print(f"Branch: {branch.name}") -print(f"Protected: {branch.status.is_protected}") -print(f"Default: {branch.status.default}") -print(f"State: {branch.status.current_state}") -print(f"Size: {branch.status.logical_size_bytes} bytes") -``` - -## Listing 
Branches - -```python -branches = list(w.postgres.list_branches( - parent="projects/my-app" -)) - -for branch in branches: - print(f"Branch: {branch.name}") - print(f" Default: {branch.status.default}") - print(f" Protected: {branch.status.is_protected}") -``` - -## Protecting a Branch - -Protected branches cannot be deleted, reset, or archived. - -```python -from databricks.sdk.service.postgres import Branch, BranchSpec, FieldMask - -w.postgres.update_branch( - name="projects/my-app/branches/production", - branch=Branch( - name="projects/my-app/branches/production", - spec=BranchSpec(is_protected=True) - ), - update_mask=FieldMask(field_mask=["spec.is_protected"]) -).wait() -``` - -To remove protection: - -```python -w.postgres.update_branch( - name="projects/my-app/branches/production", - branch=Branch( - name="projects/my-app/branches/production", - spec=BranchSpec(is_protected=False) - ), - update_mask=FieldMask(field_mask=["spec.is_protected"]) -).wait() -``` - -## Updating Branch Expiration - -```python -# Extend to 14 days -w.postgres.update_branch( - name="projects/my-app/branches/development", - branch=Branch( - name="projects/my-app/branches/development", - spec=BranchSpec( - is_protected=False, - ttl=Duration(seconds=1209600) # 14 days - ) - ), - update_mask=FieldMask(field_mask=["spec.is_protected", "spec.expiration"]) -).wait() - -# Remove expiration -w.postgres.update_branch( - name="projects/my-app/branches/development", - branch=Branch( - name="projects/my-app/branches/development", - spec=BranchSpec(no_expiry=True) - ), - update_mask=FieldMask(field_mask=["spec.expiration"]) -).wait() -``` - -## Resetting a Branch from Parent - -Reset completely replaces a branch's data and schema with the latest from its parent. Local changes are lost. - -```python -w.postgres.reset_branch( - name="projects/my-app/branches/development" -).wait() -``` - -**Constraints:** -- Root branches (like `production`) cannot be reset (no parent) -- Branches with children cannot be reset (delete children first) -- Connections are temporarily interrupted during reset - -## Deleting a Branch - -```python -w.postgres.delete_branch( - name="projects/my-app/branches/development" -).wait() -``` - -**Constraints:** -- Cannot delete branches with child branches (delete children first) -- Cannot delete protected branches (remove protection first) -- Cannot delete the default branch - -## Branch Expiration - -Branch expiration sets an automatic deletion timestamp. Useful for: -- **CI/CD environments**: 2-4 hours -- **Demos**: 24-48 hours -- **Feature development**: 1-7 days -- **Long-term testing**: up to 30 days - -**Maximum expiration period:** 30 days from current time. - -### Expiration Restrictions - -- Cannot expire protected branches -- Cannot expire default branches -- Cannot expire branches that have children -- When a branch expires, all compute resources are also deleted - -## Best Practices - -1. **Use TTL for ephemeral branches**: Set expiration for dev/test branches to avoid accumulation -2. **Protect production branches**: Prevent accidental deletion or reset -3. **Reset instead of recreate**: Use reset from parent when you need fresh data without new branch overhead -4. **Schema diff before merge**: Compare schemas between branches before applying changes to production -5. 
**Monitor unarchived limit**: Only 10 unarchived branches are allowed per project diff --git a/databricks-skills/databricks-lakebase-autoscale/computes.md b/databricks-skills/databricks-lakebase-autoscale/computes.md deleted file mode 100644 index 0f53d50c..00000000 --- a/databricks-skills/databricks-lakebase-autoscale/computes.md +++ /dev/null @@ -1,208 +0,0 @@ -# Lakebase Autoscaling Computes - -## Overview - -A compute is a virtualized service that runs Postgres for a branch. Each branch has one primary read-write compute and can have optional read replicas. Computes support autoscaling, scale-to-zero, and granular sizing from 0.5 to 112 CU. - -## Compute Sizing - -Each Compute Unit (CU) allocates approximately 2 GB of RAM. - -### Available Sizes - -| Category | Range | Notes | -|----------|-------|-------| -| **Autoscale computes** | 0.5-32 CU | Dynamic scaling within range (max-min <= 8 CU) | -| **Large fixed-size** | 36-112 CU | Fixed size, no autoscaling | - -### Representative Sizes - -| Compute Units | RAM | Max Connections | -|--------------|-----|-----------------| -| 0.5 CU | ~1 GB | 104 | -| 1 CU | ~2 GB | 209 | -| 4 CU | ~8 GB | 839 | -| 8 CU | ~16 GB | 1,678 | -| 16 CU | ~32 GB | 3,357 | -| 32 CU | ~64 GB | 4,000 | -| 64 CU | ~128 GB | 4,000 | -| 112 CU | ~224 GB | 4,000 | - -**Note:** Lakebase Provisioned used ~16 GB per CU. Autoscaling uses ~2 GB per CU for more granular scaling. - -## Creating a Compute - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Endpoint, EndpointSpec, EndpointType - -w = WorkspaceClient() - -# Create a read-write compute endpoint -result = w.postgres.create_endpoint( - parent="projects/my-app/branches/production", - endpoint=Endpoint( - spec=EndpointSpec( - endpoint_type=EndpointType.ENDPOINT_TYPE_READ_WRITE, - autoscaling_limit_min_cu=0.5, - autoscaling_limit_max_cu=4.0 - ) - ), - endpoint_id="my-compute" -).wait() - -print(f"Endpoint created: {result.name}") -print(f"Host: {result.status.hosts.host}") -``` - -### CLI - -```bash -databricks postgres create-endpoint \ - projects/my-app/branches/production my-compute \ - --json '{ - "spec": { - "endpoint_type": "ENDPOINT_TYPE_READ_WRITE", - "autoscaling_limit_min_cu": 0.5, - "autoscaling_limit_max_cu": 4.0 - } - }' -``` - -**Important:** Each branch can have only one read-write compute. 
- -## Getting Compute Details - -```python -endpoint = w.postgres.get_endpoint( - name="projects/my-app/branches/production/endpoints/my-compute" -) - -print(f"Endpoint: {endpoint.name}") -print(f"Type: {endpoint.status.endpoint_type}") -print(f"State: {endpoint.status.current_state}") -print(f"Host: {endpoint.status.hosts.host}") -print(f"Min CU: {endpoint.status.autoscaling_limit_min_cu}") -print(f"Max CU: {endpoint.status.autoscaling_limit_max_cu}") -``` - -## Listing Computes - -```python -endpoints = list(w.postgres.list_endpoints( - parent="projects/my-app/branches/production" -)) - -for ep in endpoints: - print(f"Endpoint: {ep.name}") - print(f" Type: {ep.status.endpoint_type}") - print(f" CU Range: {ep.status.autoscaling_limit_min_cu}-{ep.status.autoscaling_limit_max_cu}") -``` - -## Resizing a Compute - -Use `update_mask` to specify which fields to update: - -```python -from databricks.sdk.service.postgres import Endpoint, EndpointSpec, FieldMask - -# Update min and max CU -w.postgres.update_endpoint( - name="projects/my-app/branches/production/endpoints/my-compute", - endpoint=Endpoint( - name="projects/my-app/branches/production/endpoints/my-compute", - spec=EndpointSpec( - autoscaling_limit_min_cu=2.0, - autoscaling_limit_max_cu=8.0 - ) - ), - update_mask=FieldMask(field_mask=[ - "spec.autoscaling_limit_min_cu", - "spec.autoscaling_limit_max_cu" - ]) -).wait() -``` - -### CLI - -```bash -# Update single field -databricks postgres update-endpoint \ - projects/my-app/branches/production/endpoints/my-compute \ - spec.autoscaling_limit_max_cu \ - --json '{"spec": {"autoscaling_limit_max_cu": 8.0}}' - -# Update multiple fields -databricks postgres update-endpoint \ - projects/my-app/branches/production/endpoints/my-compute \ - "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ - --json '{"spec": {"autoscaling_limit_min_cu": 2.0, "autoscaling_limit_max_cu": 8.0}}' -``` - -## Deleting a Compute - -```python -w.postgres.delete_endpoint( - name="projects/my-app/branches/production/endpoints/my-compute" -).wait() -``` - -## Autoscaling - -Autoscaling dynamically adjusts compute resources based on workload demand. - -### Configuration - -- **Range:** 0.5-32 CU -- **Constraint:** Max - Min cannot exceed 8 CU -- **Valid examples:** 4-8 CU, 8-16 CU, 16-24 CU -- **Invalid example:** 0.5-32 CU (range of 31.5 CU) - -### Best Practices - -- Set minimum CU large enough to cache your working set in memory -- Performance may be degraded until compute scales up and caches data -- Connection limits are based on the maximum CU in the range - -## Scale-to-Zero - -Automatically suspends compute after a period of inactivity. - -| Setting | Description | -|---------|-------------| -| **Enabled** | Compute suspends after inactivity timeout (saves cost) | -| **Disabled** | Always-active compute (eliminates wake-up latency) | - -**Default behavior:** -- `production` branch: Scale-to-zero **disabled** (always active) -- Other branches: Scale-to-zero can be configured - -**Default inactivity timeout:** 5 minutes -**Minimum inactivity timeout:** 60 seconds - -### Wake-up Behavior - -When a connection arrives on a suspended compute: -1. Compute starts automatically (reactivation takes a few hundred milliseconds) -2. The connection request is handled transparently once active -3. Compute restarts at minimum autoscaling size (if autoscaling enabled) -4. 
Applications should implement connection retry logic for the brief reactivation period - -### Session Context After Reactivation - -When a compute suspends and reactivates, session context is **reset**: -- In-memory statistics and cache contents are cleared -- Temporary tables and prepared statements are lost -- Session-specific configuration settings reset -- Connection pools and active transactions are terminated - -If your application requires persistent session data, consider disabling scale-to-zero. - -## Sizing Guidance - -| Factor | Recommendation | -|--------|---------------| -| Query complexity | Complex analytical queries benefit from larger computes | -| Concurrent connections | More connections need more CPU and memory | -| Data volume | Larger datasets may need more memory for performance | -| Response time | Critical apps may require larger computes | diff --git a/databricks-skills/databricks-lakebase-autoscale/projects.md b/databricks-skills/databricks-lakebase-autoscale/projects.md deleted file mode 100644 index 659207a4..00000000 --- a/databricks-skills/databricks-lakebase-autoscale/projects.md +++ /dev/null @@ -1,204 +0,0 @@ -# Lakebase Autoscaling Projects - -## Overview - -A project is the top-level container for Lakebase Autoscaling resources, including branches, computes, databases, and roles. Each project is isolated and contains its own Postgres version, compute defaults, and restore window settings. - -## Project Structure - -``` -Project - โ””โ”€โ”€ Branches (production, development, staging, etc.) - โ”œโ”€โ”€ Computes (R/W compute, read replicas) - โ”œโ”€โ”€ Roles (Postgres roles) - โ””โ”€โ”€ Databases (Postgres databases) -``` - -When a project is created, it includes by default: -- A `production` branch (the default branch) -- A primary read-write compute (8-32 CU, autoscaling enabled, scale-to-zero disabled) -- A `databricks_postgres` database -- A Postgres role for the creating user's Databricks identity - -## Resource Naming - -Projects follow a hierarchical naming convention: -``` -projects/{project_id} -``` - -**Resource ID requirements:** -- 1-63 characters long -- Lowercase letters, digits, and hyphens only -- Cannot start or end with a hyphen -- Cannot be changed after creation - -## Creating a Project - -### Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.postgres import Project, ProjectSpec - -w = WorkspaceClient() - -# Create a project (long-running operation) -operation = w.postgres.create_project( - project=Project( - spec=ProjectSpec( - display_name="My Application", - pg_version="17" - ) - ), - project_id="my-app" -) - -# Wait for completion -result = operation.wait() -print(f"Created project: {result.name}") -print(f"Display name: {result.status.display_name}") -print(f"Postgres version: {result.status.pg_version}") -``` - -### CLI - -```bash -databricks postgres create-project \ - --project-id my-app \ - --json '{ - "spec": { - "display_name": "My Application", - "pg_version": "17" - } - }' -``` - -## Getting Project Details - -### Python SDK - -```python -project = w.postgres.get_project(name="projects/my-app") - -print(f"Project: {project.name}") -print(f"Display name: {project.status.display_name}") -print(f"Postgres version: {project.status.pg_version}") -``` - -### CLI - -```bash -databricks postgres get-project projects/my-app -``` - -**Note:** The `spec` field is not populated for GET operations. All properties are returned in the `status` field. 
- -## Listing Projects - -```python -projects = w.postgres.list_projects() - -for project in projects: - print(f"Project: {project.name}") - print(f" Display name: {project.status.display_name}") - print(f" Postgres version: {project.status.pg_version}") -``` - -## Updating a Project - -Updates require an `update_mask` specifying which fields to modify: - -```python -from databricks.sdk.service.postgres import Project, ProjectSpec, FieldMask - -# Update display name -operation = w.postgres.update_project( - name="projects/my-app", - project=Project( - name="projects/my-app", - spec=ProjectSpec( - display_name="My Updated Application" - ) - ), - update_mask=FieldMask(field_mask=["spec.display_name"]) -) -result = operation.wait() -``` - -### CLI - -```bash -databricks postgres update-project projects/my-app spec.display_name \ - --json '{ - "spec": { - "display_name": "My Updated Application" - } - }' -``` - -## Deleting a Project - -**WARNING:** Deleting a project is permanent and also deletes all branches, computes, databases, roles, and data. - -Delete all Unity Catalog catalogs and synced tables before deleting the project. - -```python -operation = w.postgres.delete_project(name="projects/my-app") -# This is a long-running operation -``` - -### CLI - -```bash -databricks postgres delete-project projects/my-app -``` - -## Project Settings - -### Compute Defaults - -Default settings for new primary computes: -- Compute size range (0.5-112 CU) -- Scale-to-zero timeout (default: 5 minutes) - -### Instant Restore - -Configure the restore window length (2-35 days). Longer windows increase storage costs. - -### Postgres Version - -Supports Postgres 16 and Postgres 17. - -## Project Limits - -| Resource | Limit | -|----------|-------| -| Concurrently active computes | 20 | -| Branches per project | 500 | -| Postgres roles per branch | 500 | -| Postgres databases per branch | 500 | -| Logical data size per branch | 8 TB | -| Projects per workspace | 1000 | -| Protected branches | 1 | -| Root branches | 3 | -| Unarchived branches | 10 | -| Snapshots | 10 | -| Maximum history retention | 35 days | -| Minimum scale-to-zero time | 60 seconds | - -## Long-Running Operations - -All create, update, and delete operations return a long-running operation (LRO). Use `.wait()` in the SDK to block until completion: - -```python -# Start operation -operation = w.postgres.create_project(...) - -# Wait for completion -result = operation.wait() - -# Or check status manually -op_status = w.postgres.get_operation(name=operation.name) -print(f"Done: {op_status.done}") -``` diff --git a/databricks-skills/databricks-lakebase-autoscale/references/branches.md b/databricks-skills/databricks-lakebase-autoscale/references/branches.md new file mode 100644 index 00000000..d5092b4b --- /dev/null +++ b/databricks-skills/databricks-lakebase-autoscale/references/branches.md @@ -0,0 +1,133 @@ +# Lakebase Autoscaling โ€” Branches (deep dive) + +Deep dive for the Branches concept. Basic CLI is in [SKILL.md](../SKILL.md). + +## How Branching Works + +A branch is a logical Postgres instance whose storage is a **copy-on-write** fork of its parent at a specific LSN (point in the parent's WAL history). Reads hit the shared base until a page is modified; writes create branch-local copies. This is why branches are cheap to create and diverge gradually. + +Consequences: +- Creating a branch is nearly instant regardless of parent size. +- Storage grows with write volume on the branch, not with the parent's size. 
+- Resetting a branch drops its CoW layer and re-points at the parent's current state. + +## Branch Sources + +When creating a branch you pick a source LSN implicitly: + +| Option | `spec` field | Use case | +|--------|--------------|----------| +| Current data | `source_branch` only | Dev/test with up-to-date data | +| Past data | `source_branch` + `source_lsn` or `source_time` | Point-in-time recovery, reproduce a bug | + +Past-data branching is bounded by the project's `history_retention_seconds` (default 7 days, max 35). + +## TTL & Permanence + +Branches are either ephemeral (TTL) or permanent (`no_expiry: true`). Max TTL is 30 days from creation. You **cannot** set TTL on: +- Protected branches +- The default branch (`production`) +- Branches that have children + +When a TTL branch expires, its endpoints and data are deleted. + +## Protection + +A protected branch cannot be deleted, reset, or archived. Only 1 branch per project can be protected. Typically this is `production`. Protection is stored on the branch spec and toggled with `update-branch`. + +## Reset + +`reset-branch` replaces a branch's CoW layer with a fresh fork from its parent's current head. Effect: +- All local schema and data changes are discarded +- Active connections are interrupted briefly +- Cannot run on: root branches (no parent), protected branches, parents-of-others + +Use reset when your dev branch has drifted and you want fresh data without recreating the branch (preserves the branch name and any downstream config). + +## Constraints Cheat-Sheet + +| Action | Blocked when | +|--------|-------------| +| Delete | Has children; is protected; is default | +| Reset | Is root; has children; is protected | +| TTL/expire | Is protected; is default; has children | +| Archive | Is protected | + +## Advanced CLI + +Past-data branch from LSN: + +```bash +databricks postgres create-branch projects/my-app debug-bug-1234 \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", + "source_lsn": "0/1A2B3C4D", + "no_expiry": true}}' +``` + +Past-data branch from timestamp: + +```bash +databricks postgres create-branch projects/my-app pre-incident \ + --json '{"spec": {"source_branch": "projects/my-app/branches/production", + "source_time": "2026-04-20T14:30:00Z", + "ttl": "86400s"}}' +``` + +Extend or drop a TTL: + +```bash +# Extend to 14 days +databricks postgres update-branch projects/my-app/branches/development \ + spec.expiration --json '{"spec": {"ttl": "1209600s"}}' + +# Convert to permanent +databricks postgres update-branch projects/my-app/branches/development \ + spec.expiration --json '{"spec": {"no_expiry": true}}' +``` + +## Best Practices + +- TTL everything ephemeral โ€” dev/CI branches accumulate fast against the 10-unarchived limit. +- Protect `production` at project creation time, not "eventually". +- Prefer reset over recreate when you just need fresh data โ€” it preserves the branch name and downstream references. +- Compare schemas between branches (`pg_dump --schema-only`) before merging changes back upstream. 
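+
+A minimal sketch of that schema comparison, assuming `pg_dump` is available locally; the hosts, identity, and tokens below are placeholders for per-branch endpoint credentials (see connection-patterns.md):
+
+```python
+import difflib
+import os
+import subprocess
+
+def schema(host: str, token: str, db: str = "databricks_postgres") -> list[str]:
+    """Dump schema-only DDL from one branch endpoint (requires pg_dump on PATH)."""
+    env = dict(os.environ, PGPASSWORD=token, PGSSLMODE="require")
+    result = subprocess.run(
+        ["pg_dump", "--schema-only", "--no-owner", "-h", host,
+         "-U", "someone@example.com", "-d", db],  # placeholder identity
+        check=True, capture_output=True, text=True, env=env,
+    )
+    return result.stdout.splitlines()
+
+# Placeholder hosts and tokens; generate a credential per endpoint as usual.
+prod = schema("ep-prod-host.example.databricks.com", "<prod-token>")
+dev = schema("ep-dev-host.example.databricks.com", "<dev-token>")
+
+print("\n".join(difflib.unified_diff(prod, dev, "production", "development", lineterm="")))
+```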
+ +## Typical TTL Envelopes + +| Workload | TTL | +|----------|-----| +| CI run | 2-4 h | +| Demo | 24-48 h | +| Feature branch | 1-7 days | +| Long-lived test env | up to 30 days | + +## SDK Equivalents + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import Branch, BranchSpec, Duration, FieldMask + +w = WorkspaceClient() + +# Create (TTL or permanent) +w.postgres.create_branch( + parent="projects/my-app", + branch=Branch(spec=BranchSpec( + source_branch="projects/my-app/branches/production", + ttl=Duration(seconds=604800), # or: no_expiry=True + )), + branch_id="development", +).wait() + +# Protect +w.postgres.update_branch( + name="projects/my-app/branches/production", + branch=Branch(name="projects/my-app/branches/production", + spec=BranchSpec(is_protected=True)), + update_mask=FieldMask(field_mask=["spec.is_protected"]), +).wait() + +# Reset / delete +w.postgres.reset_branch(name="projects/my-app/branches/development").wait() +w.postgres.delete_branch(name="projects/my-app/branches/development").wait() +``` diff --git a/databricks-skills/databricks-lakebase-autoscale/references/computes.md b/databricks-skills/databricks-lakebase-autoscale/references/computes.md new file mode 100644 index 00000000..0ddea61c --- /dev/null +++ b/databricks-skills/databricks-lakebase-autoscale/references/computes.md @@ -0,0 +1,157 @@ +# Lakebase Autoscaling โ€” Computes (deep dive) + +Deep dive for Endpoints (computes). Basic CLI is in [SKILL.md](../SKILL.md). + +## What an Endpoint Is + +An endpoint is a Postgres server instance attached to one branch. Each branch has exactly one R/W endpoint (named `primary` by default) and may have additional read-only replicas. The endpoint owns the hostname clients connect to and the CU budget that determines concurrency and RAM. + +## Compute Units + +1 CU โ‰ˆ 2 GB RAM (vs ~16 GB/CU on Lakebase Provisioned โ€” the autoscaling tier trades per-unit RAM for finer scaling granularity). + +| CU | RAM | Max connections | +|----|-----|-----------------| +| 0.5 | ~1 GB | 104 | +| 1 | ~2 GB | 209 | +| 4 | ~8 GB | 839 | +| 8 | ~16 GB | 1,678 | +| 16 | ~32 GB | 3,357 | +| 32 | ~64 GB | 4,000 | +| 64 | ~128 GB | 4,000 | +| 112 | ~224 GB | 4,000 | + +Max connections flattens at 4,000 above 32 CU โ€” scale up past 32 CU for memory/CPU, not for connection headroom. + +## Sizing Categories + +| Category | Range | Behavior | +|----------|-------|----------| +| Autoscale | 0.5-32 CU | Dynamic scaling; `max โˆ’ min โ‰ค 16 CU` | +| Large fixed | 36-112 CU | Fixed size, no autoscaling | + +**Autoscaling window constraint.** The spread between `autoscaling_limit_min_cu` and `autoscaling_limit_max_cu` cannot exceed 16 CU: +- Valid: .5-4, 4-20, 8-32 +- Invalid: 0.5-32 (31.5 CU spread), 1-24 (23 CU spread) + +Set the minimum high enough to keep your working set in memory โ€” traffic that lands after a scale-up pays a cache-warm penalty until hot pages are faulted back in. + +## Scale-to-Zero + +When enabled, an endpoint suspends after an inactivity window (min 60 s, default 5 min). Default state per branch: + +| Branch | Default | +|--------|---------| +| `production` | Scale-to-zero **off** (always active) | +| Others | Scale-to-zero configurable | + +### Wake-up + +Incoming connections to a suspended endpoint trigger reactivation. Expected latency is ~100ms, but: +- First connection may see a timeout โ€” applications must retry. +- Endpoint resumes at the **minimum** of its autoscaling range; expect cache-cold performance until load ramps up. 
+- All session-scoped state is lost: in-memory stats, temp tables, prepared statements, session GUCs, active transactions. + +If your app keeps session state server-side (e.g., Postgres advisory locks, prepared statements you don't re-prepare), disable scale-to-zero. + +## Advanced CLI + +Create a read replica: + +```bash +databricks postgres create-endpoint \ + projects/my-app/branches/production ep-readonly-1 \ + --json '{"spec": {"endpoint_type": "ENDPOINT_TYPE_READ_ONLY", + "autoscaling_limit_min_cu": 1.0, + "autoscaling_limit_max_cu": 4.0}}' +``` + +Change scale-to-zero timeout (durations are strings ending in `s`): + +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/development/endpoints/primary \ + spec.suspend_timeout_duration \ + --json '{"spec": {"suspend_timeout_duration": "1800s"}}' # 30 min +``` + +Disable scale-to-zero on a non-default branch (`"0s"` = off): + +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/staging/endpoints/primary \ + spec.suspend_timeout_duration \ + --json '{"spec": {"suspend_timeout_duration": "0s"}}' +``` + +Convert from autoscale to a large fixed size (e.g., 64 CU): + +```bash +databricks postgres update-endpoint \ + projects/my-app/branches/production/endpoints/primary \ + "spec.autoscaling_limit_min_cu,spec.autoscaling_limit_max_cu" \ + --json '{"spec": {"autoscaling_limit_min_cu": 64.0, "autoscaling_limit_max_cu": 64.0}}' +``` + +## High Availability + +HA adds 1โ€“3 read secondaries across availability zones with automatic failover (hot standby, automatic primary promotion on failure). + +**Secondaries vs. read replicas:** +- **HA secondaries** share a `-ro` hostname suffix and floor at the primary's current CU. Use for failover with optional read offload. +- **Read replicas** are separate endpoints (`ENDPOINT_TYPE_READ_ONLY`) with independent sizing. Use for dedicated read scaling. + +**HA constraints:** +- Scale-to-zero is **not supported** with HA enabled. +- Autoscaling spread limit (โ‰ค16 CU) still applies. +- Secondaries autoscale independently but won't drop below the primary's current minimum. +- Total compute instances per project: 2โ€“4 (primary + 1โ€“3 secondaries). 
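+
+A sketch of splitting traffic between the primary and a read-only host (an HA secondary's `-ro` hostname or a dedicated read replica's own hostname). Hostnames, identity, and token are placeholders; credential generation works as described in connection-patterns.md:
+
+```python
+import psycopg
+
+HOSTS = {
+    # Placeholder hostnames: the primary R/W endpoint and its read-only counterpart.
+    "rw": "ep-primary.example.databricks.com",
+    "ro": "ep-primary-ro.example.databricks.com",
+}
+
+def connect(read_only: bool, token: str, user: str = "someone@example.com"):
+    """Route read-heavy work to the read-only host, everything else to the primary."""
+    return psycopg.connect(
+        host=HOSTS["ro" if read_only else "rw"],
+        dbname="databricks_postgres",
+        user=user,
+        password=token,  # short-lived OAuth token
+        sslmode="require",
+    )
+
+with connect(read_only=True, token="<oauth-token>") as conn:
+    total = conn.execute("SELECT count(*) FROM information_schema.tables").fetchone()
+```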
+ +## Sizing Guidance + +| Factor | Recommendation | +|--------|----------------| +| Query complexity | Complex analytical queries benefit from larger computes | +| Concurrent connections | Scale up until 32 CU; past that you're buying CPU/RAM, not connections | +| Working-set size | Min CU should hold your hot data in RAM | +| Latency-sensitive apps | Disable scale-to-zero or accept wake-up retries | + +## SDK Equivalents + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import ( + Endpoint, EndpointSpec, EndpointType, FieldMask, +) + +w = WorkspaceClient() + +# Create R/W endpoint +w.postgres.create_endpoint( + parent="projects/my-app/branches/production", + endpoint=Endpoint(spec=EndpointSpec( + endpoint_type=EndpointType.ENDPOINT_TYPE_READ_WRITE, + autoscaling_limit_min_cu=0.5, + autoscaling_limit_max_cu=4.0, + )), + endpoint_id="my-compute", +).wait() + +# Resize +w.postgres.update_endpoint( + name="projects/my-app/branches/production/endpoints/my-compute", + endpoint=Endpoint( + name="projects/my-app/branches/production/endpoints/my-compute", + spec=EndpointSpec(autoscaling_limit_min_cu=2.0, autoscaling_limit_max_cu=8.0), + ), + update_mask=FieldMask(field_mask=[ + "spec.autoscaling_limit_min_cu", + "spec.autoscaling_limit_max_cu", + ]), +).wait() + +# Delete +w.postgres.delete_endpoint( + name="projects/my-app/branches/production/endpoints/my-compute" +).wait() +``` diff --git a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md b/databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md similarity index 78% rename from databricks-skills/databricks-lakebase-autoscale/connection-patterns.md rename to databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md index 398862b3..ca295292 100644 --- a/databricks-skills/databricks-lakebase-autoscale/connection-patterns.md +++ b/databricks-skills/databricks-lakebase-autoscale/references/connection-patterns.md @@ -1,8 +1,20 @@ -# Lakebase Autoscaling Connection Patterns +# Lakebase Autoscaling โ€” Connection Patterns (deep dive) -## Overview +Deep dive for the application-runtime connection layer. Basic credential generation and a minimal Python snippet are in [SKILL.md](../SKILL.md#credentials--connecting). -This document covers different connection patterns for Lakebase Autoscaling, from simple scripts to production applications with token refresh. +**Why this file uses the SDK and the others don't.** OAuth tokens are 1-hour TTL and must be refreshed from inside the running process โ€” shelling out to the CLI per refresh is slow, fragile, and awkward to embed in a pool. All admin operations (project, branch, endpoint, synced-table lifecycle) stay on the CLI; only runtime token rotation and connection pooling live here. + +This document covers connection patterns from simple scripts to production applications with token refresh. 
+ +## Requirements + +```python +%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy +``` + +- `databricks-sdk >= 0.81.0` โ€” required for the `w.postgres` module +- `psycopg 3.x` โ€” supports the `hostaddr` parameter for the macOS DNS workaround +- `sqlalchemy 2.x` with the `postgresql+psycopg` driver ## Authentication Methods @@ -93,7 +105,7 @@ class LakebaseAutoscaleConnectionManager: database_name: str = "databricks_postgres", pool_size: int = 5, max_overflow: int = 10, - token_refresh_seconds: int = 3000 # 50 minutes + token_refresh_seconds: int = 2700 # 45 minutes ): self.project_id = project_id self.branch_id = branch_id @@ -296,9 +308,47 @@ conn = psycopg.connect(**conn_params) ## Best Practices 1. **Always use SSL**: Set `sslmode=require` in all connections -2. **Implement token refresh**: Tokens expire after 1 hour; refresh at 50 minutes +2. **Implement token refresh**: Tokens expire after 1 hour; refresh every 45 minutes 3. **Use connection pooling**: Avoid creating new connections per request 4. **Handle DNS issues on macOS**: Use the `hostaddr` workaround if needed 5. **Close connections properly**: Use context managers or explicit cleanup -6. **Handle scale-to-zero wake-up**: First connection after idle may take 2-5 seconds +6. **Handle scale-to-zero wake-up**: First connection after idle may take ~100ms; implement retry logic 7. **Log token refresh events**: Helps debug authentication issues + +## Data API (Autoscaling only) + +A PostgREST-compatible HTTP CRUD interface โ€” no Postgres driver required. Enable in the project UI (auto-creates an `authenticator` role and `pgrst` schema). + +All requests require a Databricks OAuth bearer token: + +```bash +TOKEN=$(databricks postgres generate-database-credential \ + projects/my-app/branches/production/endpoints/primary | jq -r '.token') +DATA_API_URL="https:///api/2.0/lakebase/projects/my-app/data" +``` + +```bash +# GET โ€” filter, paginate, order +curl -H "Authorization: Bearer $TOKEN" \ + "$DATA_API_URL/public/users?age=gt.21&limit=10&order=created_at.desc" + +# POST โ€” insert +curl -X POST -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"name": "Alice", "email": "alice@example.com"}' \ + "$DATA_API_URL/public/users" + +# PATCH โ€” update (filter is required) +curl -X PATCH -H "Authorization: Bearer $TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"status": "active"}' \ + "$DATA_API_URL/public/users?id=eq.42" + +# DELETE โ€” filter is required +curl -X DELETE -H "Authorization: Bearer $TOKEN" \ + "$DATA_API_URL/public/users?id=eq.42" +``` + +**Row-level security** with `current_user` policies is strongly recommended for multi-tenant apps. + +**Unsupported:** computed relationships, inner-join embedding, custom media types, transaction control headers, EXPLAIN/trace, pre-request functions, PostGIS auto-GeoJSON. diff --git a/databricks-skills/databricks-lakebase-autoscale/references/projects.md b/databricks-skills/databricks-lakebase-autoscale/references/projects.md new file mode 100644 index 00000000..ca629312 --- /dev/null +++ b/databricks-skills/databricks-lakebase-autoscale/references/projects.md @@ -0,0 +1,122 @@ +# Lakebase Autoscaling โ€” Projects (deep dive) + +Deep dive for the Projects concept. Basic CLI is in [SKILL.md](../SKILL.md). + +## What a Project Is + +A project is the top-level isolation boundary: its own Postgres major version, compute defaults, restore window, and hierarchical namespace for branches/endpoints/databases/roles. 
 Projects do not share storage or compute with each other.
+
+A freshly created project contains:
+- Default branch `production` (cannot be deleted; default scale-to-zero **off**)
+- Primary R/W endpoint named `primary` (1 CU min/max by default, autoscaling on)
+- Database `databricks_postgres`
+- A Postgres role mapped to the creator's Databricks identity
+
+## Naming Rules
+
+Path: `projects/{project_id}`. The `project_id` must be 1-63 chars, `[a-z0-9-]`, no leading/trailing hyphen, and is immutable.
+
+## Defaults & Settings
+
+| Setting | Default | Configurable range |
+|---------|---------|--------------------|
+| Postgres version | 17 | 16 or 17 |
+| Scale-to-zero (production) | off | off always; cannot be enabled on default branch |
+| Scale-to-zero (other branches) | 5 min | ≥ 60 s |
+| Primary endpoint CU range | 1-1 (creator can override) | 0.5-32 (autoscale) or 36-112 (fixed) |
+| Restore window | 1 day | 2-35 days |
+
+Longer restore windows increase storage cost.
+
+## Limits
+
+| Resource | Limit |
+|----------|-------|
+| Projects per workspace | 1,000 |
+| Concurrently active computes per project | 20 |
+| Branches per project | 500 |
+| Unarchived branches | 10 |
+| Root branches | 3 |
+| Protected branches | 1 |
+| Roles / databases per branch | 500 / 500 |
+| Logical data size per branch | 8 TB |
+| Snapshots | 10 |
+| Max history retention | 35 days |
+| Min scale-to-zero time | 60 s |
+
+## Long-Running Operations
+
+Every create/update/delete returns an LRO. The CLI blocks until `done=true` by default; programmatic callers can poll:
+
+```bash
+# Inspect running or completed operations (the default CLI behavior is to block until done)
+databricks postgres list-operations projects/my-app
+databricks postgres get-operation projects/my-app/operations/op-
+```
+
+## Advanced CLI
+
+Full update with multiple fields (durations are strings ending in `s`, like `"2592000s"`):
+
+```bash
+databricks postgres update-project projects/my-app \
+  "spec.display_name,spec.history_retention_duration" \
+  --json '{"spec": {"display_name": "My App (prod)",
+                    "history_retention_duration": "2592000s"}}' # 30 days
+```
+
+Deleting a project with active synced tables fails; drop the UC synced tables and catalogs first, then delete.
+
+## Field Reference
+
+Returned fields on `get-project` live under `status`, **not** `spec`:
+
+```
+status.display_name
+status.pg_version
+status.history_retention_duration # e.g. "86400s"
+status.enable_pg_native_login
+status.owner
+status.default_endpoint_settings.autoscaling_limit_min_cu
+status.default_endpoint_settings.autoscaling_limit_max_cu
+status.default_endpoint_settings.suspend_timeout_duration # scale-to-zero; "0s" = off
+create_time
+update_time
+uid
+```
+
+## SDK Equivalents
+
+Use these only inside Python automation where shelling out to the CLI is awkward. Requires `databricks-sdk>=0.81.0`.
+ +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import Project, ProjectSpec, FieldMask + +w = WorkspaceClient() + +# Create +w.postgres.create_project( + project=Project(spec=ProjectSpec(display_name="My App", pg_version="17")), + project_id="my-app", +).wait() + +# Get / list +w.postgres.get_project(name="projects/my-app") +list(w.postgres.list_projects()) + +# Update +w.postgres.update_project( + name="projects/my-app", + project=Project(name="projects/my-app", + spec=ProjectSpec(display_name="Renamed App")), + update_mask=FieldMask(field_mask=["spec.display_name"]), +).wait() + +# Delete +w.postgres.delete_project(name="projects/my-app").wait() + +# Poll an LRO manually instead of .wait() +op = w.postgres.create_project(...) +w.postgres.get_operation(name=op.name).done +``` diff --git a/databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md b/databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md new file mode 100644 index 00000000..49720ff2 --- /dev/null +++ b/databricks-skills/databricks-lakebase-autoscale/references/reverse-etl.md @@ -0,0 +1,148 @@ +# Lakebase Autoscaling โ€” Reverse ETL (deep dive) + +Deep dive for synced tables. Basic CLI is in [SKILL.md](../SKILL.md). + +## How Synced Tables Work + +A synced table is a managed replica of a Unity Catalog Delta table, materialized into Lakebase as a queryable Postgres table. Two artifacts are created: + +1. A **Unity Catalog synced table** (read-only from the user's perspective, owned by the sync pipeline) +2. A **Postgres table** inside the Lakebase database, updated by the same pipeline + +The pipeline is a managed Lakeflow Spark Declarative Pipeline that runs on Databricks compute โ€” the user doesn't provision it directly. + +## Performance Envelope + +| Mode | Throughput (per CU) | Latency | +|------|---------------------|---------| +| Snapshot (initial load) | ~2,000 rows/s | minutes-hours depending on size | +| Triggered / Continuous (incremental) | ~150 rows/s | seconds (Continuous); scheduled (Triggered) | + +Each synced table uses **up to 16 connections** on the target endpoint. Account for this when sizing concurrency-heavy apps alongside reverse ETL on the same endpoint. + +## Sync Modes + +| Mode | Mechanism | Best for | Requires CDF | +|------|-----------|----------|--------------| +| `SNAPSHOT` | One-shot full copy | Initial load, historical analysis, small reference tables; use when you'd modify >10% of data anyway | No | +| `TRIGGERED` | Scheduled incremental updates | Dashboards refreshed hourly/daily | Yes | +| `CONTINUOUS` | Streaming, min 15 s intervals | Real-time features, live apps (highest cost) | Yes | + +Enable CDF on the source before creating TRIGGERED or CONTINUOUS synced tables: + +```sql +ALTER TABLE your_catalog.your_schema.your_table +SET TBLPROPERTIES (delta.enableChangeDataFeed = true) +``` + +## Data Type Mapping (UC โ†’ Postgres) + +| Unity Catalog | Postgres | +|---------------|----------| +| BIGINT | BIGINT | +| BINARY | BYTEA | +| BOOLEAN | BOOLEAN | +| DATE | DATE | +| DECIMAL(p,s) | NUMERIC | +| DOUBLE | DOUBLE PRECISION | +| FLOAT | REAL | +| INT | INTEGER | +| INTERVAL | INTERVAL | +| SMALLINT | SMALLINT | +| STRING | TEXT | +| TIMESTAMP | TIMESTAMP WITH TIME ZONE | +| TIMESTAMP_NTZ | TIMESTAMP WITHOUT TIME ZONE | +| TINYINT | SMALLINT | +| ARRAY / MAP / STRUCT | JSONB | + +**Unsupported:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT. 
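+
+One practical consequence of the mapping: ARRAY, MAP, and STRUCT columns land as JSONB, so applications read them with Postgres JSON operators. A small illustration with hypothetical table and column names (connection parameters as in connection-patterns.md):
+
+```python
+import psycopg
+
+# Placeholder connection parameters; build them as in connection-patterns.md.
+conn_params = dict(host="<endpoint-host>", dbname="databricks_postgres",
+                   user="<identity>", password="<oauth-token>", sslmode="require")
+
+with psycopg.connect(**conn_params) as conn:
+    # `address` was a STRUCT in the source Delta table, so it arrives here as JSONB.
+    row = conn.execute(
+        "SELECT user_id, address->>'city' AS city "
+        "FROM public.user_profiles WHERE user_id = %s",
+        ("u-123",),
+    ).fetchone()
+```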
+ +## Naming & Schema Rules + +- Database, schema, and table names: `[A-Za-z0-9_]+` only. +- Schema evolution on TRIGGERED/CONTINUOUS: **additive only** (new columns). Breaking changes require dropping and re-creating the synced table. +- Primary key column(s) on the source are required and must be unique. + +## Capacity Planning + +| Dimension | Limit / guideline | +|-----------|-------------------| +| Connections used per synced table | up to 16 | +| Total synced-table size per branch | 2 TB | +| Recommended size per synced table | < 1 TB | +| CDF retention on source | Longer than the sync pipeline's commit cadence | + +## Deletion + +Synced tables exist in two places โ€” drop both: + +1. Remove the UC synced table (Catalog Explorer, CLI, or SDK). +2. Drop the Postgres table to free branch storage: + +```sql +DROP TABLE your_database.your_schema.your_table; +``` + +## Best Practices + +- Enable CDF **before** creating TRIGGERED/CONTINUOUS tables, not after โ€” the initial snapshot won't include incremental updates otherwise. +- Pick the cheapest mode that meets latency: most dashboards are fine on TRIGGERED hourly. +- Index Postgres targets for your query patterns โ€” synced tables arrive without indexes. +- Monitor pipeline status (`data_synchronization_status`) โ€” it surfaces backpressure and schema mismatches before they affect readers. +- Keep reverse-ETL pipelines on their own branch (or at least a sized-up endpoint) when running alongside OLTP traffic. + +## Example Use Cases + +Product catalog for a web app (hourly refresh is plenty): + +```bash +databricks postgres create-synced-table ecommerce_catalog.public.products \ + --json '{"spec": {"source_table_full_name": "gold.products.catalog", + "primary_key_columns": ["product_id"], + "scheduling_policy": "TRIGGERED"}}' +``` + +Real-time feature serving for ML (needs CDF on `ml.features.user_features`): + +```bash +databricks postgres create-synced-table ml_catalog.public.user_features \ + --json '{"spec": {"source_table_full_name": "ml.features.user_features", + "primary_key_columns": ["user_id"], + "scheduling_policy": "CONTINUOUS"}}' +``` + +## Lakehouse Sync (Beta โ€” AWS only) + +Reverse direction: continuously streams Postgres row changes from Lakebase into Unity Catalog Delta tables via CDC. Enable via the project UI. Azure support TBD. 
+ +## SDK Equivalents + +Synced tables use the `w.postgres` SDK module (CLI v0.294.0+): + +```python +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.postgres import ( + SyncedTable, SyncedTableSpec, + NewPipelineSpec, SyncedTableSchedulingPolicy, +) + +w = WorkspaceClient() + +w.postgres.create_synced_table( + SyncedTable( + name="lakebase_catalog.schema.synced_table", + spec=SyncedTableSpec( + source_table_full_name="analytics.gold.user_profiles", + primary_key_columns=["user_id"], + scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, + new_pipeline_spec=NewPipelineSpec( + storage_catalog="lakebase_catalog", + storage_schema="staging", + ), + ), + ) +) + +status = w.postgres.get_synced_table(name="synced_tables/lakebase_catalog.schema.synced_table") +print(status.data_synchronization_status.detailed_state) +``` diff --git a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md b/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md deleted file mode 100644 index f983eebb..00000000 --- a/databricks-skills/databricks-lakebase-autoscale/reverse-etl.md +++ /dev/null @@ -1,177 +0,0 @@ -# Reverse ETL with Lakebase Autoscaling - -## Overview - -Reverse ETL allows you to sync data from Unity Catalog Delta tables into Lakebase Autoscaling as PostgreSQL tables. This enables OLTP access patterns on data processed in the Lakehouse. - -## How It Works - -Synced tables create a managed copy of Unity Catalog data in Lakebase: - -1. A new Unity Catalog table (read-only, managed by the sync pipeline) -2. A Postgres table in Lakebase (queryable by applications) - -The sync pipeline uses managed Lakeflow Spark Declarative Pipelines to continuously update both tables. - -### Performance - -- **Continuous writes:** ~1,200 rows/sec per CU -- **Bulk writes:** ~15,000 rows/sec per CU -- **Connections used:** Up to 16 per synced table - -## Sync Modes - -| Mode | Description | Best For | Notes | -|------|-------------|----------|-------| -| **Snapshot** | One-time full copy | Initial setup, historical analysis | 10x more efficient if modifying >10% of data | -| **Triggered** | Scheduled updates on demand | Dashboards updated hourly/daily | Requires CDF on source table | -| **Continuous** | Real-time streaming (seconds of latency) | Live applications | Highest cost, minimum 15s intervals, requires CDF | - -**Note:** Triggered and Continuous modes require Change Data Feed (CDF) enabled on the source table: - -```sql -ALTER TABLE your_catalog.your_schema.your_table -SET TBLPROPERTIES (delta.enableChangeDataFeed = true) -``` - -## Creating Synced Tables - -### Using Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, - SyncedTableSpec, - NewPipelineSpec, - SyncedTableSchedulingPolicy, -) - -w = WorkspaceClient() - -# Create a synced table -synced_table = w.database.create_synced_database_table( - SyncedDatabaseTable( - name="lakebase_catalog.schema.synced_table", - spec=SyncedTableSpec( - source_table_full_name="analytics.gold.user_profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - new_pipeline_spec=NewPipelineSpec( - storage_catalog="lakebase_catalog", - storage_schema="staging" - ) - ), - ) -) -print(f"Created synced table: {synced_table.name}") -``` - -### Using CLI - -```bash -databricks database create-synced-database-table \ - --json '{ - "name": "lakebase_catalog.schema.synced_table", - "spec": { - "source_table_full_name": 
"analytics.gold.user_profiles", - "primary_key_columns": ["user_id"], - "scheduling_policy": "TRIGGERED", - "new_pipeline_spec": { - "storage_catalog": "lakebase_catalog", - "storage_schema": "staging" - } - } - }' -``` - -## Checking Synced Table Status - -```python -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") -print(f"State: {status.data_synchronization_status.detailed_state}") -print(f"Message: {status.data_synchronization_status.message}") -``` - -## Deleting a Synced Table - -Delete from both Unity Catalog and Postgres: - -1. **Unity Catalog:** Delete from Catalog Explorer or SDK -2. **Postgres:** Drop the table to free storage - -```sql -DROP TABLE your_database.your_schema.your_table; -``` - -## Data Type Mapping - -| Unity Catalog Type | Postgres Type | -|-------------------|---------------| -| BIGINT | BIGINT | -| BINARY | BYTEA | -| BOOLEAN | BOOLEAN | -| DATE | DATE | -| DECIMAL(p,s) | NUMERIC | -| DOUBLE | DOUBLE PRECISION | -| FLOAT | REAL | -| INT | INTEGER | -| INTERVAL | INTERVAL | -| SMALLINT | SMALLINT | -| STRING | TEXT | -| TIMESTAMP | TIMESTAMP WITH TIME ZONE | -| TIMESTAMP_NTZ | TIMESTAMP WITHOUT TIME ZONE | -| TINYINT | SMALLINT | -| ARRAY | JSONB | -| MAP | JSONB | -| STRUCT | JSONB | - -**Unsupported types:** GEOGRAPHY, GEOMETRY, VARIANT, OBJECT - -## Capacity Planning - -- **Connection usage:** Each synced table uses up to 16 connections -- **Size limits:** 2 TB total across all synced tables; recommend < 1 TB per table -- **Naming:** Database, schema, and table names only allow `[A-Za-z0-9_]+` -- **Schema evolution:** Only additive changes (e.g., adding columns) for Triggered/Continuous modes - -## Use Cases - -### Product Catalog for Web App - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ecommerce_catalog.public.products", - spec=SyncedTableSpec( - source_table_full_name="gold.products.catalog", - primary_key_columns=["product_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -``` - -### Real-time Feature Serving - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ml_catalog.public.user_features", - spec=SyncedTableSpec( - source_table_full_name="ml.features.user_features", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -``` - -## Best Practices - -1. **Enable CDF** on source tables before creating Triggered or Continuous synced tables -2. **Choose appropriate sync mode**: Snapshot for small tables, Triggered for hourly/daily, Continuous for real-time -3. **Monitor sync status**: Check for failures and latency via Catalog Explorer -4. **Index target tables**: Create appropriate indexes in Postgres for your query patterns -5. **Handle schema changes**: Only additive changes are supported for streaming modes -6. **Account for connection limits**: Each synced table uses up to 16 connections diff --git a/databricks-skills/databricks-lakebase-provisioned/SKILL.md b/databricks-skills/databricks-lakebase-provisioned/SKILL.md deleted file mode 100644 index 7548219c..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/SKILL.md +++ /dev/null @@ -1,352 +0,0 @@ ---- -name: databricks-lakebase-provisioned -description: "Patterns and best practices for Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. 
Use when creating Lakebase instances, connecting applications or Databricks Apps to PostgreSQL, implementing reverse ETL via synced tables, storing agent or chat memory, or configuring OAuth authentication for Lakebase." ---- - -# Lakebase Provisioned - -Patterns and best practices for using Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. - -## When to Use - -Use this skill when: -- Building applications that need a PostgreSQL database for transactional workloads -- Adding persistent state to Databricks Apps -- Implementing reverse ETL from Delta Lake to an operational database -- Storing chat/agent memory for LangChain applications - -## Overview - -Lakebase Provisioned is Databricks' managed PostgreSQL database service for OLTP (Online Transaction Processing) workloads. It provides a fully managed PostgreSQL-compatible database that integrates with Unity Catalog and supports OAuth token-based authentication. - -| Feature | Description | -|---------|-------------| -| **Managed PostgreSQL** | Fully managed instances with automatic provisioning | -| **OAuth Authentication** | Token-based auth via Databricks SDK (1-hour expiry) | -| **Unity Catalog** | Register databases for governance | -| **Reverse ETL** | Sync data from Delta tables to PostgreSQL | -| **Apps Integration** | First-class support in Databricks Apps | - -**Available Regions (AWS):** us-east-1, us-east-2, us-west-2, eu-central-1, eu-west-1, ap-south-1, ap-southeast-1, ap-southeast-2 - -## Quick Start - -Create and connect to a Lakebase Provisioned instance: - -```python -from databricks.sdk import WorkspaceClient -import uuid - -# Initialize client -w = WorkspaceClient() - -# Create a database instance -instance = w.database.create_database_instance( - name="my-lakebase-instance", - capacity="CU_1", # CU_1, CU_2, CU_4, CU_8 - stopped=False -) -print(f"Instance created: {instance.name}") -print(f"DNS endpoint: {instance.read_write_dns}") -``` - -## Common Patterns - -### Generate OAuth Token - -```python -from databricks.sdk import WorkspaceClient -import uuid - -w = WorkspaceClient() - -# Generate OAuth token for database connection -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) -token = cred.token # Use this as password in connection string -``` - -### Connect from Notebook - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -# Get instance details -w = WorkspaceClient() -instance = w.database.get_database_instance(name="my-lakebase-instance") - -# Generate token -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) - -# Connect using psycopg3 -conn_string = f"host={instance.read_write_dns} dbname=postgres user={w.current_user.me().user_name} password={cred.token} sslmode=require" -with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) -``` - -### SQLAlchemy with Token Refresh (Production) - -For long-running applications, tokens must be refreshed (expire after 1 hour): - -```python -import asyncio -import os -import uuid -from sqlalchemy import event -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession -from sqlalchemy.orm import sessionmaker -from databricks.sdk import WorkspaceClient - -# Token refresh state -_current_token = None -_token_refresh_task = None -TOKEN_REFRESH_INTERVAL = 50 * 60 # 50 minutes (before 1-hour 
expiry) - -def _generate_token(instance_name: str) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - return cred.token - -async def _token_refresh_loop(instance_name: str): - """Background task to refresh token every 50 minutes.""" - global _current_token - while True: - await asyncio.sleep(TOKEN_REFRESH_INTERVAL) - _current_token = await asyncio.to_thread(_generate_token, instance_name) - -def init_database(instance_name: str, database_name: str, username: str) -> AsyncEngine: - """Initialize database with OAuth token injection.""" - global _current_token - - w = WorkspaceClient() - instance = w.database.get_database_instance(name=instance_name) - - # Generate initial token - _current_token = _generate_token(instance_name) - - # Build URL (password injected via do_connect) - url = f"postgresql+psycopg://{username}@{instance.read_write_dns}:5432/{database_name}" - - engine = create_async_engine( - url, - pool_size=5, - max_overflow=10, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on each connection - @event.listens_for(engine.sync_engine, "do_connect") - def provide_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = _current_token - - return engine -``` - -### Databricks Apps Integration - -For Databricks Apps, use environment variables for configuration: - -```python -# Environment variables set by Databricks Apps: -# - LAKEBASE_INSTANCE_NAME: Instance name -# - LAKEBASE_DATABASE_NAME: Database name -# - LAKEBASE_USERNAME: Username (optional, defaults to service principal) - -import os - -def is_lakebase_configured() -> bool: - """Check if Lakebase is configured for this app.""" - return bool( - os.environ.get("LAKEBASE_PG_URL") or - (os.environ.get("LAKEBASE_INSTANCE_NAME") and - os.environ.get("LAKEBASE_DATABASE_NAME")) - ) -``` - -Add Lakebase as an app resource via CLI: - -```bash -databricks apps add-resource $APP_NAME \ - --resource-type database \ - --resource-name lakebase \ - --database-instance my-lakebase-instance -``` - -### Register with Unity Catalog - -```python -from databricks.sdk import WorkspaceClient - -w = WorkspaceClient() - -# Register database in Unity Catalog -w.database.register_database_instance( - name="my-lakebase-instance", - catalog="my_catalog", - schema="my_schema" -) -``` - -### MLflow Model Resources - -Declare Lakebase as a model resource for automatic credential provisioning: - -```python -from mlflow.models.resources import DatabricksLakebase - -resources = [ - DatabricksLakebase(database_instance_name="my-lakebase-instance"), -] - -# When logging model -mlflow.langchain.log_model( - model, - artifact_path="model", - resources=resources, - pip_requirements=["databricks-langchain[memory]"] -) -``` - -## MCP Tools - -The following MCP tools are available for managing Lakebase infrastructure. Use `type="provisioned"` for Lakebase Provisioned. 
- -### manage_lakebase_database - Database Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create or update a database | name | -| `get` | Get database details | name | -| `list` | List all databases | (none, optional type filter) | -| `delete` | Delete database and resources | name | - -**Example usage:** -```python -# Create a provisioned database -manage_lakebase_database( - action="create_or_update", - name="my-lakebase-instance", - type="provisioned", - capacity="CU_1" -) - -# Get database details -manage_lakebase_database(action="get", name="my-lakebase-instance", type="provisioned") - -# List all databases -manage_lakebase_database(action="list") - -# Delete with cascade -manage_lakebase_database(action="delete", name="my-lakebase-instance", type="provisioned", force=True) -``` - -### manage_lakebase_sync - Reverse ETL - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Set up reverse ETL from Delta to Lakebase | instance_name, source_table_name, target_table_name | -| `delete` | Remove synced table (and optionally catalog) | table_name | - -**Example usage:** -```python -# Set up reverse ETL -manage_lakebase_sync( - action="create_or_update", - instance_name="my-lakebase-instance", - source_table_name="catalog.schema.delta_table", - target_table_name="lakebase_catalog.schema.postgres_table", - scheduling_policy="TRIGGERED" # or SNAPSHOT, CONTINUOUS -) - -# Delete synced table -manage_lakebase_sync(action="delete", table_name="lakebase_catalog.schema.postgres_table") -``` - -### generate_lakebase_credential - OAuth Tokens - -Generate OAuth token (~1hr) for PostgreSQL connections. Use as password with `sslmode=require`. 
- -```python -# For provisioned instances -generate_lakebase_credential(instance_names=["my-lakebase-instance"]) -``` - -## Reference Files - -- [connection-patterns.md](connection-patterns.md) - Detailed connection patterns for different use cases -- [reverse-etl.md](reverse-etl.md) - Syncing data from Delta Lake to Lakebase - -## CLI Quick Reference - -```bash -# Create instance -databricks database create-database-instance \ - --name my-lakebase-instance \ - --capacity CU_1 - -# Get instance details -databricks database get-database-instance --name my-lakebase-instance - -# Generate credentials -databricks database generate-database-credential \ - --request-id $(uuidgen) \ - --json '{"instance_names": ["my-lakebase-instance"]}' - -# List instances -databricks database list-database-instances - -# Stop instance (saves cost) -databricks database stop-database-instance --name my-lakebase-instance - -# Start instance -databricks database start-database-instance --name my-lakebase-instance -``` - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Token expired during long query** | Implement token refresh loop (see SQLAlchemy with Token Refresh section); tokens expire after 1 hour | -| **DNS resolution fails on macOS** | Use `dig` command to resolve hostname, pass `hostaddr` to psycopg | -| **Connection refused** | Ensure instance is not stopped; check `instance.state` | -| **Permission denied** | User must be granted access to the Lakebase instance | -| **SSL required error** | Always use `sslmode=require` in connection string | - -## SDK Version Requirements - -- **Databricks SDK for Python**: >= 0.61.0 (0.81.0+ recommended for full API support) -- **psycopg**: 3.x (supports `hostaddr` parameter for DNS workaround) -- **SQLAlchemy**: 2.x with `postgresql+psycopg` driver - -```python -%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy -``` - -## Notes - -- **Capacity values** use compute unit sizing: `CU_1`, `CU_2`, `CU_4`, `CU_8`. -- **Lakebase Autoscaling** is a newer offering with automatic scaling but limited regional availability. This skill focuses on **Lakebase Provisioned** which is more widely available. -- For memory/state in LangChain agents, use `databricks-langchain[memory]` which includes Lakebase support. -- Tokens are short-lived (1 hour) - production apps MUST implement token refresh. - -## Related Skills - -- **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence -- **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend -- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for instance management and token generation -- **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps with Lakebase resources -- **[databricks-jobs](../databricks-jobs/SKILL.md)** - scheduling reverse ETL sync jobs diff --git a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md b/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md deleted file mode 100644 index e6843548..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md +++ /dev/null @@ -1,279 +0,0 @@ -# Lakebase Connection Patterns - -## Overview - -This document covers different connection patterns for Lakebase Provisioned, from simple scripts to production applications with token refresh. - -## Connection Methods - -### 1. 
Direct psycopg Connection (Simple Scripts) - -For one-off scripts or notebooks: - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -def get_connection(instance_name: str, database_name: str = "postgres"): - """Get a database connection with fresh OAuth token.""" - w = WorkspaceClient() - - # Get instance details - instance = w.database.get_database_instance(name=instance_name) - - # Generate OAuth token (valid for 1 hour) - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - - # Build connection string - conn_string = ( - f"host={instance.read_write_dns} " - f"dbname={database_name} " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" - ) - - return psycopg.connect(conn_string) - -# Usage -with get_connection("my-instance") as conn: - with conn.cursor() as cur: - cur.execute("SELECT NOW()") - print(cur.fetchone()) -``` - -### 2. Connection Pool with Token Refresh (Production) - -For long-running applications that need connection pooling: - -```python -import asyncio -import uuid -from contextlib import asynccontextmanager -from typing import AsyncGenerator, Optional - -from sqlalchemy import event -from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker -from databricks.sdk import WorkspaceClient - -class LakebaseConnectionManager: - """Manages Lakebase connections with automatic token refresh.""" - - def __init__( - self, - instance_name: str, - database_name: str, - pool_size: int = 5, - max_overflow: int = 10, - token_refresh_seconds: int = 3000 # 50 minutes - ): - self.instance_name = instance_name - self.database_name = database_name - self.pool_size = pool_size - self.max_overflow = max_overflow - self.token_refresh_seconds = token_refresh_seconds - - self._current_token: Optional[str] = None - self._refresh_task: Optional[asyncio.Task] = None - self._engine = None - self._session_maker = None - - def _generate_token(self) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[self.instance_name] - ) - return cred.token - - async def _refresh_loop(self): - """Background task to refresh token periodically.""" - while True: - await asyncio.sleep(self.token_refresh_seconds) - try: - self._current_token = await asyncio.to_thread(self._generate_token) - except Exception as e: - print(f"Token refresh failed: {e}") - - def initialize(self): - """Initialize database engine and start token refresh.""" - w = WorkspaceClient() - - # Get instance info - instance = w.database.get_database_instance(name=self.instance_name) - username = w.current_user.me().user_name - - # Generate initial token - self._current_token = self._generate_token() - - # Create engine (password injected via event) - url = ( - f"postgresql+psycopg://{username}@" - f"{instance.read_write_dns}:5432/{self.database_name}" - ) - - self._engine = create_async_engine( - url, - pool_size=self.pool_size, - max_overflow=self.max_overflow, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on connect - @event.listens_for(self._engine.sync_engine, "do_connect") - def inject_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = self._current_token - - self._session_maker = async_sessionmaker( - self._engine, - class_=AsyncSession, - expire_on_commit=False - ) - - def start_refresh(self): - """Start background 
token refresh task.""" - if not self._refresh_task: - self._refresh_task = asyncio.create_task(self._refresh_loop()) - - async def stop_refresh(self): - """Stop token refresh task.""" - if self._refresh_task: - self._refresh_task.cancel() - try: - await self._refresh_task - except asyncio.CancelledError: - pass - self._refresh_task = None - - @asynccontextmanager - async def session(self) -> AsyncGenerator[AsyncSession, None]: - """Get a database session.""" - async with self._session_maker() as session: - yield session - - async def close(self): - """Close all connections.""" - await self.stop_refresh() - if self._engine: - await self._engine.dispose() - -# Usage in FastAPI -from fastapi import FastAPI - -app = FastAPI() -db_manager = LakebaseConnectionManager("my-instance", "my_database") - -@app.on_event("startup") -async def startup(): - db_manager.initialize() - db_manager.start_refresh() - -@app.on_event("shutdown") -async def shutdown(): - await db_manager.close() - -@app.get("/data") -async def get_data(): - async with db_manager.session() as session: - result = await session.execute("SELECT * FROM my_table") - return result.fetchall() -``` - -### 3. Static URL Mode (Local Development) - -For local development, use a static connection URL: - -```python -import os -from sqlalchemy.ext.asyncio import create_async_engine - -# Set environment variable with full connection URL -# LAKEBASE_PG_URL=postgresql://user:password@host:5432/database - -def get_database_url() -> str: - """Get database URL from environment.""" - url = os.environ.get("LAKEBASE_PG_URL") - if url and url.startswith("postgresql://"): - # Convert to psycopg3 async driver - url = url.replace("postgresql://", "postgresql+psycopg://", 1) - return url - -engine = create_async_engine( - get_database_url(), - pool_size=5, - connect_args={"sslmode": "require"} -) -``` - -### 4. DNS Resolution Workaround (macOS) - -Python's `socket.getaddrinfo()` fails with long hostnames on macOS. Use `dig` as fallback: - -```python -import subprocess -import socket - -def resolve_hostname(hostname: str) -> str: - """Resolve hostname using dig command (macOS workaround).""" - try: - # Try Python's resolver first - return socket.gethostbyname(hostname) - except socket.gaierror: - pass - - # Fallback to dig command - try: - result = subprocess.run( - ["dig", "+short", hostname], - capture_output=True, - text=True, - timeout=5 - ) - ips = result.stdout.strip().split('\n') - for ip in ips: - if ip and not ip.startswith(';'): - return ip - except Exception: - pass - - raise RuntimeError(f"Could not resolve hostname: {hostname}") - -# Use with psycopg -conn_params = { - "host": hostname, # For TLS SNI - "hostaddr": resolve_hostname(hostname), # Actual IP - "dbname": database_name, - "user": username, - "password": token, - "sslmode": "require" -} -conn = psycopg.connect(**conn_params) -``` - -## Environment Variables - -| Variable | Description | Required | -|----------|-------------|----------| -| `LAKEBASE_PG_URL` | Static PostgreSQL URL (local dev) | Either this OR instance/database | -| `LAKEBASE_INSTANCE_NAME` | Lakebase instance name | With DATABASE_NAME | -| `LAKEBASE_DATABASE_NAME` | Database name | With INSTANCE_NAME | -| `LAKEBASE_USERNAME` | Override username | No | -| `LAKEBASE_HOST` | Override host | No | -| `DB_POOL_SIZE` | Connection pool size | No (default: 5) | -| `DB_MAX_OVERFLOW` | Max pool overflow | No (default: 10) | -| `DB_POOL_RECYCLE_INTERVAL` | Pool recycle seconds | No (default: 3600) | - -## Best Practices - -1. 
**Always use SSL**: Set `sslmode=require` in all connections -2. **Implement token refresh**: Tokens expire after 1 hour; refresh at 50 minutes -3. **Use connection pooling**: Avoid creating new connections per request -4. **Handle DNS issues on macOS**: Use the `hostaddr` workaround if needed -5. **Close connections properly**: Use context managers or explicit cleanup -6. **Log token refresh events**: Helps debug authentication issues diff --git a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md b/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md deleted file mode 100644 index 5b5caef4..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md +++ /dev/null @@ -1,171 +0,0 @@ -# Reverse ETL with Lakebase Provisioned - -## Overview - -Reverse ETL allows you to sync data from Unity Catalog Delta tables into Lakebase Provisioned as PostgreSQL tables. This enables OLTP access patterns on data processed in the Lakehouse. - -## Sync Modes - -| Mode | Description | Best For | Notes | -|------|-------------|----------|-------| -| **Snapshot** | One-time full copy | Initial setup, small tables | 10x more efficient if modifying >10% of data | -| **Triggered** | Scheduled updates on demand | Dashboards updated hourly/daily | Requires CDF on source table | -| **Continuous** | Real-time streaming (seconds of latency) | Live applications | Highest cost, minimum 15s intervals, requires CDF | - -**Note:** Triggered and Continuous modes require Change Data Feed (CDF) enabled on the source table: - -```sql -ALTER TABLE your_catalog.your_schema.your_table -SET TBLPROPERTIES (delta.enableChangeDataFeed = true) -``` - -## Creating Synced Tables - -### Using Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, - SyncedTableSpec, - SyncedTableSchedulingPolicy, -) - -w = WorkspaceClient() - -# Create a synced table from Unity Catalog to Lakebase Provisioned -synced_table = w.database.create_synced_database_table( - SyncedDatabaseTable( - name="lakebase_catalog.schema.synced_table", - database_instance_name="my-lakebase-instance", - spec=SyncedTableSpec( - source_table_full_name="analytics.gold.user_profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -print(f"Created synced table: {synced_table.name}") -``` - -**Key parameters:** - -| Parameter | Description | -|-----------|-------------| -| `name` | Fully qualified target table name (catalog.schema.table) | -| `database_instance_name` | Lakebase Provisioned instance name | -| `source_table_full_name` | Fully qualified source Delta table (catalog.schema.table) | -| `primary_key_columns` | List of primary key columns from the source table | -| `scheduling_policy` | `SNAPSHOT`, `TRIGGERED`, or `CONTINUOUS` | - -### Using CLI - -```bash -databricks database create-synced-database-table \ - --json '{ - "name": "lakebase_catalog.schema.synced_table", - "database_instance_name": "my-lakebase-instance", - "spec": { - "source_table_full_name": "analytics.gold.user_profiles", - "primary_key_columns": ["user_id"], - "scheduling_policy": "TRIGGERED" - } - }' -``` - -**Note:** There is no SQL syntax for creating synced tables. Use the Python SDK, CLI, or Catalog Explorer UI. 
- -## Checking Synced Table Status - -```python -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") -print(f"State: {status.data_synchronization_status.detailed_state}") -print(f"Message: {status.data_synchronization_status.message}") -``` - -## Deleting a Synced Table - -Delete from both Unity Catalog and Postgres: - -1. **Unity Catalog:** Delete via Catalog Explorer or SDK -2. **Postgres:** Drop the table to free storage - -```python -# Delete the synced table via SDK -w.database.delete_synced_database_table(name="lakebase_catalog.schema.synced_table") -``` - -```sql --- Drop the Postgres table to free storage -DROP TABLE your_database.your_schema.your_table; -``` - -## Use Cases - -### 1. Product Catalog for Web App - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ecommerce_catalog.public.products", - database_instance_name="ecommerce-db", - spec=SyncedTableSpec( - source_table_full_name="gold.products.catalog", - primary_key_columns=["product_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -# Application queries PostgreSQL directly with low-latency point lookups -``` - -### 2. User Profiles for Authentication - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="auth_catalog.public.user_profiles", - database_instance_name="auth-db", - spec=SyncedTableSpec( - source_table_full_name="gold.users.profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -``` - -### 3. Feature Store for Real-time ML - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ml_catalog.public.user_features", - database_instance_name="feature-store-db", - spec=SyncedTableSpec( - source_table_full_name="ml.features.user_features", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -# ML model queries features with low latency -``` - -## Best Practices - -1. **Enable CDF** on source tables before creating Triggered or Continuous synced tables -2. **Choose appropriate sync mode**: Snapshot for small tables or one-time loads, Triggered for hourly/daily refreshes, Continuous for real-time -3. **Monitor sync status**: Check for failures and latency via Catalog Explorer or `get_synced_database_table()` -4. **Index target tables**: Create appropriate indexes in PostgreSQL for your query patterns -5. **Handle schema changes**: Only additive changes (e.g., adding columns) are supported for Triggered/Continuous modes -6. 
**Account for connection limits**: Each synced table uses up to 16 connections - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Sync fails with CDF error** | Enable Change Data Feed on source table before using Triggered or Continuous mode | -| **Schema mismatch** | Only additive schema changes are supported; for breaking changes, delete and recreate the synced table | -| **Sync takes too long** | Switch to Triggered mode for scheduled updates; use Snapshot for initial bulk loads | -| **Target table locked** | Avoid DDL on target during sync operations | diff --git a/databricks-skills/databricks-metric-views/SKILL.md b/databricks-skills/databricks-metric-views/SKILL.md index 3cc4b427..6f5e7eb2 100644 --- a/databricks-skills/databricks-metric-views/SKILL.md +++ b/databricks-skills/databricks-metric-views/SKILL.md @@ -27,16 +27,11 @@ Use this skill when: ### Inspect Source Table Schema -Before creating a metric view, call `get_table_stats_and_schema` to understand available columns for dimensions and measures: +Before authoring a metric view, inspect the source tables. Use `discover-schema` as the default โ€” one call returns columns, types, sample rows, null counts, and row count. If you only know the schema, list tables first with `query "SHOW TABLES IN ..."`. -``` -get_table_stats_and_schema( - catalog="catalog", - schema="schema", - table_names=["orders"], - table_stat_level="SIMPLE" # Use "DETAILED" for cardinality, min/max, histograms -) -``` +`databricks experimental aitools tools discover-schema catalog.schema.orders catalog.schema.customers` + +For dimensions and measures, probe distribution beyond sampling โ€” cardinality of candidate dimensions, min/max/percentiles for measures, top categorical values. Write aggregate SQL through `databricks experimental aitools tools query --warehouse "..."`. Both commands auto-pick the default warehouse; set `DATABRICKS_WAREHOUSE_ID` or pass `--warehouse ` to override. 
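+
+As a rough illustration of that profiling step (the catalog, schema, and column names below are placeholders, not taken from a real workspace), an aggregate probe might look like:
+
+```bash
+# Hypothetical profiling probe: dimension cardinality and measure spread
+databricks experimental aitools tools query --warehouse "..." "
+SELECT
+  COUNT(DISTINCT status)              AS status_cardinality,
+  MIN(total_price)                    AS min_total_price,
+  MAX(total_price)                    AS max_total_price,
+  APPROX_PERCENTILE(total_price, 0.5) AS median_total_price
+FROM catalog.schema.orders"
+```
+
+Low-cardinality columns like `status` usually make good dimensions; the min/max/median spread helps decide whether a measure needs filtering or windowing.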
### Create a Metric View @@ -95,72 +90,88 @@ ORDER BY ALL | YAML Syntax | [yaml-reference.md](yaml-reference.md) | Complete YAML spec: dimensions, measures, joins, materialization | | Patterns & Examples | [patterns.md](patterns.md) | Common patterns: star schema, snowflake, filtered measures, window measures, ratios | -## MCP Tools - -Use the `manage_metric_views` tool for all metric view operations: - -| Action | Description | -|--------|-------------| -| `create` | Create a metric view with dimensions and measures | -| `alter` | Update a metric view's YAML definition | -| `describe` | Get the full definition and metadata | -| `query` | Query measures grouped by dimensions | -| `drop` | Drop a metric view | -| `grant` | Grant SELECT privileges to users/groups | - -### Create via MCP - -```python -manage_metric_views( - action="create", - full_name="catalog.schema.orders_metrics", - source="catalog.schema.orders", - or_replace=True, - comment="Orders KPIs for sales analysis", - filter_expr="order_date > '2020-01-01'", - dimensions=[ - {"name": "Order Month", "expr": "DATE_TRUNC('MONTH', order_date)", "comment": "Month of order"}, - {"name": "Order Status", "expr": "status"}, - ], - measures=[ - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Total Revenue", "expr": "SUM(total_price)", "comment": "Sum of total price"}, - ], -) +## SQL Operations + +### Create Metric View + +```sql +CREATE OR REPLACE VIEW catalog.schema.orders_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + comment: "Orders KPIs for sales analysis" + source: catalog.schema.orders + filter: order_date > '2020-01-01' + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + comment: "Month of order" + - name: Order Status + expr: status + measures: + - name: Order Count + expr: COUNT(1) + - name: Total Revenue + expr: SUM(total_price) + comment: "Sum of total price" +$$; ``` -### Query via MCP - -```python -manage_metric_views( - action="query", - full_name="catalog.schema.orders_metrics", - query_measures=["Total Revenue", "Order Count"], - query_dimensions=["Order Month"], - where="extract(year FROM `Order Month`) = 2024", - order_by="ALL", - limit=100, -) +### Query Metric View + +```sql +SELECT + `Order Month`, + MEASURE(`Total Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count +FROM catalog.schema.orders_metrics +WHERE extract(year FROM `Order Month`) = 2024 +GROUP BY ALL +ORDER BY ALL +LIMIT 100; ``` -### Describe via MCP +### Describe Metric View -```python -manage_metric_views( - action="describe", - full_name="catalog.schema.orders_metrics", -) +```sql +DESCRIBE TABLE EXTENDED catalog.schema.orders_metrics; + +-- Or get YAML definition +SHOW CREATE TABLE catalog.schema.orders_metrics; ``` ### Grant Access -```python -manage_metric_views( - action="grant", - full_name="catalog.schema.orders_metrics", - principal="data-consumers", - privileges=["SELECT"], -) +```sql +GRANT SELECT ON VIEW catalog.schema.orders_metrics TO `data-consumers`; +``` + +### Drop Metric View + +```sql +DROP VIEW IF EXISTS catalog.schema.orders_metrics; +``` + +### CLI Execution + +```bash +# Execute SQL via CLI +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " +CREATE OR REPLACE VIEW catalog.schema.orders_metrics +WITH METRICS +LANGUAGE YAML +AS \$\$ + version: 1.1 + source: catalog.schema.orders + dimensions: + - name: Order Month + expr: DATE_TRUNC('MONTH', order_date) + measures: + - name: Total Revenue + expr: SUM(total_price) +\$\$ +" ``` ## YAML Spec Quick 
Reference diff --git a/databricks-skills/databricks-metric-views/patterns.md b/databricks-skills/databricks-metric-views/patterns.md index 1f067f4c..c109abcc 100644 --- a/databricks-skills/databricks-metric-views/patterns.md +++ b/databricks-skills/databricks-metric-views/patterns.md @@ -579,73 +579,81 @@ GROUP BY ALL ORDER BY ALL ``` -## MCP Tool Examples +## SQL Examples ### Create with joins -```python -manage_metric_views( - action="create", - full_name="catalog.schema.sales_metrics", - source="catalog.schema.fact_sales", - or_replace=True, - joins=[ - { - "name": "customer", - "source": "catalog.schema.dim_customer", - "on": "source.customer_id = customer.id" - }, - { - "name": "product", - "source": "catalog.schema.dim_product", - "on": "source.product_id = product.id" - } - ], - dimensions=[ - {"name": "Customer Segment", "expr": "customer.segment"}, - {"name": "Product Category", "expr": "product.category"}, - {"name": "Sale Month", "expr": "DATE_TRUNC('MONTH', source.sale_date)"}, - ], - measures=[ - {"name": "Total Revenue", "expr": "SUM(source.amount)"}, - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Unique Customers", "expr": "COUNT(DISTINCT source.customer_id)"}, - ], -) +```sql +CREATE OR REPLACE VIEW catalog.schema.sales_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + source: catalog.schema.fact_sales + joins: + - name: customer + source: catalog.schema.dim_customer + on: source.customer_id = customer.id + - name: product + source: catalog.schema.dim_product + on: source.product_id = product.id + dimensions: + - name: Customer Segment + expr: customer.segment + - name: Product Category + expr: product.category + - name: Sale Month + expr: DATE_TRUNC('MONTH', source.sale_date) + measures: + - name: Total Revenue + expr: SUM(source.amount) + - name: Order Count + expr: COUNT(1) + - name: Unique Customers + expr: COUNT(DISTINCT source.customer_id) +$$; ``` ### Alter to add a new measure -```python -manage_metric_views( - action="alter", - full_name="catalog.schema.sales_metrics", - source="catalog.schema.fact_sales", - joins=[ - {"name": "customer", "source": "catalog.schema.dim_customer", "on": "source.customer_id = customer.id"}, - ], - dimensions=[ - {"name": "Customer Segment", "expr": "customer.segment"}, - {"name": "Sale Month", "expr": "DATE_TRUNC('MONTH', source.sale_date)"}, - ], - measures=[ - {"name": "Total Revenue", "expr": "SUM(source.amount)"}, - {"name": "Order Count", "expr": "COUNT(1)"}, - {"name": "Average Order Value", "expr": "AVG(source.amount)"}, # New measure - ], -) +```sql +-- Use CREATE OR REPLACE to update the metric view +CREATE OR REPLACE VIEW catalog.schema.sales_metrics +WITH METRICS +LANGUAGE YAML +AS $$ + version: 1.1 + source: catalog.schema.fact_sales + joins: + - name: customer + source: catalog.schema.dim_customer + on: source.customer_id = customer.id + dimensions: + - name: Customer Segment + expr: customer.segment + - name: Sale Month + expr: DATE_TRUNC('MONTH', source.sale_date) + measures: + - name: Total Revenue + expr: SUM(source.amount) + - name: Order Count + expr: COUNT(1) + - name: Average Order Value + expr: AVG(source.amount) +$$; ``` ### Query with filters -```python -manage_metric_views( - action="query", - full_name="catalog.schema.sales_metrics", - query_measures=["Total Revenue", "Order Count"], - query_dimensions=["Customer Segment", "Sale Month"], - where="`Customer Segment` = 'Enterprise'", - order_by="ALL", - limit=50, -) +```sql +SELECT + `Customer Segment`, + `Sale Month`, + MEASURE(`Total 
Revenue`) AS total_revenue, + MEASURE(`Order Count`) AS order_count +FROM catalog.schema.sales_metrics +WHERE `Customer Segment` = 'Enterprise' +GROUP BY ALL +ORDER BY ALL +LIMIT 50; ``` diff --git a/databricks-skills/databricks-model-serving/1-classical-ml.md b/databricks-skills/databricks-model-serving/1-classical-ml.md index 4b973e0a..42b6a016 100644 --- a/databricks-skills/databricks-model-serving/1-classical-ml.md +++ b/databricks-skills/databricks-model-serving/1-classical-ml.md @@ -140,16 +140,14 @@ endpoint = w.serving_endpoints.create_and_wait( ## Query the Endpoint -### Via MCP Tool - -``` -manage_serving_endpoint( - action="query", - name="diabetes-predictor", - dataframe_records=[ - {"age": 45, "bmi": 25.3, "bp": 120, "s1": 200} - ] -) +### Via CLI + +```bash +databricks serving-endpoints query diabetes-predictor --json '{ + "dataframe_records": [ + {"age": 45, "bmi": 25.3, "bp": 120, "s1": 200} + ] +}' ``` ### Via Python SDK diff --git a/databricks-skills/databricks-model-serving/3-genai-agents.md b/databricks-skills/databricks-model-serving/3-genai-agents.md index 4061dbab..1f1f9f8b 100644 --- a/databricks-skills/databricks-model-serving/3-genai-agents.md +++ b/databricks-skills/databricks-model-serving/3-genai-agents.md @@ -221,10 +221,12 @@ for event in AGENT.predict_stream(request): print(event) ``` -Run via MCP: +Run via CLI: -``` -execute_code(file_path="./my_agent/test_agent.py") +```bash +# Upload and run on Databricks +databricks workspace import-dir ./my_agent /Workspace/Users//my_agent +databricks jobs run-now JOB_ID # JOB_ID is positional; job runs test_agent.py ``` ## Logging the Agent @@ -267,18 +269,16 @@ from databricks import agents agents.deploy( "main.agents.my_agent", version="1", - tags={"source": "mcp"} + tags={"source": "cli"} ) # Takes ~15 minutes ``` ## Query Deployed Agent -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "What is Databricks?"}], - max_tokens=500 -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500 +}' ``` diff --git a/databricks-skills/databricks-model-serving/5-development-testing.md b/databricks-skills/databricks-model-serving/5-development-testing.md index 2a3806cf..71970aa9 100644 --- a/databricks-skills/databricks-model-serving/5-development-testing.md +++ b/databricks-skills/databricks-model-serving/5-development-testing.md @@ -1,8 +1,6 @@ # Development & Testing Workflow -MCP-based workflow for developing and testing agents on Databricks. - -> **If MCP tools are not available**, use Databricks CLI or the Python SDK directly. See [Databricks CLI docs](https://docs.databricks.com/dev-tools/cli/) for `databricks workspace import` and `databricks clusters spark-submit` commands. +CLI-based workflow for developing and testing agents on Databricks. ## Overview @@ -13,17 +11,17 @@ MCP-based workflow for developing and testing agents on Databricks. 
โ–ผ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ Step 2: Upload to workspace โ”‚ -โ”‚ โ†’ manage_workspace_files MCP tool โ”‚ +โ”‚ โ†’ databricks workspace import-dir โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ–ผ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ Step 3: Install packages โ”‚ -โ”‚ โ†’ execute_code MCP tool โ”‚ +โ”‚ โ†’ databricks jobs (serverless with pip requirements) โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ โ–ผ โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ” โ”‚ Step 4: Test agent (iterate) โ”‚ -โ”‚ โ†’ execute_code MCP tool (with file_path) โ”‚ +โ”‚ โ†’ databricks jobs run-now โ”‚ โ”‚ โ†’ If error: fix locally, re-upload, re-run โ”‚ โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜ ``` @@ -85,17 +83,13 @@ print("Response:", result.model_dump(exclude_none=True)) ## Step 2: Upload to Workspace -Use the `manage_workspace_files` MCP tool: +Use the Databricks CLI: -``` -manage_workspace_files( - action="upload", - local_path="./my_agent", - workspace_path="/Workspace/Users/you@company.com/my_agent" -) +```bash +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent ``` -This uploads all files in parallel. +This uploads all files recursively. ## Step 3: Install Packages @@ -135,8 +129,8 @@ execute_code( 1. Read the error from the output 2. Fix the local file (`agent.py` or `test_agent.py`) -3. Re-upload: `manage_workspace_files(action="upload", ...)` -4. Re-run: `execute_code(file_path=...)` +3. Re-upload: `databricks workspace import-dir ./my_agent /Workspace/.../my_agent` +4. 
Re-run the job ### Iteration Tips @@ -188,13 +182,12 @@ print(response.content) ## Workflow Summary -| Step | MCP Tool | Purpose | -|------|----------|---------| -| Upload files | `manage_workspace_files` (action="upload") | Sync local files to workspace | -| Install packages | `execute_code` | Set up dependencies | -| Restart Python | `execute_code` | Apply package changes | -| Test agent | `execute_code` (with `file_path`) | Run test script | -| Debug | `execute_code` | Quick checks | +| Step | CLI Command | Purpose | +|------|-------------|---------| +| Upload files | `databricks workspace import-dir` | Sync local files to workspace | +| Install packages | Job with pip requirements | Set up dependencies | +| Test agent | `databricks jobs run-now` | Run test script | +| Debug | Run notebook or script | Quick checks | ## Next Steps diff --git a/databricks-skills/databricks-model-serving/6-logging-registration.md b/databricks-skills/databricks-model-serving/6-logging-registration.md index cd687358..2413fd75 100644 --- a/databricks-skills/databricks-model-serving/6-logging-registration.md +++ b/databricks-skills/databricks-model-serving/6-logging-registration.md @@ -60,10 +60,12 @@ uc_model_info = mlflow.register_model( print(f"Registered: {uc_model_info.name} version {uc_model_info.version}") ``` -Run via MCP: +Run via CLI: -``` -execute_code(file_path="./my_agent/log_model.py") +```bash +# Upload and run on Databricks +databricks workspace import-dir ./my_agent /Workspace/Users//my_agent +databricks jobs run-now JOB_ID # JOB_ID is positional; job runs log_model.py ``` ## Resources for Auto Authentication @@ -141,7 +143,7 @@ mlflow.models.predict( ) ``` -Run via MCP (in log_model.py or separate file): +Run validation (in log_model.py or separate file): ```python # validate_model.py diff --git a/databricks-skills/databricks-model-serving/7-deployment.md b/databricks-skills/databricks-model-serving/7-deployment.md index 666cb168..14e4f69f 100644 --- a/databricks-skills/databricks-model-serving/7-deployment.md +++ b/databricks-skills/databricks-model-serving/7-deployment.md @@ -2,7 +2,7 @@ Deploy models to serving endpoints. Uses async job-based approach for agents (deployment takes ~15 min). -> **If MCP tools are not available**, use `databricks.agents.deploy()` directly in a notebook, or create jobs via CLI: `databricks jobs create --json @job.json` +> Use `databricks.agents.deploy()` directly in a notebook, or create jobs via CLI: `databricks jobs create --json @job.json` ## Deployment Options @@ -13,7 +13,7 @@ Deploy models to serving endpoints. Uses async job-based approach for agents (de ## GenAI Agent Deployment (Job-Based) -Since agent deployment takes ~15 minutes, use a job to avoid MCP timeouts. +Since agent deployment takes ~15 minutes, use a job for async deployment. 
### Step 1: Create Deployment Script @@ -32,7 +32,7 @@ print(f"Deploying {model_name} version {version}...") deployment = agents.deploy( model_name, version, - tags={"source": "mcp", "environment": "dev"} + tags={"source": "cli", "environment": "dev"} ) print(f"Deployment complete!") @@ -41,40 +41,40 @@ print(f"Endpoint: {deployment.endpoint_name}") ### Step 2: Create Deployment Job (One-Time) -Use the `manage_jobs` MCP tool with action="create": - -``` -manage_jobs( - action="create", - name="deploy-agent-job", - tasks=[ - { - "task_key": "deploy", - "spark_python_task": { - "python_file": "/Workspace/Users/you@company.com/my_agent/deploy_agent.py", - "parameters": ["{{job.parameters.model_name}}", "{{job.parameters.version}}"] - } - } - ], - parameters=[ - {"name": "model_name", "default": "main.agents.my_agent"}, - {"name": "version", "default": "1"} - ] -) +Use the Databricks CLI: + +```bash +databricks jobs create --json '{ + "name": "deploy-agent-job", + "tags": {"aidevkit_project": "ai-dev-kit"}, + "tasks": [{ + "task_key": "deploy", + "spark_python_task": { + "python_file": "/Workspace/Users/you@company.com/my_agent/deploy_agent.py", + "parameters": ["{{job.parameters.model_name}}", "{{job.parameters.version}}"] + }, + "new_cluster": { + "spark_version": "16.1.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0 + } + }], + "parameters": [ + {"name": "model_name", "default": "main.agents.my_agent"}, + {"name": "version", "default": "1"} + ] +}' ``` Save the returned `job_id`. ### Step 3: Run Deployment (Async) -Use `manage_job_runs` with action="run_now" - returns immediately: +Run the job - returns immediately: -``` -manage_job_runs( - action="run_now", - job_id="", - job_parameters={"model_name": "main.agents.my_agent", "version": "1"} -) +```bash +# Note: job_id is positional, parameters go in --json +databricks jobs run-now --json '{"job_id": , "job_parameters": {"model_name": "main.agents.my_agent", "version": "1"}}' ``` Save the returned `run_id`. @@ -83,14 +83,15 @@ Save the returned `run_id`. Check job run status: -``` -manage_job_runs(action="get", run_id="") +```bash +# run_id is positional +databricks jobs get-run ``` Or check endpoint directly: -``` -manage_serving_endpoint(action="get", name="") +```bash +databricks serving-endpoints get ``` ## Classical ML Deployment @@ -163,7 +164,7 @@ deployment = agents.deploy( "main.agents.my_agent", "1", endpoint_name="my-agent-endpoint", # Control the name - tags={"source": "mcp", "environment": "dev"} + tags={"source": "cli", "environment": "dev"} ) ``` @@ -172,7 +173,7 @@ deployment = agents.deploy( Endpoints created via `agents.deploy()` appear under **Serving** in the Databricks UI. If you don't see your endpoint: 1. **Check the filter** - The Serving page defaults to "Owned by me". If the deployment ran as a service principal (e.g., via a job), switch to "All" to see it. -2. **Verify via API** - Use `manage_serving_endpoint(action="list")` or `manage_serving_endpoint(action="get", name="...")` to confirm the endpoint exists and check its state. +2. **Verify via CLI** - Use `databricks serving-endpoints list` or `databricks serving-endpoints get ` to confirm the endpoint exists and check its state. 3. **Check the name** - The auto-generated name may not be what you expect. Print `deployment.endpoint_name` in the deploy script or check the job run output. ### Deployment Script with Explicit Naming @@ -261,18 +262,18 @@ client.update_endpoint( ## Workflow Summary -| Step | MCP Tool | Waits? 
| -|------|----------|--------| -| Upload deploy script | `manage_workspace_files` (action="upload") | Yes | -| Create job (one-time) | `manage_jobs` (action="create") | Yes | -| Run deployment | `manage_job_runs` (action="run_now") | **No** - returns immediately | -| Check job status | `manage_job_runs` (action="get") | Yes | -| Check endpoint status | `manage_serving_endpoint` (action="get") | Yes | +| Step | CLI Command | Waits? | +|------|-------------|--------| +| Upload deploy script | `databricks workspace import-dir` | Yes | +| Create job (one-time) | `databricks jobs create` | Yes | +| Run deployment | `databricks jobs run-now` | **No** - returns immediately | +| Check job status | `databricks jobs get-run` | Yes | +| Check endpoint status | `databricks serving-endpoints get` | Yes | ## After Deployment Once endpoint is READY: -1. **Test with MCP**: `manage_serving_endpoint(action="query", name="...", messages=[...])` +1. **Test with CLI**: `databricks serving-endpoints query --json '{"messages": [...]}'` 2. **Share with team**: Endpoint URL in Databricks UI 3. **Integrate in apps**: Use REST API or SDK diff --git a/databricks-skills/databricks-model-serving/8-querying-endpoints.md b/databricks-skills/databricks-model-serving/8-querying-endpoints.md index 4dfa2f91..2cebb0c1 100644 --- a/databricks-skills/databricks-model-serving/8-querying-endpoints.md +++ b/databricks-skills/databricks-model-serving/8-querying-endpoints.md @@ -2,87 +2,41 @@ Send requests to deployed Model Serving endpoints. -> **If MCP tools are not available**, use the Python SDK or REST API examples below. - -## MCP Tools +## CLI Commands ### Check Endpoint Status Before querying, verify the endpoint is ready: -``` -manage_serving_endpoint(action="get", name="my-agent-endpoint") -``` - -Response: -```json -{ - "name": "my-agent-endpoint", - "state": "READY", - "served_entities": [ - {"name": "my_agent-1", "entity_name": "main.agents.my_agent", "deployment_state": "READY"} - ] -} +```bash +databricks serving-endpoints get my-agent-endpoint ``` ### Query Chat/Agent Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[ - {"role": "user", "content": "What is Databricks?"} - ], - max_tokens=500, - temperature=0.7 -) -``` - -Response: -```json -{ - "choices": [ - { - "message": { - "role": "assistant", - "content": "Databricks is a unified data intelligence platform..." 
- }, - "finish_reason": "stop" - } - ], - "usage": { - "prompt_tokens": 10, - "completion_tokens": 150, - "total_tokens": 160 - } -} +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500, + "temperature": 0.7 +}' ``` ### Query ML Model Endpoint -``` -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[ - {"age": 25, "income": 50000, "credit_score": 720}, - {"age": 35, "income": 75000, "credit_score": 680} - ] -) -``` - -Response: -```json -{ - "predictions": [0.85, 0.72] -} +```bash +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [ + {"age": 25, "income": 50000, "credit_score": 720}, + {"age": 35, "income": 75000, "credit_score": 680} + ] +}' ``` ### List All Endpoints -``` -manage_serving_endpoint(action="list", limit=20) +```bash +databricks serving-endpoints list ``` ## Python SDK diff --git a/databricks-skills/databricks-model-serving/9-package-requirements.md b/databricks-skills/databricks-model-serving/9-package-requirements.md index f9ceb7a9..e5508b6a 100644 --- a/databricks-skills/databricks-model-serving/9-package-requirements.md +++ b/databricks-skills/databricks-model-serving/9-package-requirements.md @@ -137,24 +137,23 @@ export DATABRICKS_TOKEN="your-token" export DATABRICKS_CONFIG_PROFILE="your-profile" ``` -## Installing Packages via MCP +## Installing Packages -Use `execute_code`: +In a notebook or Python script on Databricks: -``` -execute_code( - code="%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic" -) +```python +%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic +dbutils.library.restartPython() ``` -Then restart Python: +Or via job libraries configuration: -``` -execute_code( - code="dbutils.library.restartPython()", - cluster_id="", - context_id="" -) +```json +"libraries": [ + {"pypi": {"package": "mlflow==3.6.0"}}, + {"pypi": {"package": "databricks-langchain"}}, + {"pypi": {"package": "langgraph==0.3.4"}} +] ``` ## Checking Installed Versions @@ -171,17 +170,13 @@ for pkg in packages: print(f"{pkg}: NOT INSTALLED") ``` -Via MCP: +In a notebook: -``` -execute_code( - code=""" +```python import pkg_resources for pkg in ['mlflow', 'langchain', 'langgraph', 'pydantic', 'databricks-langchain']: try: print(f"{pkg}: {pkg_resources.get_distribution(pkg).version}") except: print(f"{pkg}: NOT INSTALLED") - """ -) ``` diff --git a/databricks-skills/databricks-model-serving/SKILL.md b/databricks-skills/databricks-model-serving/SKILL.md index 74160298..448d769f 100644 --- a/databricks-skills/databricks-model-serving/SKILL.md +++ b/databricks-skills/databricks-model-serving/SKILL.md @@ -82,59 +82,40 @@ ALWAYS use exact endpoint names from this table. NEVER guess or abbreviate. 
| Custom PyFunc | [2-custom-pyfunc.md](2-custom-pyfunc.md) | Custom preprocessing, signatures | | GenAI Agents | [3-genai-agents.md](3-genai-agents.md) | ResponsesAgent, LangGraph | | Tools Integration | [4-tools-integration.md](4-tools-integration.md) | UC Functions, Vector Search | -| Development & Testing | [5-development-testing.md](5-development-testing.md) | MCP workflow, iteration | +| Development & Testing | [5-development-testing.md](5-development-testing.md) | CLI workflow, iteration | | Logging & Registration | [6-logging-registration.md](6-logging-registration.md) | mlflow.pyfunc.log_model | | Deployment | [7-deployment.md](7-deployment.md) | Job-based async deployment | -| Querying Endpoints | [8-querying-endpoints.md](8-querying-endpoints.md) | SDK, REST, MCP tools | +| Querying Endpoints | [8-querying-endpoints.md](8-querying-endpoints.md) | CLI, SDK, REST | | Package Requirements | [9-package-requirements.md](9-package-requirements.md) | DBR versions, pip | --- ## Quick Start: Deploy a GenAI Agent -### Step 1: Install Packages (in notebook or via MCP) +### Step 1: Install Packages (in notebook) ```python %pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic dbutils.library.restartPython() ``` -Or via MCP: -``` -execute_code(code="%pip install -U mlflow==3.6.0 databricks-langchain langgraph==0.3.4 databricks-agents pydantic") -``` - ### Step 2: Create Agent File Create `agent.py` locally with `ResponsesAgent` pattern (see [3-genai-agents.md](3-genai-agents.md)). ### Step 3: Upload to Workspace -``` -manage_workspace_files( - action="upload", - local_path="./my_agent", - workspace_path="/Workspace/Users/you@company.com/my_agent" -) +```bash +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent ``` ### Step 4: Test Agent -``` -execute_code( - file_path="./my_agent/test_agent.py", - cluster_id="" -) -``` +Run `test_agent.py` on a cluster to validate the agent works. ### Step 5: Log Model -``` -execute_code( - file_path="./my_agent/log_model.py", - cluster_id="" -) -``` +Run `log_model.py` on a cluster to register the model in Unity Catalog. ### Step 6: Deploy (Async via Job) @@ -142,12 +123,10 @@ See [7-deployment.md](7-deployment.md) for job-based deployment that doesn't tim ### Step 7: Query Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "Hello!"}] -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "Hello!"}] +}' ``` --- @@ -174,55 +153,55 @@ Then deploy via UI or SDK. See [1-classical-ml.md](1-classical-ml.md). --- -## MCP Tools +## CLI Commands + +### Endpoint Management + +```bash +# List all serving endpoints +databricks serving-endpoints list -> **If MCP tools are not available**, use the SDK/CLI examples in the reference files below. 
+# Get endpoint details and status +databricks serving-endpoints get my-agent-endpoint -### Development & Testing +# By default, after creation, tag serving endpoints to track resources created with this skill +databricks serving-endpoints patch my-agent-endpoint --json '{ + "add_tags": [{"key": "aidevkit_project", "value": "ai-dev-kit"}] +}' -| Tool | Purpose | -|------|---------| -| `manage_workspace_files` (action="upload") | Upload agent files to workspace | -| `execute_code` | Install packages, test agent, log model | +# Query a chat/agent endpoint +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 500 +}' -### Deployment +# Query a traditional ML endpoint +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [{"age": 25, "income": 50000, "credit_score": 720}] +}' +``` -| Tool | Purpose | -|------|---------| -| `manage_jobs` (action="create") | Create deployment job (one-time) | -| `manage_job_runs` (action="run_now") | Kick off deployment (async) | -| `manage_job_runs` (action="get") | Check deployment job status | +### Workspace File Operations -### manage_serving_endpoint - Querying +```bash +# Upload agent files to workspace +databricks workspace import-dir ./my_agent /Workspace/Users/you@company.com/my_agent -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `get` | Check endpoint status (READY/NOT_READY/NOT_FOUND) | name | -| `list` | List all endpoints | (none, optional limit) | -| `query` | Send requests to endpoint | name + one of: messages, inputs, dataframe_records | +# List workspace files +databricks workspace list /Workspace/Users/you@company.com/my_agent +``` -**Example usage:** -```python -# Check endpoint status -manage_serving_endpoint(action="get", name="my-agent-endpoint") +### Jobs for Deployment -# List all endpoints -manage_serving_endpoint(action="list") +```bash +# Create a deployment job +databricks jobs create --json @deploy_job.json -# Query a chat/agent endpoint -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[{"role": "user", "content": "Hello!"}], - max_tokens=500 -) +# Run the deployment job (JOB_ID is positional) +databricks jobs run-now JOB_ID -# Query a traditional ML endpoint -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[{"age": 25, "income": 50000, "credit_score": 720}] -) +# Check job run status (RUN_ID is positional) +databricks jobs get-run RUN_ID ``` --- @@ -231,42 +210,27 @@ manage_serving_endpoint( ### Check Endpoint Status After Deployment -``` -manage_serving_endpoint(action="get", name="my-agent-endpoint") +```bash +databricks serving-endpoints get my-agent-endpoint ``` -Returns: -```json -{ - "name": "my-agent-endpoint", - "state": "READY", - "served_entities": [...] -} -``` +Returns JSON with endpoint status (`READY`, `NOT_READY`, etc.). 
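+
+If you need to block until the endpoint is ready before querying it, a small polling loop is enough. This is only a sketch — the `"ready"` field name is an assumption about the JSON that `databricks serving-endpoints get` prints, so adjust the pattern to whatever your CLI version returns:
+
+```bash
+# Sketch: wait for the endpoint to report READY (field name is an assumption)
+until databricks serving-endpoints get my-agent-endpoint | grep -q '"ready": *"READY"'; do
+  echo "Endpoint not ready yet, waiting 30s..."
+  sleep 30
+done
+echo "Endpoint is READY"
+```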
### Query a Chat/Agent Endpoint -``` -manage_serving_endpoint( - action="query", - name="my-agent-endpoint", - messages=[ - {"role": "user", "content": "What is Databricks?"} - ], - max_tokens=500 -) +```bash +databricks serving-endpoints query my-agent-endpoint --json '{ + "messages": [{"role": "user", "content": "What is Databricks?"}], + "max_tokens": 500 +}' ``` ### Query a Traditional ML Endpoint -``` -manage_serving_endpoint( - action="query", - name="sklearn-classifier", - dataframe_records=[ - {"age": 25, "income": 50000, "credit_score": 720} - ] -) +```bash +databricks serving-endpoints query sklearn-classifier --json '{ + "dataframe_records": [{"age": 25, "income": 50000, "credit_score": 720}] +}' ``` --- diff --git a/databricks-skills/databricks-python-sdk/SKILL.md b/databricks-skills/databricks-python-sdk/SKILL.md index eaf7cd66..bbe2dada 100644 --- a/databricks-skills/databricks-python-sdk/SKILL.md +++ b/databricks-skills/databricks-python-sdk/SKILL.md @@ -91,7 +91,7 @@ databricks --profile MY_PROFILE clusters list # Common commands databricks clusters list databricks jobs list -databricks workspace ls /Users/me +databricks workspace list /Users/me ``` --- @@ -622,4 +622,4 @@ If I'm unsure about a method, I should: - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - catalog governance - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving endpoint management - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - vector index operations -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - managed PostgreSQL via SDK +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - managed PostgreSQL with autoscaling + branching diff --git a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md index a1bdd7c3..5e065d52 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/SKILL.md @@ -39,10 +39,10 @@ description: "Creates, configures, and updates Databricks Lakeflow Spark Declara - When the user provides table schema and asks for code, respond directly with the code. Don't ask clarifying questions if the request is clear. 
## Tools -- List files in volume: `databricks fs ls dbfs:/Volumes/{catalog}/{schema}/{volume}/{path} --profile {PROFILE}` -- Query data: `databricks experimental aitools tools query --profile {PROFILE} --warehouse abc123 "SELECT 1 FROM catalog.schema.table"` -- Discover schema: `databricks experimental aitools tools discover-schema --profile {PROFILE} catalog.schema.table1 catalog.schema.table2` -- Pipelines CLI: `databricks pipelines init|deploy|run|logs|stop` or use `databricks pipelines --help` for more options +- List files in volume: `databricks fs ls dbfs:/Volumes/{catalog}/{schema}/{volume}/{path}` (the `dbfs:` prefix is required even for UC Volume paths) +- Query data: `databricks experimental aitools tools query --warehouse abc123 "SELECT 1 FROM catalog.schema.table"` +- Discover schema: `databricks experimental aitools tools discover-schema catalog.schema.table1 catalog.schema.table2` +- Pipelines CLI: `databricks pipelines create|get|delete|start-update|list-pipelines` or use `databricks pipelines --help` for more options ## Choose Your Workflow @@ -83,15 +83,14 @@ Use this when the pipeline is **part of an existing DAB project**: โ†’ See [1-project-initialization.md](references/1-project-initialization.md) for adding pipelines to existing bundles -### Option C: Rapid Iteration with MCP Tools (no bundle management) +### Option C: Rapid Iteration with CLI (no bundle management, or you'll create the DAB at the end) Use this when you need to **quickly create, test, and iterate** on a pipeline without managing bundle files: - User wants to "just run a pipeline and see if it works" - Part of a larger demo where bundle is managed separately, or the DAB bundle will be created at the end as you want to quickly test the project first - Prototyping or experimenting with pipeline logic -- User explicitly asks to use MCP tools -โ†’ See [2-mcp-approach.md](references/2-mcp-approach.md) for MCP-based workflow +โ†’ See [2-cli-approach.md](references/2-cli-approach.md) for CLI-based workflow --- @@ -101,7 +100,7 @@ Before writing pipeline code, make sure you have: ``` - [ ] Language selected: Python or SQL - [ ] Read the syntax basics: **SQL**: Always Read [sql/1-syntax-basics.md](references/sql/1-syntax-basics.md), **Python**: Always Read [python/1-syntax-basics.md](references/python/1-syntax-basics.md) -- [ ] Workflow chosen: Standalone DAB / Existing DAB / MCP iteration +- [ ] Workflow chosen: Standalone DAB / Existing DAB / CLI iteration - [ ] Compute type: serverless (default) or classic - [ ] Schema strategy: single schema with prefixes vs. 
multi-schema - [ ] Consider [Multi-Schema Patterns](#multi-schema-patterns) and [Modern Defaults](#modern-defaults) @@ -179,7 +178,7 @@ After choosing your workflow (see [Choose Your Workflow](#choose-your-workflow)) | Task | Guide | |------|-------| | **Setting up standalone pipeline project** | [1-project-initialization.md](references/1-project-initialization.md) | -| **Rapid iteration with MCP tools** | [2-mcp-approach.md](references/2-mcp-approach.md) | +| **Rapid iteration with CLI** | [2-cli-approach.md](references/2-cli-approach.md) | | **Advanced configuration** | [3-advanced-configuration.md](references/3-advanced-configuration.md) | | **Migrating from DLT** | [4-dlt-migration.md](references/4-dlt-migration.md) | @@ -248,7 +247,7 @@ For detailed syntax, see [sql/1-syntax-basics.md](references/sql/1-syntax-basics ### Project Structure - **Standalone pipeline projects**: Use `databricks pipelines init` for Asset Bundle with multi-environment support - **Pipeline in existing bundle**: Add to `resources/*.pipeline.yml` -- **Rapid iteration/prototyping**: Use MCP tools, formalize in bundle later +- **Rapid iteration/prototyping**: Use CLI/SDK, formalize in bundle later - See **[1-project-initialization.md](references/1-project-initialization.md)** for project setup details ### Minimal pipeline config pointers @@ -257,7 +256,7 @@ For detailed syntax, see [sql/1-syntax-basics.md](references/sql/1-syntax-basics ### Modern Defaults - **Always use raw `.sql`/`.py` files for the transformations files** - NO notebooks in your pipeline. Pipeline code must be plain files. -- **Databricks notebook source for explorations** - Use `# Databricks notebook source` format with `# COMMAND ----------` separators for ad-hoc queries. See [examples/exploration_notebook.py](scripts/exploration_notebook.py). +- **Databricks notebook source for explorations** - Use `# Databricks notebook source` format with `# COMMAND ----------` separators for ad-hoc queries. See [scripts/exploration_notebook.py](scripts/exploration_notebook.py). - **Serverless compute** - Do not use classic clusters unless explicitly required (R, RDD APIs, JAR libraries) - **Unity Catalog** (required for serverless) - **CLUSTER BY** (Liquid Clustering), not PARTITION BY with ZORDER - see [sql/5-performance.md](references/sql/5-performance.md) or [python/5-performance.md](references/python/5-performance.md) @@ -278,30 +277,58 @@ For detailed examples, see **[3-advanced-configuration.md](references/3-advanced ## Post-Run Validation (Required) -After running a pipeline (via DAB or MCP), you **MUST** validate both the execution status AND the actual data. +After running a pipeline (via DAB or CLI), you **MUST** validate both the execution status AND the actual data. ### Step 1: Check Pipeline Execution Status -**From MCP (`manage_pipeline(action="run")` or `manage_pipeline(action="create_or_update")`):** -- Check `result["success"]` and `result["state"]` -- If failed, check `result["message"]` and `result["errors"]` for details +A freshly created pipeline has `state: IDLE` and `latest_updates: null` until you trigger the first run with `start-update`. `list-pipeline-events` returns a bare JSON array (not `{"events": [...]}`). For DAB runs, also check `databricks bundle run` output. -**From DAB (`databricks bundle run`):** -- Check the command output for success/failure -- Use `manage_pipeline(action="get", pipeline_id=...)` to get detailed status and recent events +```bash +# Kick off (or re-run) a pipeline. 
--full-refresh reprocesses everything +# from scratch (destructive on streaming state); omit for incremental. +databricks pipelines start-update +databricks pipelines start-update --full-refresh + +# Poll status. The (.latest_updates // [{}]) guard handles the null case +# on a never-run pipeline so jq doesn't crash. +databricks pipelines get \ + | jq '{state, latest: (.latest_updates // [{}])[0] | {state, update_id, creation_time}}' + +# Surface just failures from the event log +databricks pipelines list-pipeline-events \ + | jq '[.[] | select(.level=="ERROR" or .level=="WARN") | {level, event_type, message: (.message // "")[0:200]}] | .[0:10]' +``` + +If a pipeline is already RUNNING, `start-update` queues the new update; force-stop with `databricks pipelines stop ` first if needed. + +### Updating a Pipeline (edit โ†’ re-upload โ†’ restart) + +Use `--format RAW --overwrite` โ€” pipelines use raw `.sql`/`.py` FILE entries. `--format SOURCE --language SQL|PYTHON` uploads a workspace notebook instead and **notebooks are deprecated for pipelines**; mixing the two on the same path fails with `Cannot overwrite the asset ... due to type mismatch (asked: NOTEBOOK, actual: FILE)`. + +```bash +# Single file +databricks workspace import /Workspace/Users//pipeline/07_gold.sql \ + --file ./src/pipeline/07_gold.sql --format RAW --overwrite + +# Whole directory +databricks workspace import-dir ./src/pipeline /Workspace/Users//pipeline --overwrite +``` + +After re-uploading, trigger a new run with `databricks pipelines start-update ` (see Step 1 above for the full polling pattern). ### Step 2: Validate Output Data Even if the pipeline reports SUCCESS, you **MUST** verify the data is correct: +```bash +# Check schema, row counts, sample data, and null counts for all tables +databricks experimental aitools tools discover-schema \ + my_catalog.my_schema.bronze_orders \ + my_catalog.my_schema.silver_orders \ + my_catalog.my_schema.gold_summary ``` -# MCP Tool: get_table_stats_and_schema - validates schema, row counts, and stats -get_table_stats_and_schema( - catalog="my_catalog", - schema="my_schema", - table_names=["bronze_*", "silver_*", "gold_*"] # Use glob patterns -) -``` + +This returns per table: columns/types, 5 sample rows, total_rows count, and null counts per column. **Check for:** - Empty tables (row_count = 0) - indicates ingestion or filtering issues @@ -314,7 +341,7 @@ get_table_stats_and_schema( If validation reveals problems, trace upstream to find the root cause: 1. **Start from the problematic table** - identify what's wrong (empty, wrong counts, bad data) -2. **Check its source table** - use `get_table_stats_and_schema` on the upstream table +2. **Check its source table** - run `DESCRIBE` and `COUNT(*)` on the upstream table 3. **Trace back to bronze** - continue until you find where the issue originates 4. **Common causes:** - Bronze empty โ†’ source files missing or path incorrect @@ -324,7 +351,7 @@ If validation reveals problems, trace upstream to find the root cause: 5. **Fix the SQL/Python code**, re-upload, and re-run the pipeline -**Do NOT use `execute_sql` with COUNT queries for validation** - `get_table_stats_and_schema` is faster and returns more information in a single call. +**Use `discover-schema` for validation** - it returns schema, row counts, sample data, and null counts in a single call. 
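+
+For example, a quick upstream check might look like this (catalog, schema, table, and column names and the warehouse ID are illustrative placeholders):
+
+```bash
+# One call per upstream table: schema, row count, sample rows, and null counts
+databricks experimental aitools tools discover-schema my_catalog.my_schema.bronze_orders
+
+# Spot-check a suspect filter condition against the upstream table
+databricks experimental aitools tools query --warehouse abc123 \
+  "SELECT COUNT(*) AS surviving_rows FROM my_catalog.my_schema.bronze_orders WHERE order_status IS NOT NULL"
+```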
--- @@ -332,17 +359,20 @@ If validation reveals problems, trace upstream to find the root cause: | Issue | Solution | |-------|----------| -| **Empty output tables** | Use `get_table_stats_and_schema` to check upstream sources. Verify source files exist and paths are correct. | +| **"Only SQL, Scala and Python notebooks are supported"** | Use `{"file": {"path": "..."}}` instead of `{"notebook": {"path": "..."}}` for raw SQL files. `notebook` is for Databricks notebook format only. | +| **Empty output tables** | Use `discover-schema` to check upstream tables. Verify source files exist and paths are correct. | | **Pipeline stuck INITIALIZING** | Normal for serverless, wait a few minutes | | **"Column not found"** | Check `schemaHints` match actual data | -| **Streaming reads fail** | For file ingestion in a streaming table, you must use the `STREAM` keyword with `read_files`: `FROM STREAM read_files(...)`. For table streams use `FROM stream(table)`. See [read_files โ€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | -| **Timeout during run** | Increase `timeout`, or use `wait_for_completion=False` and check status with `manage_pipeline(action="get")` | +| **Streaming reads fail** | Use `FROM STREAM read_files(...)` only for file ingestion; use `FROM stream(table)` for table-to-table streams. `FROM STREAM table` (no parens) parses but is legacy DLT โ€” prefer the function form. See [read_files โ€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | +| **Timeout during run** | Use `databricks pipelines get ` to check status | | **MV doesn't refresh** | Enable row tracking on source tables | | **SCD2: query column not found** | Lakeflow uses `__START_AT` and `__END_AT` (double underscore), not `START_AT`/`END_AT`. Use `WHERE __END_AT IS NULL` for current rows. See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). | | **AUTO CDC parse error at APPLY/SEQUENCE** | Put `APPLY AS DELETE WHEN` **before** `SEQUENCE BY`. Only list columns in `COLUMNS * EXCEPT (...)` that exist in the source (omit `_rescued_data` unless bronze uses rescue data). Omit `TRACK HISTORY ON *` if it causes "end of input" errors; default is equivalent. See [sql/4-cdc-patterns.md](references/sql/4-cdc-patterns.md). | | **"Cannot create streaming table from batch query"** | In a streaming table query, use `FROM STREAM read_files(...)` so `read_files` leverages Auto Loader; `FROM read_files(...)` alone is batch. See [sql/2-ingestion.md](references/sql/2-ingestion.md) and [read_files โ€” Usage in streaming tables](https://docs.databricks.com/aws/en/sql/language-manual/functions/read_files#usage-in-streaming-tables). | +| **"Paths must end with .py or .sql"** on `pipelines create` | `{"file": {"path": ...}}` needs a single file. Use `{"glob": {"include": "/**"}}` for a directory, or enumerate files individually. | +| **`type mismatch (asked: NOTEBOOK, actual: FILE)`** on `workspace import` | Existing path is a FILE (raw `.sql`/`.py`). Re-upload with `--format RAW --overwrite`, not `--format SOURCE --language SQL` (creates a NOTEBOOK โ€” deprecated for pipelines). | -**For detailed errors**, the `result["message"]` from `manage_pipeline(action="create_or_update")` includes suggested next steps. Use `manage_pipeline(action="get", pipeline_id=...)` which includes recent events and error details. 
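+
+For the "MV doesn't refresh" row, row tracking can be enabled on the source table with a one-off `ALTER TABLE` against a SQL warehouse — a sketch, assuming an illustrative table name and warehouse ID:
+
+```bash
+# Enable row tracking so downstream materialized views can refresh incrementally
+databricks experimental aitools tools query --warehouse abc123 \
+  "ALTER TABLE my_catalog.my_schema.silver_orders SET TBLPROPERTIES ('delta.enableRowTracking' = 'true')"
+```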
+**For detailed errors**, use `databricks pipelines get ` which includes recent events, or `databricks pipelines list-pipeline-events ` for full event history. --- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md index fbab69b3..40cc8d4a 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/1-project-initialization.md @@ -232,8 +232,8 @@ databricks bundle run customer_pipeline_etl # Run specific target databricks bundle run customer_pipeline_etl --target prod -# Or use Pipeline API directly -databricks pipelines start-update --pipeline-id +# Or use Pipeline API directly (pipeline_id is positional) +databricks pipelines start-update ``` --- @@ -429,7 +429,7 @@ pip install --upgrade databricks-cli databricks catalogs list # Create catalog if needed -databricks catalogs create --name my_catalog +databricks catalogs create --json '{"name": "my_catalog"}' ``` ### "Language option not recognized" @@ -576,7 +576,7 @@ For technical best practices (Liquid Clustering, serverless, etc.), see **[SKILL ## References -- **[SKILL.md](../SKILL.md)** - Main development workflow and MCP tools +- **[SKILL.md](../SKILL.md)** - Main development workflow and CLI commands - **[Declarative Automation Bundles (DABs) Documentation](https://docs.databricks.com/dev-tools/bundles/)** - Official bundle reference - **[Pipeline Configuration Reference](https://docs.databricks.com/aws/en/ldp/configure-pipeline)** - Pipeline settings - **[Databricks CLI Reference](https://docs.databricks.com/dev-tools/cli/)** - CLI commands and options diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md new file mode 100644 index 00000000..cafe4046 --- /dev/null +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/2-cli-approach.md @@ -0,0 +1,174 @@ +# Rapid Pipeline Iteration with CLI + +Use CLI commands to create, run, and iterate on **SDP pipelines**. This is the fastest approach for prototyping without managing bundle files. + +**IMPORTANT: Default to serverless pipelines.** Only use classic clusters if user explicitly requires R language, Spark RDD APIs, or JAR libraries. + +### Step 1: Write Pipeline Files Locally + +Create `.sql` or `.py` files in a local folder. For syntax examples, see: +- [sql/1-syntax-basics.md](sql/1-syntax-basics.md) for SQL syntax +- [python/1-syntax-basics.md](python/1-syntax-basics.md) for Python syntax + +### Step 2: Upload to Databricks Workspace + +```bash +# Upload local folder to workspace +databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/my_pipeline +``` + +### Step 3: Create Pipeline + +```bash +# libraries: "file" = single .sql/.py file; "glob" = directory of files. +# A "file" pointing at a folder fails: "Paths must end with .py or .sql". +# "notebook" is deprecated โ€” use "file" or "glob". 
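+# Optional (assumes a recent CLI that accepts @file for --json): keep the spec in
+# pipeline.json and pass `--json @pipeline.json` instead of inlining it.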
+databricks pipelines create --json '{ + "name": "my_orders_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [ + {"glob": {"include": "/Workspace/Users/user@example.com/my_pipeline/**"}} + ], + "tags": {"aidevkit_project": "ai-dev-kit"}, + "development": true +}' + +# Enumerate files instead of glob: +# "libraries": [ +# {"file": {"path": "/Workspace/.../bronze/ingest_orders.sql"}}, +# {"file": {"path": "/Workspace/.../silver/clean_orders.sql"}} +# ] +``` + +Save the returned `pipeline_id` for subsequent operations. + +### Step 4: Run Pipeline + +```bash +# Start a full refresh run (pipeline_id is a positional argument) +databricks pipelines start-update --full-refresh + +# Check run status +databricks pipelines get +``` + +### Step 5: Validate Results + +**On Success** - Verify tables were created with correct data: + +```bash +# Check schema, row counts, sample data, and null counts for all tables +databricks experimental aitools tools discover-schema \ + my_catalog.my_schema.bronze_orders \ + my_catalog.my_schema.silver_orders \ + my_catalog.my_schema.gold_summary +``` + +This returns per table: columns/types, 5 sample rows, total_rows count, and null counts. + +Or use Python for detailed stats: +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +# Get table info +table = w.tables.get("my_catalog.my_schema.bronze_orders") +print(f"Columns: {len(table.columns)}") +print(f"Created: {table.created_at}") +``` + +**On Failure** - Get pipeline events and errors: + +```bash +# Get pipeline details with recent events (pipeline_id is positional) +databricks pipelines get + +# Get specific run events +databricks pipelines list-pipeline-events +``` + +### Step 6: Iterate Until Working + +1. Review errors from pipeline status or events +2. Fix issues in local files +3. Re-upload: `databricks workspace import-dir ./my_pipeline /Workspace/Users/user@example.com/my_pipeline --overwrite` +4. Update and run: `databricks pipelines update --json '...'` then `databricks pipelines start-update ` +5. 
Repeat until pipeline completes successfully + +--- + +## Quick Reference: CLI Commands + +### Pipeline Lifecycle + +| Command | Description | +|---------|-------------| +| `databricks pipelines create --json '{...}'` | Create new pipeline | +| `databricks pipelines get PIPELINE_ID` | Get pipeline details and status | +| `databricks pipelines update PIPELINE_ID --json '{...}'` | Update pipeline config | +| `databricks pipelines delete PIPELINE_ID` | Delete a pipeline | +| `databricks pipelines list-pipelines` | List all pipelines | + +### Run Management + +| Command | Description | +|---------|-------------| +| `databricks pipelines start-update PIPELINE_ID` | Start pipeline update | +| `databricks pipelines start-update PIPELINE_ID --full-refresh` | Start with full refresh | +| `databricks pipelines stop PIPELINE_ID` | Stop running pipeline | +| `databricks pipelines list-pipeline-events PIPELINE_ID` | Get events/logs | +| `databricks pipelines list-updates PIPELINE_ID` | List recent runs | + +### Supporting Commands + +| Command | Description | +|---------|-------------| +| `databricks workspace import-dir` | Upload files/folders to workspace | +| `databricks workspace list` | List workspace files | +| `databricks experimental aitools tools discover-schema` | Get schema, row counts, sample data, null counts | +| `databricks experimental aitools tools query` | Run ad-hoc SQL queries | + +--- + +## Python SDK Alternative + +For more programmatic control, use the Databricks SDK: + +```python +from databricks.sdk import WorkspaceClient + +w = WorkspaceClient() + +# Create pipeline - use "file" to include all .sql/.py files in a directory +pipeline = w.pipelines.create( + name="my_orders_pipeline", + catalog="my_catalog", + schema="my_schema", + serverless=True, + libraries=[ + {"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}} + ], + development=True +) +print(f"Created pipeline: {pipeline.pipeline_id}") + +# Start update +update = w.pipelines.start_update( + pipeline_id=pipeline.pipeline_id, + full_refresh=True +) + +# Poll for completion +import time +while True: + status = w.pipelines.get(pipeline_id=pipeline.pipeline_id) + if status.state in ["IDLE", "FAILED"]: + print(f"Pipeline state: {status.state}") + break + time.sleep(10) +``` + +--- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md b/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md deleted file mode 100644 index 87e0ed70..00000000 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/2-mcp-approach.md +++ /dev/null @@ -1,163 +0,0 @@ -Use MCP tools to create, run, and iterate on **SDP pipelines**. The **primary tool is `manage_pipeline`** which handles the entire lifecycle. - -**IMPORTANT: Default to serverless pipelines.** Only use classic clusters if user explicitly requires R language, Spark RDD APIs, or JAR libraries. - -### Step 1: Write Pipeline Files Locally - -Create `.sql` or `.py` files in a local folder. 
For syntax examples, see: -- [sql/1-syntax-basics.md](sql/1-syntax-basics.md) for SQL syntax -- [python/1-syntax-basics.md](python/1-syntax-basics.md) for Python syntax - -### Step 2: Upload to Databricks Workspace - -``` -# MCP Tool: manage_workspace_files -manage_workspace_files( - action="upload", - local_path="/path/to/my_pipeline", - workspace_path="/Workspace/Users/user@example.com/my_pipeline" -) -``` - -### Step 3: Create/Update and Run Pipeline - -Use **`manage_pipeline`** with `action="create_or_update"` to manage the resource: - -``` -# MCP Tool: manage_pipeline -manage_pipeline( - action="create_or_update", - name="my_orders_pipeline", - root_path="/Workspace/Users/user@example.com/my_pipeline", - catalog="my_catalog", - schema="my_schema", - workspace_file_paths=[ - "/Workspace/Users/user@example.com/my_pipeline/bronze/ingest_orders.sql", - "/Workspace/Users/user@example.com/my_pipeline/silver/clean_orders.sql", - "/Workspace/Users/user@example.com/my_pipeline/gold/daily_summary.sql" - ], - start_run=True, # Automatically run after create/update - wait_for_completion=True, # Wait for run to finish - full_refresh=True # Reprocess all data -) -``` - -**Result contains actionable information:** -```json -{ - "success": true, - "pipeline_id": "abc-123", - "pipeline_name": "my_orders_pipeline", - "created": true, - "state": "COMPLETED", - "catalog": "my_catalog", - "schema": "my_schema", - "duration_seconds": 45.2, - "message": "Pipeline created and completed successfully in 45.2s. Tables written to my_catalog.my_schema", - "error_message": null, - "errors": [] -} -``` - -### Alternative: Run Pipeline Separately - -If you want to run an existing pipeline or control the run separately: - -``` -# MCP Tool: manage_pipeline_run -manage_pipeline_run( - action="start", - pipeline_id="", - full_refresh=True, - wait=True, # Wait for completion - timeout=1800 # 30 minute timeout -) -``` - -### Step 4: Validate Results - -**On Success** - Use `get_table_stats_and_schema` to verify tables (NOT manual SQL COUNT queries): -``` -# MCP Tool: get_table_stats_and_schema -get_table_stats_and_schema( - catalog="my_catalog", - schema="my_schema", - table_names=["bronze_orders", "silver_orders", "gold_daily_summary"] -) -# Returns schema, row counts, and column stats for all tables in one call -``` - -**On Failure** - Check `run_result["message"]` for suggested next steps, then get detailed errors: -``` -# MCP Tool: manage_pipeline -manage_pipeline(action="get", pipeline_id="") -# Returns pipeline details enriched with recent events and error messages - -# Or get events/logs directly: -# MCP Tool: manage_pipeline_run -manage_pipeline_run( - action="get_events", - pipeline_id="", - event_log_level="ERROR", # ERROR, WARN, or INFO - max_results=10 -) -``` - -### Step 5: Iterate Until Working - -1. Review errors from run result or `manage_pipeline(action="get")` -2. Fix issues in local files -3. Re-upload with `manage_workspace_files(action="upload")` -4. Run `manage_pipeline(action="create_or_update", start_run=True)` again (it will update, not recreate) -5. 
Repeat until `result["success"] == True` - ---- - -## Quick Reference: MCP Tools - -### manage_pipeline - Pipeline Lifecycle - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create` | Create new pipeline | name, root_path, catalog, schema, workspace_file_paths | -| `create_or_update` | **Main entry point.** Idempotent create/update, optionally run | name, root_path, catalog, schema, workspace_file_paths | -| `get` | Get pipeline details by ID | pipeline_id | -| `update` | Update pipeline config | pipeline_id + fields to change | -| `delete` | Delete a pipeline | pipeline_id | -| `find_by_name` | Find pipeline by name | name | - -**create_or_update options:** -- `start_run=True`: Automatically run after create/update -- `wait_for_completion=True`: Block until run finishes -- `full_refresh=True`: Reprocess all data (default) -- `timeout=1800`: Max wait time in seconds - -### manage_pipeline_run - Run Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `start` | Start pipeline update | pipeline_id | -| `get` | Get run status | pipeline_id, update_id | -| `stop` | Stop running pipeline | pipeline_id | -| `get_events` | Get events/logs for debugging | pipeline_id | - -**start options:** -- `wait=True`: Block until complete (default) -- `full_refresh=True`: Reprocess all data -- `validate_only=True`: Dry run without writing data -- `refresh_selection=["table1", "table2"]`: Refresh specific tables only - -**get_events options:** -- `event_log_level`: "ERROR", "WARN" (default), "INFO" -- `max_results`: Number of events (default 5) -- `update_id`: Filter to specific run - -### Supporting Tools - -| Tool | Description | -|------|-------------| -| `manage_workspace_files(action="upload")` | Upload files/folders to workspace | -| `get_table_stats_and_schema` | **Use this to validate tables** - returns schema, row counts, and stats in one call | -| `execute_sql` | Run ad-hoc SQL to inspect actual data content (not for row counts) | - ---- diff --git a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md index b637f469..44435cae 100644 --- a/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md +++ b/databricks-skills/databricks-spark-declarative-pipelines/references/3-advanced-configuration.md @@ -1,13 +1,13 @@ -# Advanced Pipeline Configuration (`extra_settings`) +# Advanced Pipeline Configuration -By default, pipelines are created with **serverless compute and Unity Catalog**. Use the `extra_settings` parameter only for advanced use cases. +By default, pipelines are created with **serverless compute and Unity Catalog**. Use advanced configuration options only when needed. -**CRITICAL: Do NOT use `extra_settings` to set `serverless=false` unless the user explicitly requires:** +**CRITICAL: Do NOT set `serverless=false` unless the user explicitly requires:** - R language support - Spark RDD APIs - JAR libraries or Maven coordinates -## When to Use `extra_settings` +## When to Use Advanced Configuration - **Development mode**: Faster iteration with relaxed validation - **Continuous pipelines**: Real-time streaming instead of triggered runs @@ -16,7 +16,9 @@ By default, pipelines are created with **serverless compute and Unity Catalog**. 
- **Python dependencies**: Install pip packages for serverless pipelines - **Classic clusters** (rare): Only if user explicitly needs R, RDD APIs, or JARs -## `extra_settings` Parameter Reference +## Pipeline JSON Configuration Reference + +These fields can be passed to `databricks pipelines create --json '{...}'` or `databricks pipelines update --json '{...}'`. ### Top-Level Fields @@ -157,198 +159,161 @@ Install pip dependencies for serverless pipelines: ## Configuration Examples +All examples use `databricks pipelines create --json '{...}'`. For updates, use `databricks pipelines update --json '{...}'`. + +> **Tagging**: By default, add `"tags": {"aidevkit_project": "ai-dev-kit"}` to track resources created with this skill. + ### Development Mode Pipeline -Use `manage_pipeline(action="create_or_update")` tool with: -- `name`: "my_dev_pipeline" -- `root_path`: "/Workspace/Users/user@example.com/my_pipeline" -- `catalog`: "dev_catalog" -- `schema`: "dev_schema" -- `workspace_file_paths`: [...] -- `start_run`: true -- `extra_settings`: -```json -{ - "development": true, - "tags": {"environment": "development", "owner": "data-team"} -} +```bash +databricks pipelines create --json '{ + "name": "my_dev_pipeline", + "catalog": "dev_catalog", + "schema": "dev_schema", + "serverless": true, + "development": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "tags": {"environment": "development", "owner": "data-team"} +}' ``` ### Non-Serverless with Dedicated Cluster -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "clusters": [{ - "label": "default", - "num_workers": 4, - "node_type_id": "i3.xlarge", - "custom_tags": {"cost_center": "analytics"} - }], - "photon": true, - "edition": "ADVANCED" -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": false, + "photon": true, + "edition": "ADVANCED", + "clusters": [{ + "label": "default", + "num_workers": 4, + "node_type_id": "i3.xlarge", + "custom_tags": {"cost_center": "analytics"} + }], + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}] +}' ``` ### Continuous Streaming Pipeline -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "continuous": true, - "configuration": { - "spark.sql.shuffle.partitions": "auto" - } -} -``` - -### Using Instance Pool - -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "clusters": [{ - "label": "default", - "instance_pool_id": "0727-104344-hauls13-pool-xyz", - "num_workers": 2, - "custom_tags": {"project": "analytics"} - }] -} -``` - -### Custom Event Log Location - -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "event_log": { - "catalog": "audit_catalog", - "schema": "pipeline_logs", - "name": "my_pipeline_events" - } -} +```bash +databricks pipelines create --json '{ + "name": "my_streaming_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "continuous": true, + "configuration": {"spark.sql.shuffle.partitions": "auto"}, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}] +}' ``` ### Pipeline with Email Notifications -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "notifications": [{ - "email_recipients": ["team@example.com", 
"oncall@example.com"], - "alerts": ["on-update-failure", "on-update-fatal-failure", "on-flow-failure"] - }] -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "notifications": [{ + "email_recipients": ["team@example.com", "oncall@example.com"], + "alerts": ["on-update-failure", "on-update-fatal-failure", "on-flow-failure"] + }] +}' ``` ### Production Pipeline with Autoscaling -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": false, - "development": false, - "photon": true, - "edition": "ADVANCED", - "clusters": [{ - "label": "default", - "autoscale": { - "min_workers": 2, - "max_workers": 8, - "mode": "ENHANCED" - }, - "node_type_id": "i3.xlarge", - "spark_conf": { - "spark.sql.adaptive.enabled": "true" - }, - "custom_tags": {"environment": "production"} - }], - "notifications": [{ - "email_recipients": ["data-team@example.com"], - "alerts": ["on-update-failure"] - }] -} +```bash +databricks pipelines create --json '{ + "name": "prod_pipeline", + "catalog": "prod_catalog", + "schema": "prod_schema", + "serverless": false, + "development": false, + "photon": true, + "edition": "ADVANCED", + "clusters": [{ + "label": "default", + "autoscale": {"min_workers": 2, "max_workers": 8, "mode": "ENHANCED"}, + "node_type_id": "i3.xlarge", + "spark_conf": {"spark.sql.adaptive.enabled": "true"}, + "custom_tags": {"environment": "production"} + }], + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "notifications": [{"email_recipients": ["data-team@example.com"], "alerts": ["on-update-failure"]}] +}' ``` -### Run as Service Principal +### Serverless with Python Dependencies -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "run_as": { - "service_principal_name": "00000000-0000-0000-0000-000000000000" - } -} +```bash +databricks pipelines create --json '{ + "name": "ml_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "environment": { + "dependencies": ["scikit-learn==1.3.0", "pandas>=2.0.0", "requests"] + } +}' ``` ### Continuous Pipeline with Restart Window -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "continuous": true, - "restart_window": { - "start_hour": 2, - "days_of_week": ["SATURDAY", "SUNDAY"], - "time_zone_id": "America/Los_Angeles" - } -} +```bash +databricks pipelines create --json '{ + "name": "realtime_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "continuous": true, + "libraries": [{"file": {"path": "/Workspace/Users/user@example.com/my_pipeline"}}], + "restart_window": { + "start_hour": 2, + "days_of_week": ["SATURDAY", "SUNDAY"], + "time_zone_id": "America/Los_Angeles" + } +}' ``` -### Serverless with Python Dependencies +### Custom Event Log Location -Use `manage_pipeline(action="create_or_update")` tool with `extra_settings`: -```json -{ - "serverless": true, - "environment": { - "dependencies": [ - "scikit-learn==1.3.0", - "pandas>=2.0.0", - "requests" - ] - } -} +```bash +databricks pipelines create --json '{ + "name": "my_pipeline", + "catalog": "my_catalog", + "schema": "my_schema", + "serverless": true, + "libraries": [{"file": {"path": 
"/Workspace/Users/user@example.com/my_pipeline"}}], + "event_log": { + "catalog": "audit_catalog", + "schema": "pipeline_logs", + "name": "my_pipeline_events" + } +}' ``` -### Update Existing Pipeline by ID +### Update Existing Pipeline -If you have a pipeline ID from the Databricks UI, you can force an update by including `id` in `extra_settings`: -```json -{ - "id": "554f4497-4807-4182-bff0-ffac4bb4f0ce" -} -``` +```bash +# Update pipeline configuration +databricks pipelines update --json '{ + "name": "updated_pipeline_name", + "development": false, + "notifications": [{"email_recipients": ["team@example.com"], "alerts": ["on-update-failure"]}] +}' -### Full JSON Export from Databricks UI - -You can copy pipeline settings from the Databricks UI (Pipeline Settings > JSON) and pass them directly as `extra_settings`. Invalid fields like `pipeline_type` are automatically filtered: - -```json -{ - "id": "554f4497-4807-4182-bff0-ffac4bb4f0ce", - "pipeline_type": "WORKSPACE", - "continuous": false, - "development": true, - "photon": false, - "edition": "ADVANCED", - "channel": "CURRENT", - "clusters": [{ - "label": "default", - "num_workers": 1, - "instance_pool_id": "0727-104344-pool-xyz" - }], - "configuration": { - "catalog": "main", - "schema": "my_schema" - } -} +# Then run it +databricks pipelines start-update --full-refresh ``` -**Note**: Explicit tool parameters (`name`, `root_path`, `catalog`, `schema`, `workspace_file_paths`) always take precedence over values in `extra_settings`. - --- ## Multi-Schema Patterns diff --git a/databricks-skills/databricks-synthetic-data-gen/SKILL.md b/databricks-skills/databricks-synthetic-data-gen/SKILL.md index c046e488..3e6f5b71 100644 --- a/databricks-skills/databricks-synthetic-data-gen/SKILL.md +++ b/databricks-skills/databricks-synthetic-data-gen/SKILL.md @@ -126,26 +126,30 @@ Show a clear specification with **the business story and your assumptions surfac **Do NOT proceed to code generation until user approves the plan, including the catalog.** -### Post-Generation Checklist +### Post-Generation Validation -After generating data, use `get_volume_folder_details` to validate the output matches requirements: -- Row counts match the plan -- Schema matches expected columns and types -- Data distributions look reasonable (check column stats) +Use `databricks experimental aitools tools query` to validate generated data (row counts, distributions, referential integrity). Query parquet files directly: -## Use Databricks Connect Spark + Faker Pattern +```bash +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT COUNT(*) FROM parquet.\`/Volumes/CATALOG/SCHEMA/raw_data/customers\` +" +``` + +See [references/2-troubleshooting.md](references/2-troubleshooting.md) for full validation examples. 
+ +## Use Databricks Connect Spark + Faker Pattern ```python -from databricks.connect import DatabricksSession, DatabricksEnv +from databricks.connect import DatabricksSession from pyspark.sql import functions as F from pyspark.sql.types import StringType import pandas as pd -# Setup serverless with dependencies (MUST list all libs used in UDFs) -env = DatabricksEnv().withDependencies("faker", "holidays") -spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() +# Setup serverless Spark session +spark = DatabricksSession.builder.serverless(True).getOrCreate() -# Pandas UDF pattern - import lib INSIDE the function +# Pandas UDF pattern - import lib INSIDE the function (libs must be installed locally) @F.pandas_udf(StringType()) def fake_name(ids: pd.Series) -> pd.Series: from faker import Faker # Import inside UDF @@ -248,9 +252,7 @@ uv pip install "databricks-connect>=16.4,<17.4" faker numpy pandas holidays | Issue | Solution | |-------|----------| -| `ImportError: cannot import name 'DatabricksEnv'` | Upgrade: `uv pip install "databricks-connect>=16.4"` | -| Python 3.11 instead of 3.12 | Python 3.12 required. Use `uv` to create env with correct version | -| `ModuleNotFoundError: faker` | Add to `withDependencies()`, import inside UDF | +| `ModuleNotFoundError: faker` | Install locally: `uv pip install faker`, import inside UDF | | Faker UDF is slow | Use `pandas_udf` for batch processing | | Out of memory | Increase `numPartitions` in `spark.range()` | | Referential integrity errors | Write master table to Delta first, read back for FK joins | diff --git a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md index 420b3500..793b64f7 100644 --- a/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md +++ b/databricks-skills/databricks-synthetic-data-gen/references/2-troubleshooting.md @@ -12,31 +12,16 @@ Common issues and solutions for synthetic data generation. | Mode | Solution | |------|----------| -| **DB Connect 16.4+** | Use `DatabricksEnv().withDependencies("faker", "pandas", ...)` | -| **Older DB Connect with Serverless** | Create job with `environments` parameter | -| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | +| **DB Connect with Serverless** | Install libs locally (`uv pip install faker`), use `DatabricksSession.builder.serverless(True)` | +| **Databricks Runtime** | Use Databricks CLI to install `faker holidays` | | **Classic cluster** | Use Databricks CLI to install libraries. `databricks libraries install --json '{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}'` | ```python -# For DB Connect 16.4+ -from databricks.connect import DatabricksSession, DatabricksEnv +# For DB Connect with serverless +from databricks.connect import DatabricksSession -env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") -spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() -``` - -### DatabricksEnv not found - -**Problem:** Using older databricks-connect version. 
- -**Solution:** Upgrade to 16.4+ or use job-based approach: - -```bash -# Upgrade (prefer uv, fall back to pip) -uv pip install "databricks-connect>=16.4,<17.4" -# or: pip install "databricks-connect>=16.4,<17.4" - -# Or use job with environments parameter instead +# Install dependencies locally first: uv pip install faker pandas numpy holidays +spark = DatabricksSession.builder.serverless(True).getOrCreate() ``` ### serverless_compute_id error @@ -300,25 +285,57 @@ resolution_hours = np.random.exponential(scale=resolution_scale[priority]) ## Validation Steps -After generation, verify your data: +After generation, validate using SQL queries via Databricks CLI: -```python -# 1. Check row counts -print(f"Customers: {customers_df.count():,}") -print(f"Orders: {orders_df.count():,}") - -# 2. Verify distributions -customers_df.groupBy("tier").count().show() -orders_df.describe("amount").show() - -# 3. Check referential integrity -orphans = orders_df.join( - customers_df, - orders_df.customer_id == customers_df.customer_id, - "left_anti" -) -print(f"Orphan orders: {orphans.count()}") +```bash +# Set your warehouse ID +WAREHOUSE_ID="your-warehouse-id" +VOLUME_PATH="/Volumes/CATALOG/SCHEMA/raw_data" -# 4. Verify date range -orders_df.select(F.min("order_date"), F.max("order_date")).show() +# 1. Check row counts +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT 'customers' as table_name, COUNT(*) as row_count FROM parquet.\`${VOLUME_PATH}/customers\` +UNION ALL +SELECT 'orders', COUNT(*) FROM parquet.\`${VOLUME_PATH}/orders\` +" + +# 2. Preview schema and sample data +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +DESCRIBE SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` +" + +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT * FROM parquet.\`${VOLUME_PATH}/customers\` LIMIT 5 +" + +# 3. Verify distributions +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT tier, COUNT(*) as count, ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER(), 1) as pct +FROM parquet.\`${VOLUME_PATH}/customers\` +GROUP BY tier ORDER BY tier +" + +# 4. Check amount statistics +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT + MIN(amount) as min_amount, + MAX(amount) as max_amount, + ROUND(AVG(amount), 2) as avg_amount, + ROUND(STDDEV(amount), 2) as stddev_amount +FROM parquet.\`${VOLUME_PATH}/orders\` +" + +# 5. Check referential integrity +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT COUNT(*) as orphan_orders +FROM parquet.\`${VOLUME_PATH}/orders\` o +LEFT JOIN parquet.\`${VOLUME_PATH}/customers\` c ON o.customer_id = c.customer_id +WHERE c.customer_id IS NULL +" + +# 6. 
Verify date range +databricks experimental aitools tools query --warehouse $WAREHOUSE_ID " +SELECT MIN(order_date) as min_date, MAX(order_date) as max_date +FROM parquet.\`${VOLUME_PATH}/orders\` +" ``` diff --git a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py index b9f953fa..b36edb8e 100644 --- a/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py +++ b/databricks-skills/databricks-synthetic-data-gen/scripts/generate_synthetic_data.py @@ -6,9 +6,9 @@ - Direct write to Unity Catalog - Works with serverless and classic compute -Auto-detects environment and uses: -- DatabricksEnv with managed dependencies if databricks-connect >= 16.4 (local) -- Standard session if running on Databricks Runtime or older databricks-connect +Prerequisites: +- Install dependencies locally: uv pip install faker pandas numpy holidays databricks-connect +- Configure ~/.databrickscfg with serverless_compute_id = auto """ import sys import os @@ -61,105 +61,23 @@ REGION_PROBS = [0.4, 0.25, 0.2, 0.15] # ============================================================================= -# ENVIRONMENT DETECTION AND SESSION CREATION +# SESSION CREATION # ============================================================================= -def is_databricks_runtime(): - """Check if running on Databricks Runtime vs locally.""" - return "DATABRICKS_RUNTIME_VERSION" in os.environ - -def get_databricks_connect_version(): - """Get databricks-connect version as (major, minor) tuple or None.""" - try: - import importlib.metadata - version_str = importlib.metadata.version('databricks-connect') - parts = version_str.split('.') - return (int(parts[0]), int(parts[1])) - except Exception: - return None - -# Detect environment -on_runtime = is_databricks_runtime() -db_version = get_databricks_connect_version() +from databricks.connect import DatabricksSession print("=" * 80) -print("ENVIRONMENT DETECTION") +print("CREATING SPARK SESSION") print("=" * 80) -print(f"Running on Databricks Runtime: {on_runtime}") -if db_version: - print(f"databricks-connect version: {db_version[0]}.{db_version[1]}") -else: - print("databricks-connect: not available") - -# Use DatabricksEnv with managed dependencies if: -# - Running locally (not on Databricks Runtime) -# - databricks-connect >= 16.4 -use_managed_deps = (not on_runtime) and db_version and db_version >= (16, 4) - -if use_managed_deps: - print("Using DatabricksEnv with managed dependencies") - print("=" * 80) - from databricks.connect import DatabricksSession, DatabricksEnv - - env = DatabricksEnv().withDependencies("faker", "pandas", "numpy", "holidays") - - if USE_SERVERLESS: - spark = DatabricksSession.builder.withEnvironment(env).serverless(True).getOrCreate() - print("Connected to serverless compute with managed dependencies!") - else: - if not CLUSTER_ID: - raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") - spark = DatabricksSession.builder.withEnvironment(env).clusterId(CLUSTER_ID).getOrCreate() - print(f"Connected to cluster with managed dependencies!") -else: - print("Using standard session (dependencies must be pre-installed)") - print("=" * 80) - - # Check that UDF dependencies are available - print("\nChecking UDF dependencies...") - missing_deps = [] - - try: - from faker import Faker - print(" faker: OK") - except ImportError: - missing_deps.append("faker") - print(" faker: MISSING") - - try: - import pandas as pd - 
print(" pandas: OK") - except ImportError: - missing_deps.append("pandas") - print(" pandas: MISSING") - - if missing_deps: - print("\n" + "=" * 80) - print("ERROR: Missing dependencies for UDFs") - print("=" * 80) - print(f"Missing: {', '.join(missing_deps)}") - if on_runtime: - print('\nSolution: Install libraries via Databricks CLI:') - print(' databricks libraries install --json \'{"cluster_id": "", "libraries": [{"pypi": {"package": "faker"}}, {"pypi": {"package": "holidays"}}]}\'') - else: - print("\nSolution: Upgrade to databricks-connect >= 16.4 for managed deps") - print(" Or create a job with environment settings") - print("=" * 80) - sys.exit(1) - print("\nAll dependencies available") - print("=" * 80) - - from databricks.connect import DatabricksSession - - if USE_SERVERLESS: - spark = DatabricksSession.builder.serverless(True).getOrCreate() - print("Connected to serverless compute") - else: - if not CLUSTER_ID: - raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") - spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() - print(f"Connected to cluster ") +if USE_SERVERLESS: + spark = DatabricksSession.builder.serverless(True).getOrCreate() + print("Connected to serverless compute") +else: + if not CLUSTER_ID: + raise ValueError("CLUSTER_ID must be set when USE_SERVERLESS=False") + spark = DatabricksSession.builder.clusterId(CLUSTER_ID).getOrCreate() + print(f"Connected to cluster {CLUSTER_ID}") # Import Faker for UDF definitions from faker import Faker @@ -260,10 +178,6 @@ def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: customers_df.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/customers") print(f" Saved customers to {VOLUME_PATH}/customers") -# Show tier distribution -print("\n Tier distribution:") -customers_df.groupBy("tier").count().orderBy("tier").show() - # ============================================================================= # GENERATE ORDERS (Child Table with Referential Integrity) # ============================================================================= @@ -366,10 +280,6 @@ def generate_lognormal_amount(tiers: pd.Series) -> pd.Series: orders_final.write.mode(WRITE_MODE).parquet(f"{VOLUME_PATH}/orders") print(f" Saved orders to {VOLUME_PATH}/orders") -# Show status distribution -print("\n Status distribution:") -orders_final.groupBy("status").count().orderBy("status").show() - # ============================================================================= # CLEANUP AND SUMMARY # ============================================================================= diff --git a/databricks-skills/databricks-unity-catalog/5-system-tables.md b/databricks-skills/databricks-unity-catalog/5-system-tables.md index e8c9d95f..0fbdb46d 100644 --- a/databricks-skills/databricks-unity-catalog/5-system-tables.md +++ b/databricks-skills/databricks-unity-catalog/5-system-tables.md @@ -48,12 +48,11 @@ w.system_schemas.enable( **CLI:** ```bash -# List system schemas -databricks system-schemas list --metastore-id your-metastore-id +# List system schemas (METASTORE_ID is positional) +databricks system-schemas list your-metastore-id -# Enable system schema -databricks system-schemas enable --metastore-id your-metastore-id \ - --schema-name access +# Enable system schema (METASTORE_ID and SCHEMA_NAME are positional) +databricks system-schemas enable your-metastore-id access ``` --- diff --git a/databricks-skills/databricks-unity-catalog/6-volumes.md b/databricks-skills/databricks-unity-catalog/6-volumes.md index 497b6090..98166ec6 100644 
--- a/databricks-skills/databricks-unity-catalog/6-volumes.md +++ b/databricks-skills/databricks-unity-catalog/6-volumes.md @@ -37,18 +37,18 @@ All volume operations use the path format: --- -## MCP Tools - -| Tool | Usage | -|------|-------| -| `list_volume_files` | `list_volume_files(volume_path="/Volumes/catalog/schema/volume/path/")` | -| `get_volume_folder_details` | `get_volume_folder_details(volume_path="catalog/schema/volume/path", format="parquet")` - schema, row counts, stats | -| `upload_to_volume` | `upload_to_volume(local_path="/tmp/data/*", volume_path="/Volumes/.../dest")` - supports files, folders, globs | -| `download_from_volume` | `download_from_volume(volume_path="/Volumes/.../file.csv", local_path="/tmp/file.csv")` | -| `create_volume_directory` | `create_volume_directory(volume_path="/Volumes/.../new_folder")` - creates parents like `mkdir -p` | -| `delete_volume_file` | `delete_volume_file(volume_path="/Volumes/.../file.csv")` | -| `delete_volume_directory` | `delete_volume_directory(volume_path="/Volumes/.../folder")` - directory must be empty | -| `get_volume_file_info` | `get_volume_file_info(volume_path="/Volumes/.../file.csv")` - returns size, modified date | +## CLI Commands + +`databricks fs` requires the `dbfs:` scheme prefix for UC Volume paths โ€” without it the CLI treats the path as local filesystem and fails with `no such directory`. + +| Command | Description | +|---------|-------------| +| `databricks fs ls dbfs:/Volumes/catalog/schema/volume/path/` | List files in a volume | +| `databricks fs cp -r --overwrite /tmp/data dbfs:/Volumes/.../dest` | Upload a directory's contents to a volume | +| `databricks fs cp dbfs:/Volumes/.../file.csv /tmp/file.csv` | Download a file from a volume | +| `databricks fs mkdirs dbfs:/Volumes/.../new_folder` | Create directory (like `mkdir -p`) | +| `databricks fs rm dbfs:/Volumes/.../file.csv` | Delete file | +| `databricks fs rm -r dbfs:/Volumes/.../folder` | Delete directory recursively | --- diff --git a/databricks-skills/databricks-unity-catalog/7-data-profiling.md b/databricks-skills/databricks-unity-catalog/7-data-profiling.md index 23a2b62f..cf6c3ec1 100644 --- a/databricks-skills/databricks-unity-catalog/7-data-profiling.md +++ b/databricks-skills/databricks-unity-catalog/7-data-profiling.md @@ -36,55 +36,42 @@ Supported `AggregationGranularity` values: `AGGREGATION_GRANULARITY_5_MINUTES`, --- -## MCP Tools +## CLI & SQL Commands -Use the `manage_uc_monitors` tool for all monitor operations: +### Create a Monitor (SQL) -| Action | Description | -|--------|-------------| -| `create` | Create a quality monitor on a table | -| `get` | Get monitor details and status | -| `run_refresh` | Trigger a metric refresh | -| `list_refreshes` | List refresh history | -| `delete` | Delete the monitor (assets are not deleted) | - -### Create a Monitor +```sql +CREATE OR REPLACE QUALITY MONITOR catalog.schema.my_table +OPTIONS ( + OUTPUT_SCHEMA 'catalog.schema' +); +``` -> **Note:** The MCP tool currently only creates **snapshot** monitors. For TimeSeries or InferenceLog monitors, use the Python SDK directly (see below). 
+### Get Monitor Status (SQL) -```python -manage_uc_monitors( - action="create", - table_name="catalog.schema.my_table", - output_schema_name="catalog.schema", -) +```sql +DESCRIBE QUALITY MONITOR catalog.schema.my_table; ``` -### Get Monitor Status +### Trigger a Refresh (SQL) -```python -manage_uc_monitors( - action="get", - table_name="catalog.schema.my_table", -) +```sql +REFRESH QUALITY MONITOR catalog.schema.my_table; ``` -### Trigger a Refresh +### Delete a Monitor (SQL) -```python -manage_uc_monitors( - action="run_refresh", - table_name="catalog.schema.my_table", -) +```sql +DROP QUALITY MONITOR catalog.schema.my_table; ``` -### Delete a Monitor +### Execute via CLI -```python -manage_uc_monitors( - action="delete", - table_name="catalog.schema.my_table", -) +```bash +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " +CREATE OR REPLACE QUALITY MONITOR catalog.schema.my_table +OPTIONS (OUTPUT_SCHEMA 'catalog.schema') +" ``` --- @@ -300,7 +287,7 @@ LIMIT 100; --- > **Note:** Data profiling was formerly known as Lakehouse Monitoring. The legacy SDK accessor -> `w.lakehouse_monitors` and the MCP tool `manage_uc_monitors` still use the previous API. +> `w.lakehouse_monitors` still uses the previous API. Use `w.data_quality` for the new API. ## Resources diff --git a/databricks-skills/databricks-unity-catalog/SKILL.md b/databricks-skills/databricks-unity-catalog/SKILL.md index 2e3d05fa..177eb953 100644 --- a/databricks-skills/databricks-unity-catalog/SKILL.md +++ b/databricks-skills/databricks-unity-catalog/SKILL.md @@ -29,15 +29,43 @@ Use this skill when: ## Quick Start -### Volume File Operations (MCP Tools) +### Create Unity Catalog Objects (CLI) -| Tool | Usage | -|------|-------| -| `list_volume_files` | `list_volume_files(volume_path="/Volumes/catalog/schema/volume/path/")` | -| `get_volume_folder_details` | `get_volume_folder_details(volume_path="catalog/schema/volume/path", format="parquet")` - schema, row counts, stats | -| `upload_to_volume` | `upload_to_volume(local_path="/tmp/data/*", volume_path="/Volumes/.../dest")` | -| `download_from_volume` | `download_from_volume(volume_path="/Volumes/.../file.csv", local_path="/tmp/file.csv")` | -| `create_volume_directory` | `create_volume_directory(volume_path="/Volumes/.../new_folder")` | +**IMPORTANT**: Use `--json` for creating UC objects. Positional args vary by command and version. + +```bash +# Create a catalog +databricks catalogs create my_catalog + +# Create a schema (args: NAME CATALOG_NAME โ€” positional, name first) +databricks schemas create my_schema my_catalog + +# Create a volume (args: CATALOG_NAME SCHEMA_NAME NAME VOLUME_TYPE โ€” catalog first) +databricks volumes create my_catalog my_schema my_volume MANAGED + +# List catalogs, schemas, volumes +databricks catalogs list +databricks schemas list my_catalog +databricks volumes list my_catalog.my_schema +``` + +### Volume File Operations (CLI) + +`databricks fs` requires the `dbfs:` scheme prefix even for UC Volume paths โ€” without it the CLI treats the path as local filesystem and errors with `no such directory`. 
+ +```bash +# List files in a volume +databricks fs ls dbfs:/Volumes/catalog/schema/volume/path/ + +# Upload a directory's contents to a volume (-r copies contents, not the directory itself) +databricks fs cp -r --overwrite /tmp/data dbfs:/Volumes/catalog/schema/volume/dest + +# Download a file from a volume +databricks fs cp dbfs:/Volumes/catalog/schema/volume/file.csv /tmp/file.csv + +# Create a directory in a volume +databricks fs mkdirs dbfs:/Volumes/catalog/schema/volume/new_folder +``` ### Enable System Tables Access @@ -71,20 +99,17 @@ WHERE usage_date >= current_date() - 30 GROUP BY workspace_id, sku_name; ``` -## MCP Tool Integration +## SQL Queries via CLI -Use `mcp__databricks__execute_sql` for system table queries: +Use `databricks experimental aitools tools query` for system table queries: -```python -# Query lineage -mcp__databricks__execute_sql( - sql_query=""" - SELECT source_table_full_name, target_table_full_name - FROM system.access.table_lineage - WHERE event_date >= current_date() - 7 - """, - catalog="system" -) +```bash +# Query lineage via CLI +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " + SELECT source_table_full_name, target_table_full_name + FROM system.access.table_lineage + WHERE event_date >= current_date() - 7 +" ``` ## Best Practices diff --git a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md index 92322fd0..1a1a6368 100644 --- a/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md +++ b/databricks-skills/databricks-unstructured-pdf-generation/SKILL.md @@ -7,331 +7,112 @@ description: "Generate PDF documents from HTML and upload to Unity Catalog volum Convert HTML content to PDF documents and upload them to Unity Catalog Volumes. -## Overview +## Workflow -The `generate_and_upload_pdf` MCP tool converts HTML to PDF and uploads to a Unity Catalog Volume. You (the LLM) generate the HTML content, and the tool handles conversion and upload. +1. Write HTML files to `./raw_data/html/` (write multiple files in parallel for speed) +2. Convert HTML โ†’ PDF using `/scripts/pdf_generator.py` (parallel conversion) +3. Upload PDFs to Unity Catalog volume using `databricks fs cp` +4. Generate `doc_questions.json` with test questions for each document -## Tool Signature +> **Path convention:** `` below = the directory containing this SKILL.md. Resolve to the absolute install path (e.g. `~/.claude/skills/databricks-unstructured-pdf-generation`). `./raw_data/...` paths are relative to your own project cwd. -``` -generate_and_upload_pdf( - html_content: str, # Complete HTML document - filename: str, # PDF filename (e.g., "report.pdf") - catalog: str, # Unity Catalog name - schema: str, # Schema name - volume: str = "raw_data", # Volume name (default: "raw_data") - folder: str = None, # Optional subfolder -) -``` +## Dependencies -**Returns:** -```json -{ - "success": true, - "volume_path": "/Volumes/catalog/schema/volume/filename.pdf", - "error": null -} +```bash +uv pip install plutoprint ``` -## Quick Start - -Generate a simple PDF: +## Step 1: Write HTML Files -``` -generate_and_upload_pdf( - html_content=''' - - - - - -

-<h1>Quarterly Report Q1 2024</h1>
-<h2>Executive Summary</h2>
-<p>Revenue increased 15% year-over-year...</p>
- -''', - filename="q1_report.pdf", - catalog="my_catalog", - schema="my_schema" -) +```bash +mkdir -p ./raw_data/html ``` -## Performance: Generate Multiple PDFs in Parallel +Write HTML documents to `./raw_data/html/filename.html`. Use subdirectories to organize (structure is preserved). -**IMPORTANT**: PDF generation and upload can take 2-5 seconds per document. When generating multiple PDFs, **call the tool in parallel** to maximize throughput. +## Step 2: Convert to PDF -### Example: Generate 5 PDFs in Parallel - -Make 5 simultaneous `generate_and_upload_pdf` calls: - -``` -# Call 1 -generate_and_upload_pdf( - html_content="...Employee Handbook content...", - filename="employee_handbook.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 2 (parallel) -generate_and_upload_pdf( - html_content="...Leave Policy content...", - filename="leave_policy.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 3 (parallel) -generate_and_upload_pdf( - html_content="...Code of Conduct content...", - filename="code_of_conduct.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 4 (parallel) -generate_and_upload_pdf( - html_content="...Benefits Guide content...", - filename="benefits_guide.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) - -# Call 5 (parallel) -generate_and_upload_pdf( - html_content="...Remote Work Policy content...", - filename="remote_work_policy.pdf", - catalog="hr_catalog", schema="policies", folder="2024" -) +```bash +# Convert entire folder (parallel, 4 workers) +python /scripts/pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf ``` -By calling these in parallel (not sequentially), 5 PDFs that would take 15-25 seconds sequentially complete in 3-5 seconds total. +Skips files where PDF exists and is newer than HTML. Use `--force` to reconvert all. -## HTML Best Practices +## Step 3: Upload to Volume -### Use Complete HTML5 Structure +`databricks fs` requires the `dbfs:` scheme prefix even for UC Volume paths. `-r` copies the *contents* of the source directory into the target (the source directory name is not preserved), so files land directly under `raw_data/`. -Always include the full HTML structure: - -```html - - - - - - - - - +```bash +databricks fs cp -r --overwrite ./raw_data/pdf dbfs:/Volumes/my_catalog/my_schema/raw_data ``` -### CSS Features Supported - -PlutoPrint supports modern CSS3: -- Flexbox and Grid layouts -- CSS variables (`--var-name`) -- Web fonts (system fonts recommended) -- Colors, backgrounds, borders -- Tables with styling - -### CSS to Avoid - -- Animations and transitions (static PDF) -- Interactive elements (forms, hover effects) -- External resources (images via URL) - use embedded base64 if needed - -### Professional Document Template - -```html - - - - - - -
-    <h1>Document Title</h1>
-
-    <h2>Section 1</h2>
-    <p>Content here...</p>
-
-    <div class="highlight">
-        <strong>Important:</strong> Key information highlighted here.
-    </div>
-
-    <h2>Data Table</h2>
-    <table>
-        <tr><th>Column 1</th><th>Column 2</th><th>Column 3</th></tr>
-        <tr><td>Data</td><td>Data</td><td>Data</td></tr>
-    </table>
- - - - +## Step 4: Generate Test Questions + +Create `./raw_data/pdf/pdf_eval_questions.json` with questions for Knowledge Assistant evaluation or MAS: + +```json +{ + "api_errors_guide.pdf": { + "question": "What is the solution for error ERR-4521?", + "expected_fact": "Call /api/v2/auth/refresh with refresh_token before the 3600s TTL expires" + }, + "installation_manual.pdf": { + "question": "What port does the service use by default?", + "expected_fact": "Port 8443 for HTTPS, configurable via CONFIG_PORT environment variable" + } +} ``` -## Common Patterns +This JSON can be used to build KA test cases and validate retrieval accuracy. -### Pattern 1: Technical Documentation +## Document Content Guidelines -Generate API documentation, user guides, or technical specs: +When generating documents for Knowledge Assistant testing or demos: -``` -generate_and_upload_pdf( - html_content=''' - - - -
-    <h1>API Reference</h1>
-
-    <div class="endpoint">GET /api/v1/users</div>
-    <p>Returns a list of all users.</p>
-
-    <h2>Request Headers</h2>
-    <pre>
-Authorization: Bearer {token}
-Content-Type: application/json
-    </pre>
- -''', - filename="api_reference.pdf", - catalog="docs_catalog", - schema="api_docs" -) -``` +- **Multi-page documents**: Each PDF should be several pages with substantial content +- **Specific error codes and solutions**: Include product-specific error codes, causes, and resolution steps +- **Technical details**: API endpoints, configuration parameters, version numbers, specific commands +- **Simple CSS**: Keep styling minimal for fast HTML creation and reliable PDF conversion +- **Queryable facts**: Include details a KA must read the document to answer (not general knowledge) -### Pattern 2: Business Reports +**Good document types:** +- Product user manuals with troubleshooting sections +- API error reference guides (error codes, causes, solutions) +- Installation/configuration guides with specific steps +- Technical specifications with version-specific details -``` -generate_and_upload_pdf( - html_content=''' - - - -
-    <h1>Q1 2024 Performance Report</h1>
-
-    <div class="metric">
-        <div class="metric-value">$2.4M</div>
-        <div class="metric-label">Revenue</div>
-    </div>
-    <div class="metric">
-        <div class="metric-value">+15%</div>
-        <div class="metric-label">Growth</div>
-    </div>
- -''', - filename="q1_2024_report.pdf", - catalog="finance", - schema="reports", - folder="quarterly" -) -``` +**Example content:** Instead of generic "Connection failed" errors, write: +- "Error ERR-4521: OAuth token expired. Cause: Token TTL exceeded 3600s default. Solution: Call `/api/v2/auth/refresh` with your refresh_token before expiration. See Section 4.2 for token lifecycle management." -### Pattern 3: HR Policies +## CLI Reference ``` -generate_and_upload_pdf( - html_content=''' - - - -
-    <h1>Employee Leave Policy</h1>
-    <p>Effective: January 1, 2024</p>
-
-    <h2>1. Annual Leave</h2>
-    <p>All full-time employees are entitled to 20 days of paid annual leave per calendar year.</p>
-
-    <div class="note">
-        <strong>Note:</strong> Leave requests must be submitted at least 2 weeks in advance.
-    </div>
- -''', - filename="leave_policy.pdf", - catalog="hr_catalog", - schema="policies" -) -``` - -## Workflow for Multiple Documents +python /scripts/pdf_generator.py convert [OPTIONS] -When asked to generate multiple PDFs: + --input, -i Input HTML file or folder (required) + --output, -o Output folder for PDFs (required) + --force, -f Force reconvert (ignore timestamps) + --workers, -w Parallel workers (default: 4) +``` -1. **Plan the documents**: Determine titles, content structure for each -2. **Generate HTML for each**: Create complete HTML documents -3. **Call tool in parallel**: Make multiple simultaneous `generate_and_upload_pdf` calls -4. **Report results**: Summarize successful uploads and any errors +## Folder Structure -## Prerequisites +Subfolder structure is preserved: -- Unity Catalog schema must exist -- Volume must exist (default: `raw_data`) -- User must have WRITE permission on the volume +``` +./raw_data/html/ ./raw_data/pdf/ +โ”œโ”€โ”€ report.html โ†’ โ”œโ”€โ”€ report.pdf +โ”œโ”€โ”€ quarterly/ โ”œโ”€โ”€ quarterly/ +โ”‚ โ””โ”€โ”€ q1.html โ†’ โ”‚ โ””โ”€โ”€ q1.pdf +โ””โ”€โ”€ legal/ โ””โ”€โ”€ legal/ + โ””โ”€โ”€ terms.html โ†’ โ””โ”€โ”€ terms.pdf +``` ## Troubleshooting | Issue | Solution | |-------|----------| -| "Volume does not exist" | Create the volume first or use an existing one | -| "Schema does not exist" | Create the schema or check the name | -| PDF looks wrong | Check HTML/CSS syntax, use supported CSS features | -| Slow generation | Call multiple PDFs in parallel, not sequentially | +| "plutoprint not installed" | `uv pip install plutoprint` | +| PDF looks wrong | Check HTML/CSS syntax | +| "Volume does not exist" | `databricks volumes create CATALOG SCHEMA VOLUME_NAME MANAGED` (four separate positional args, not `catalog.schema.volume`) | diff --git a/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py b/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py new file mode 100644 index 00000000..e7808d13 --- /dev/null +++ b/databricks-skills/databricks-unstructured-pdf-generation/scripts/pdf_generator.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +""" +PDF Generator - Convert HTML files to PDF locally. 
+ +Usage: + # Convert single file + python pdf_generator.py convert --input ./raw_data/html/report.html --output ./raw_data/pdf + + # Convert entire folder (parallel) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf + + # Force reconvert (ignore timestamps) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf --force + +Requires: plutoprint + uv / pip install plutoprint +""" + +import argparse +import logging +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +logging.basicConfig(level=logging.INFO, format="%(message)s") +logger = logging.getLogger(__name__) + +MAX_WORKERS = 4 + + +@dataclass +class ConversionResult: + """Result from converting HTML to PDF.""" + html_path: str + pdf_path: Optional[str] = None + success: bool = False + skipped: bool = False + error: Optional[str] = None + + def to_dict(self) -> dict: + return { + "html_path": self.html_path, + "pdf_path": self.pdf_path, + "success": self.success, + "skipped": self.skipped, + "error": self.error, + } + + +@dataclass +class BatchResult: + """Result from batch conversion.""" + total: int = 0 + converted: int = 0 + skipped: int = 0 + failed: int = 0 + results: list = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "total": self.total, + "converted": self.converted, + "skipped": self.skipped, + "failed": self.failed, + "results": [r.to_dict() for r in self.results], + } + + +def _needs_conversion(html_path: Path, pdf_path: Path) -> bool: + """Check if HTML needs to be converted (PDF missing or older than HTML). + + Args: + html_path: Path to HTML file + pdf_path: Path to output PDF file + + Returns: + True if conversion needed, False if PDF is up-to-date + """ + if not pdf_path.exists(): + return True + + html_mtime = html_path.stat().st_mtime + pdf_mtime = pdf_path.stat().st_mtime + + return html_mtime > pdf_mtime + + +def convert_html_to_pdf( + html_path: Path, + pdf_path: Path, + force: bool = False, +) -> ConversionResult: + """Convert a single HTML file to PDF. + + Args: + html_path: Path to HTML file + pdf_path: Path to output PDF file + force: If True, convert even if PDF is up-to-date + + Returns: + ConversionResult with success/skip/error status + """ + result = ConversionResult(html_path=str(html_path)) + + # Check if conversion needed + if not force and not _needs_conversion(html_path, pdf_path): + result.skipped = True + result.success = True + result.pdf_path = str(pdf_path) + logger.debug(f"Skipped (up-to-date): {html_path.name}") + return result + + # Ensure output directory exists + pdf_path.parent.mkdir(parents=True, exist_ok=True) + + try: + import plutoprint + + # Read HTML content + html_content = html_path.read_text(encoding="utf-8") + + # Convert to PDF + book = plutoprint.Book(plutoprint.PAGE_SIZE_A4) + book.load_html(html_content) + book.write_to_pdf(str(pdf_path)) + + if pdf_path.exists(): + result.success = True + result.pdf_path = str(pdf_path) + logger.info(f"Converted: {html_path.name} -> {pdf_path.name}") + else: + result.error = "PDF file not created" + logger.error(f"Failed: {html_path.name} - PDF not created") + + except ImportError: + result.error = "plutoprint not installed. 
Run: pip install plutoprint" + logger.error(result.error) + except Exception as e: + result.error = str(e) + logger.error(f"Failed: {html_path.name} - {e}") + + return result + + +def convert_folder( + input_dir: Path, + output_dir: Path, + force: bool = False, + max_workers: int = MAX_WORKERS, +) -> BatchResult: + """Convert all HTML files in a folder to PDF (parallel). + + Preserves subfolder structure from input to output. + + Args: + input_dir: Directory containing HTML files + output_dir: Directory for output PDF files + force: If True, convert even if PDFs are up-to-date + max_workers: Number of parallel workers (default: 4) + + Returns: + BatchResult with counts and per-file results + """ + batch = BatchResult() + + # Find all HTML files + html_files = list(input_dir.rglob("*.html")) + batch.total = len(html_files) + + if batch.total == 0: + logger.warning(f"No HTML files found in {input_dir}") + return batch + + logger.info(f"Found {batch.total} HTML file(s) in {input_dir}") + + def process_file(html_path: Path) -> ConversionResult: + # Compute relative path to preserve folder structure + relative_path = html_path.relative_to(input_dir) + pdf_relative = relative_path.with_suffix(".pdf") + pdf_path = output_dir / pdf_relative + + return convert_html_to_pdf(html_path, pdf_path, force=force) + + # Process files in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = {executor.submit(process_file, f): f for f in html_files} + + for future in as_completed(futures): + result = future.result() + batch.results.append(result) + + if result.skipped: + batch.skipped += 1 + elif result.success: + batch.converted += 1 + else: + batch.failed += 1 + + logger.info(f"Done: {batch.converted} converted, {batch.skipped} skipped, {batch.failed} failed") + return batch + + +def main(): + """CLI entry point.""" + parser = argparse.ArgumentParser( + description="Convert HTML files to PDF", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Convert single file + python pdf_generator.py convert --input ./raw_data/html/report.html --output ./raw_data/pdf + + # Convert entire folder (parallel) + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf + + # Force reconvert all + python pdf_generator.py convert --input ./raw_data/html --output ./raw_data/pdf --force + """, + ) + + subparsers = parser.add_subparsers(dest="command", help="Commands") + + # Convert command + conv_parser = subparsers.add_parser("convert", help="Convert HTML to PDF") + conv_parser.add_argument("--input", "-i", required=True, help="Input HTML file or folder") + conv_parser.add_argument("--output", "-o", required=True, help="Output folder for PDFs") + conv_parser.add_argument("--force", "-f", action="store_true", help="Force reconvert (ignore timestamps)") + conv_parser.add_argument("--workers", "-w", type=int, default=MAX_WORKERS, help=f"Parallel workers (default: {MAX_WORKERS})") + + args = parser.parse_args() + + if args.command == "convert": + input_path = Path(args.input) + output_path = Path(args.output) + + if not input_path.exists(): + print(f"Error: Input path does not exist: {input_path}") + sys.exit(1) + + if input_path.is_file(): + # Single file conversion + if not input_path.suffix.lower() == ".html": + print(f"Error: Input file must be .html: {input_path}") + sys.exit(1) + + pdf_path = output_path / input_path.with_suffix(".pdf").name + result = convert_html_to_pdf(input_path, pdf_path, force=args.force) + + if result.skipped: + 
print(f"Skipped (up-to-date): {result.pdf_path}") + elif result.success: + print(f"Converted: {result.pdf_path}") + else: + print(f"Error: {result.error}") + sys.exit(1) + else: + # Folder conversion + batch = convert_folder( + input_path, + output_path, + force=args.force, + max_workers=args.workers, + ) + + print(f"\nSummary: {batch.converted} converted, {batch.skipped} skipped, {batch.failed} failed") + if batch.failed > 0: + sys.exit(1) + else: + parser.print_help() + sys.exit(1) + + +if __name__ == "__main__": + main() diff --git a/databricks-skills/databricks-vector-search/SKILL.md b/databricks-skills/databricks-vector-search/SKILL.md index 72068ec5..2bb84d84 100644 --- a/databricks-skills/databricks-vector-search/SKILL.md +++ b/databricks-skills/databricks-vector-search/SKILL.md @@ -268,28 +268,22 @@ scan_result = w.vector_search_indexes.scan_index( ```bash # List endpoints -databricks vector-search endpoints list +databricks vector-search-endpoints list-endpoints -# Create endpoint -databricks vector-search endpoints create \ - --name my-endpoint \ - --endpoint-type STANDARD +# Create endpoint (positional args: NAME ENDPOINT_TYPE) +databricks vector-search-endpoints create-endpoint my-endpoint STANDARD -# List indexes on endpoint -databricks vector-search indexes list-indexes \ - --endpoint-name my-endpoint +# List indexes on endpoint (positional arg: ENDPOINT_NAME) +databricks vector-search-indexes list-indexes my-endpoint -# Get index status -databricks vector-search indexes get-index \ - --index-name catalog.schema.my_index +# Get index status (positional arg: INDEX_NAME) +databricks vector-search-indexes get-index catalog.schema.my_index -# Sync index (for TRIGGERED) -databricks vector-search indexes sync-index \ - --index-name catalog.schema.my_index +# Sync index (positional arg: INDEX_NAME) +databricks vector-search-indexes sync-index catalog.schema.my_index -# Delete index -databricks vector-search indexes delete-index \ - --index-name catalog.schema.my_index +# Delete index (positional arg: INDEX_NAME) +databricks vector-search-indexes delete-index catalog.schema.my_index ``` ## Common Issues @@ -302,7 +296,7 @@ databricks vector-search indexes delete-index \ | **Embedding dimension mismatch** | Ensure query and index dimensions match | | **Index not updating** | Check pipeline_type; use sync_index() for TRIGGERED | | **Out of capacity** | Upgrade to Storage-Optimized (1B+ vectors) | -| **`query_vector` truncated by MCP tool** | MCP tool calls serialize arrays as JSON and can truncate large vectors (e.g. 1024-dim). Use `query_text` instead (for managed embedding indexes), or use the Databricks SDK/CLI to pass raw vectors | +| **`query_vector` truncated** | Large vectors (e.g. 1024-dim) can be truncated when serialized as JSON. Use `query_text` instead (for managed embedding indexes), or use the Databricks SDK to pass raw vectors | ## Embedding Models @@ -323,112 +317,6 @@ embedding_source_columns=[ ] ``` -## MCP Tools - -The following MCP tools are available for managing Vector Search infrastructure. For a full end-to-end walkthrough, see [end-to-end-rag.md](end-to-end-rag.md). - -### manage_vs_endpoint - Endpoint Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create endpoint (STANDARD or STORAGE_OPTIMIZED). 
Idempotent | name | -| `get` | Get endpoint details | name | -| `list` | List all endpoints | (none) | -| `delete` | Delete endpoint (indexes must be deleted first) | name | - -```python -# Create or update an endpoint -result = manage_vs_endpoint(action="create_or_update", name="my-vs-endpoint", endpoint_type="STANDARD") -# Returns {"name": "my-vs-endpoint", "endpoint_type": "STANDARD", "created": True} - -# List all endpoints -endpoints = manage_vs_endpoint(action="list") - -# Get specific endpoint -endpoint = manage_vs_endpoint(action="get", name="my-vs-endpoint") -``` - -### manage_vs_index - Index Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create index. Idempotent, auto-triggers sync for DELTA_SYNC | name, endpoint_name, primary_key | -| `get` | Get index details | name | -| `list` | List indexes. Optional endpoint_name filter | (none) | -| `delete` | Delete index | name | - -```python -# Create a Delta Sync index with managed embeddings -result = manage_vs_index( - action="create_or_update", - name="catalog.schema.my_index", - endpoint_name="my-vs-endpoint", - primary_key="id", - index_type="DELTA_SYNC", - delta_sync_index_spec={ - "source_table": "catalog.schema.docs", - "embedding_source_columns": [{"name": "content", "embedding_model_endpoint_name": "databricks-gte-large-en"}], - "pipeline_type": "TRIGGERED" - } -) - -# Get a specific index -index = manage_vs_index(action="get", name="catalog.schema.my_index") - -# List all indexes on an endpoint -indexes = manage_vs_index(action="list", endpoint_name="my-vs-endpoint") - -# List all indexes across all endpoints -all_indexes = manage_vs_index(action="list") -``` - -### query_vs_index - Query (Hot Path) - -Query index with `query_text`, `query_vector`, or hybrid (`query_type="HYBRID"`). Prefer `query_text` over `query_vector` โ€” MCP tool calls can truncate large embedding arrays (1024-dim). 
- -```python -# Query an index -results = query_vs_index( - index_name="catalog.schema.my_index", - columns=["id", "content"], - query_text="machine learning best practices", - num_results=5 -) - -# Hybrid search (combines vector + keyword) -results = query_vs_index( - index_name="catalog.schema.my_index", - columns=["id", "content"], - query_text="SPARK-12345 memory error", - query_type="HYBRID", - num_results=10 -) -``` - -### manage_vs_data - Data Operations - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `upsert` | Insert/update records | index_name, inputs_json | -| `delete` | Delete by primary key | index_name, primary_keys | -| `scan` | Scan index contents | index_name | -| `sync` | Trigger sync for TRIGGERED indexes | index_name | - -```python -# Upsert data into a Direct Access index -manage_vs_data( - action="upsert", - index_name="catalog.schema.my_index", - inputs_json=[{"id": "doc1", "content": "...", "embedding": [0.1, 0.2, ...]}] -) - -# Trigger manual sync for a TRIGGERED pipeline index -manage_vs_data(action="sync", index_name="catalog.schema.my_index") - -# Scan index contents -manage_vs_data(action="scan", index_name="catalog.schema.my_index", num_results=100) -``` - ## Notes - **Storage-Optimized is newer** โ€” better for most use cases unless you need <100ms latency @@ -436,7 +324,7 @@ manage_vs_data(action="scan", index_name="catalog.schema.my_index", num_results= - **Hybrid search** โ€” available for both Delta Sync and Direct Access indexes - **`columns_to_sync` matters** โ€” only synced columns are available in query results; include all columns you need - **Filter syntax differs by endpoint** โ€” Standard uses dict-format filters, Storage-Optimized uses SQL-like string filters. Use the `databricks-vectorsearch` package's `filters` parameter which accepts both formats -- **Management vs runtime** โ€” MCP tools above handle lifecycle management; for agent tool-calling at runtime, use `VectorSearchRetrieverTool` or the Databricks managed Vector Search MCP server +- **Management vs runtime** โ€” CLI and SDK handle lifecycle management; for agent tool-calling at runtime, use `VectorSearchRetrieverTool` ## Related Skills diff --git a/databricks-skills/databricks-vector-search/end-to-end-rag.md b/databricks-skills/databricks-vector-search/end-to-end-rag.md index a3808d1b..00959f91 100644 --- a/databricks-skills/databricks-vector-search/end-to-end-rag.md +++ b/databricks-skills/databricks-vector-search/end-to-end-rag.md @@ -2,16 +2,16 @@ Build a complete Retrieval-Augmented Generation pipeline: prepare documents, create a vector index, query it, and wire it into an agent. 
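For a quick sanity check once the index is built, a minimal Python SDK sketch of the similarity-search step (the same query can be issued with `databricks vector-search-indexes query-index`). The index name, columns, and query text below are placeholders — substitute the objects created in this guide:

```python
# Minimal sketch: verify similarity search against the index built in this guide.
# The index name and columns are placeholders -- substitute your own values.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()

response = w.vector_search_indexes.query_index(
    index_name="catalog.schema.knowledge_base_index",  # placeholder
    columns=["doc_id", "title", "content"],
    query_text="What is Delta Lake?",
    num_results=5,
)

# Each returned row holds the requested columns plus a relevance score
# appended by the service; result can be empty if the index has no data.
rows = (response.result.data_array or []) if response.result else []
for row in rows:
    print(row)
```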
-## MCP Tools Used +## CLI Commands Used -| Tool | Step | -|------|------| -| `execute_sql` | Create source table, insert documents | -| `manage_vs_endpoint(action="create")` | Create compute endpoint | -| `manage_vs_index(action="create")` | Create Delta Sync index with managed embeddings | -| `manage_vs_index(action="sync")` | Trigger index sync | -| `manage_vs_index(action="get")` | Check index status | -| `query_vs_index` | Test similarity search | +| Command | Step | +|---------|------| +| `databricks experimental aitools tools query` | Create source table, insert documents | +| `databricks vector-search-endpoints create-endpoint` | Create compute endpoint | +| `databricks vector-search-indexes create-index` | Create Delta Sync index with managed embeddings | +| `databricks vector-search-indexes sync-index` | Trigger index sync | +| `databricks vector-search-indexes get-index` | Check index status | +| `databricks vector-search-indexes query-index` | Test similarity search | --- @@ -34,10 +34,10 @@ INSERT INTO catalog.schema.knowledge_base VALUES ('doc-003', 'Delta Lake', 'Delta Lake is an open-source storage layer...', 'storage', current_timestamp()); ``` -Or via MCP: +Or via CLI: -```python -execute_sql(sql_query=""" +```bash +databricks experimental aitools tools query --warehouse WAREHOUSE_ID " CREATE TABLE IF NOT EXISTS catalog.schema.knowledge_base ( doc_id STRING, title STRING, @@ -45,7 +45,7 @@ execute_sql(sql_query=""" category STRING, updated_at TIMESTAMP DEFAULT current_timestamp() ) -""") +" ``` ## Step 2: Create Vector Search Endpoint diff --git a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md index 7dc4b8c9..36614a23 100644 --- a/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md +++ b/databricks-skills/databricks-vector-search/troubleshooting-and-operations.md @@ -4,7 +4,7 @@ Operational guidance for monitoring, cost optimization, capacity planning, and m ## Monitoring Endpoint Status -Use `manage_vs_endpoint(action="get")` (MCP tool) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. +Use `databricks vector-search-endpoints get-endpoint ENDPOINT_NAME` (CLI) or `w.vector_search_endpoints.get_endpoint()` (SDK) to check endpoint health. ### Endpoint fields @@ -34,7 +34,7 @@ print(f"Indexes: {endpoint.num_indexes}") ## Monitoring Index Status -Use `manage_vs_index(action="get")` (MCP tool) or `w.vector_search_indexes.get_index()` (SDK) to check index health. +Use `databricks vector-search-indexes get-index INDEX_NAME` (CLI) or `w.vector_search_indexes.get_index()` (SDK) to check index health. ### Index fields diff --git a/databricks-skills/databricks-zerobus-ingest/SKILL.md b/databricks-skills/databricks-zerobus-ingest/SKILL.md index 22f90c55..fdac5f98 100644 --- a/databricks-skills/databricks-zerobus-ingest/SKILL.md +++ b/databricks-skills/databricks-zerobus-ingest/SKILL.md @@ -120,54 +120,54 @@ You must always follow all the steps in the Workflow ## Workflow 0. **Display the plan of your execution** -1. **Determinate the type of client** -2. **Get schema** Always use 4-protobuf-schema.md. Execute using the `execute_code` MCP tool -3. **Write Python code to a local file follow the instructions in the relevant guide to ingest with zerobus** in the project (e.g., `scripts/zerobus_ingest.py`). -4. **Execute on Databricks** using the `execute_code` MCP tool (with `file_path` parameter) -5. 
**If execution fails**: Edit the local file to fix the error, then re-execute -6. **Reuse the context** for follow-up executions by passing the returned `cluster_id` and `context_id` +1. **Determine the type of client** +2. **Get schema** Always use 4-protobuf-schema.md +3. **Write Python code to a local file** following the instructions in the relevant guide (e.g., `scripts/zerobus_ingest.py`) +4. **Upload to workspace**: `databricks workspace import-dir ./scripts /Workspace/Users//scripts` +5. **Execute on Databricks** using a job or notebook +6. **If execution fails**: Edit the local file, re-upload, and re-execute --- ## Important - Never install local packages -- Always validate MCP server requirement before execution - **Serverless limitation**: The Zerobus SDK cannot pip-install on serverless compute. Use classic compute clusters, or use the [Zerobus REST API](https://docs.databricks.com/aws/en/ingestion/zerobus-rest-api) (Beta) for notebook-based ingestion without the SDK. - **Explicit table grants**: Service principals need explicit `MODIFY` and `SELECT` grants on the target table. Schema-level inherited permissions may not be sufficient for the `authorization_details` OAuth flow. --- -### Context Reuse Pattern +### Execution Workflow -The first execution auto-selects a running cluster and creates an execution context. **Reuse this context for follow-up calls** - it's much faster (~1s vs ~15s) and shares variables/imports: - -**First execution** - use `execute_code` tool: -- `file_path`: "scripts/zerobus_ingest.py" - -Returns: `{ success, output, error, cluster_id, context_id, ... }` +**Step 1: Upload code to workspace** +```bash +databricks workspace import-dir ./scripts /Workspace/Users//scripts +``` -Save `cluster_id` and `context_id` for follow-up calls. +**Step 2: Create and run a job** +```bash +databricks jobs create --json '{ + "name": "zerobus-ingest", + "tasks": [{ + "task_key": "ingest", + "spark_python_task": { + "python_file": "/Workspace/Users//scripts/zerobus_ingest.py" + }, + "new_cluster": { + "spark_version": "16.1.x-scala2.12", + "node_type_id": "i3.xlarge", + "num_workers": 0 + } + }] +}' + +databricks jobs run-now JOB_ID +``` **If execution fails:** -1. Read the error from the result +1. Read the error from the job run output 2. Edit the local Python file to fix the issue -3. Re-execute with same context using `execute_code` tool: - - `file_path`: "scripts/zerobus_ingest.py" - - `cluster_id`: "" - - `context_id`: "" - -**Follow-up executions** reuse the context (faster, shares state): -- `file_path`: "scripts/validate_ingestion.py" -- `cluster_id`: "" -- `context_id`: "" - -### Handling Failures - -When execution fails: -1. Read the error from the result -2. **Edit the local Python file** to fix the issue -3. Re-execute using the same `cluster_id` and `context_id` (faster, keeps installed libraries) -4. If the context is corrupted, omit `context_id` to create a fresh one +3. Re-upload: `databricks workspace import-dir ./scripts /Workspace/Users//scripts` +4. Re-run: `databricks jobs run-now JOB_ID` --- @@ -175,14 +175,14 @@ When execution fails: Databricks provides Spark, pandas, numpy, and common data libraries by default. **Only install a library if you get an import error.** -Use `execute_code` tool: -- `code`: "%pip install databricks-zerobus-ingest-sdk>=1.0.0" -- `cluster_id`: "" -- `context_id`: "" - -The library is immediately available in the same context. 
+Add to the job configuration: +```json +"libraries": [ + {"pypi": {"package": "databricks-zerobus-ingest-sdk>=1.0.0"}} +] +``` -**Note:** Keeping the same `context_id` means installed libraries persist across calls. +Or use init scripts in the cluster configuration. ## ๐Ÿšจ Critical Learning: Timestamp Format Fix diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 0fc2e1d2..67866942 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -3,7 +3,7 @@ # Databricks Skills Installer # # Installs Databricks skills for Claude Code into your project. -# These skills teach Claude how to work with Databricks using MCP tools. +# These skills teach Claude how to work with Databricks using CLI, SDK, and REST APIs. # # Usage: # # Install all skills (Databricks + MLflow + APX) @@ -47,7 +47,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -83,7 +83,6 @@ get_skill_description() { "databricks-execution-compute") echo "Execute code and manage compute on Databricks - serverless, clusters, and SQL warehouses" ;; "databricks-unity-catalog") echo "System tables for lineage, audit, billing" ;; "databricks-lakebase-autoscale") echo "Lakebase Autoscale - managed PostgreSQL with autoscaling" ;; - "databricks-lakebase-provisioned") echo "Lakebase Provisioned - data connections and reverse ETL" ;; "databricks-metric-views") echo "Unity Catalog Metric Views - governed business metrics in YAML" ;; "databricks-model-serving") echo "Model Serving - deploy MLflow models and AI agents" ;; "databricks-mlflow-evaluation") echo "MLflow evaluation and trace analysis" ;; @@ -119,12 +118,11 @@ get_skill_extra_files() { "databricks-bundles") echo "alerts_guidance.md SDP_guidance.md" ;; "databricks-iceberg") echo "1-managed-iceberg-tables.md 2-uniform-and-compatibility.md 3-iceberg-rest-catalog.md 
4-snowflake-interop.md 5-external-engine-interop.md" ;; "databricks-app-apx") echo "backend-patterns.md best-practices.md frontend-patterns.md" ;; - "databricks-app-python") echo "1-authorization.md 2-app-resources.md 3-frameworks.md 4-deployment.md 5-lakebase.md 6-mcp-approach.md examples/llm_config.py examples/fm-minimal-chat.py examples/fm-parallel-calls.py examples/fm-structured-outputs.py" ;; + "databricks-app-python") echo "1-authorization.md 2-app-resources.md 3-frameworks.md 4-deployment.md 5-lakebase.md 6-cli-approach.md examples/llm_config.py examples/fm-minimal-chat.py examples/fm-parallel-calls.py examples/fm-structured-outputs.py" ;; "databricks-jobs") echo "task-types.md triggers-schedules.md notifications-monitoring.md examples.md" ;; "databricks-python-sdk") echo "doc-index.md examples/1-authentication.py examples/2-clusters-and-jobs.py examples/3-sql-and-warehouses.py examples/4-unity-catalog.py examples/5-serving-and-vector-search.py" ;; "databricks-unity-catalog") echo "5-system-tables.md" ;; "databricks-lakebase-autoscale") echo "projects.md branches.md computes.md connection-patterns.md reverse-etl.md" ;; - "databricks-lakebase-provisioned") echo "connection-patterns.md reverse-etl.md" ;; "databricks-metric-views") echo "yaml-reference.md patterns.md" ;; "databricks-model-serving") echo "1-classical-ml.md 2-custom-pyfunc.md 3-genai-agents.md 4-tools-integration.md 5-development-testing.md 6-logging-registration.md 7-deployment.md 8-querying-endpoints.md 9-package-requirements.md" ;; "databricks-mlflow-evaluation") echo "references/CRITICAL-interfaces.md references/GOTCHAS.md references/patterns-context-optimization.md references/patterns-datasets.md references/patterns-evaluation.md references/patterns-scorers.md references/patterns-trace-analysis.md references/user-journeys.md" ;; diff --git a/databricks-tools-core/databricks_tools_core/compute/serverless.py b/databricks-tools-core/databricks_tools_core/compute/serverless.py index 65fdff2b..f1c3588e 100644 --- a/databricks-tools-core/databricks_tools_core/compute/serverless.py +++ b/databricks-tools-core/databricks_tools_core/compute/serverless.py @@ -295,14 +295,40 @@ def run_code_on_serverless( try: # --- Step 2: Submit serverless run --- try: - # Build submit kwargs, allowing job_extra_params to override defaults - extra = job_extra_params or {} - - # Determine environment_key for the task - env_key = "Default" - if "environments" in extra and extra["environments"]: - # Use the first environment's key from extra params - env_key = extra["environments"][0].get("environment_key", "Default") + # Build submit kwargs, allowing job_extra_params to override defaults. + # Callers may pass environments as dicts (documented shape) or typed + # JobEnvironment objects. 
Normalize to typed before use, because: + # - the SDK's jobs.submit serializes each element via .as_dict() + # - we read environment_key off the first element here + extra = dict(job_extra_params or {}) + + if extra.get("environments"): + normalized = [] + for e in extra["environments"]: + if isinstance(e, JobEnvironment): + normalized.append(e) + elif isinstance(e, dict): + spec = e.get("spec", {}) + if isinstance(spec, dict): + spec = Environment(**spec) + elif not isinstance(spec, Environment): + raise TypeError( + f"environments[].spec must be a dict or Environment, got {type(spec).__name__}" + ) + normalized.append( + JobEnvironment( + environment_key=e.get("environment_key", "Default"), + spec=spec, + ) + ) + else: + raise TypeError( + f"environments[] entries must be dict or JobEnvironment, got {type(e).__name__}" + ) + extra["environments"] = normalized + env_key = normalized[0].environment_key or "Default" + else: + env_key = "Default" submit_kwargs = { "run_name": run_name, diff --git a/databricks-tools-core/tests/integration/compute/test_serverless.py b/databricks-tools-core/tests/integration/compute/test_serverless.py index 7f68bb42..934ba8ca 100644 --- a/databricks-tools-core/tests/integration/compute/test_serverless.py +++ b/databricks-tools-core/tests/integration/compute/test_serverless.py @@ -224,3 +224,75 @@ def test_persistent_to_dict_includes_workspace_path(self): assert result.success, f"Execution failed: {result.error}" d = result.to_dict() assert d["workspace_path"] == ws_path + + +@pytest.mark.integration +class TestServerlessJobExtraParams: + """Tests for job_extra_params, especially the environments list normalization. + + Regression coverage for the bug where passing environments as dicts (the + documented shape in the docstring) crashed with + "'dict' object has no attribute 'as_dict'", and passing typed JobEnvironment + crashed with "'JobEnvironment' object has no attribute 'get'". 
+ """ + + def test_environments_as_dicts(self): + """Documented shape: environments as plain dicts with 'spec' nested dict.""" + result = run_code_on_serverless( + code='dbutils.notebook.exit("dict env ok")', + run_name="test-env-dict", + job_extra_params={ + "environments": [ + { + "environment_key": "dict_env", + "spec": {"client": "1"}, + } + ] + }, + ) + logger.info(f"dict-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "dict env ok" in result.output + + def test_environments_as_typed_objects(self): + """Typed shape: environments as JobEnvironment + Environment instances.""" + from databricks.sdk.service.compute import Environment + from databricks.sdk.service.jobs import JobEnvironment + + result = run_code_on_serverless( + code='dbutils.notebook.exit("typed env ok")', + run_name="test-env-typed", + job_extra_params={ + "environments": [ + JobEnvironment( + environment_key="typed_env", + spec=Environment(client="1"), + ) + ] + }, + ) + logger.info(f"typed-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "typed env ok" in result.output + + def test_no_job_extra_params_uses_default_env(self): + """Regression: omitting job_extra_params still submits with the default env.""" + result = run_code_on_serverless( + code='dbutils.notebook.exit("default env ok")', + run_name="test-env-default", + ) + logger.info(f"default-env result: success={result.success}, error={result.error}") + assert result.success, f"Execution failed: {result.error}" + assert "default env ok" in result.output + + def test_malformed_environment_entry_raises_type_error(self): + """Non-dict, non-typed entries should fail fast with a TypeError (no submit).""" + result = run_code_on_serverless( + code="print('never runs')", + run_name="test-env-malformed", + job_extra_params={"environments": ["not-a-dict-or-typed"]}, + ) + # The function catches the TypeError and returns a failure result. + assert not result.success + assert result.error is not None + assert "JobEnvironment" in result.error or "type" in result.error.lower() diff --git a/install.ps1 b/install.ps1 index a4648fa6..0f34f819 100644 --- a/install.ps1 +++ b/install.ps1 @@ -1,7 +1,7 @@ # # Databricks AI Dev Kit - Unified Installer (Windows) # -# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, and Windsurf. +# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, Windsurf, OpenCode, and Kiro. 
# # Usage: irm https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/install.ps1 -OutFile install.ps1 # .\install.ps1 [OPTIONS] @@ -36,8 +36,10 @@ $Owner = "databricks-solutions" $Repo = "ai-dev-kit" # Determine branch/tag to use +$script:BranchExplicit = $false if ($env:AIDEVKIT_BRANCH) { $Branch = $env:AIDEVKIT_BRANCH + $script:BranchExplicit = $true } else { try { $latestReleaseUri = "https://api.github.com/repos/$Owner/$Repo/releases/latest" @@ -49,7 +51,7 @@ if ($env:AIDEVKIT_BRANCH) { } $RepoUrl = "https://github.com/$Owner/$Repo.git" -$RawUrl = "https://raw.githubusercontent.com/$Owner/$Repo/$Branch" +# $RawUrl is set after argument parsing so --branch / --experimental can affect it $InstallDir = if ($env:AIDEVKIT_HOME) { $env:AIDEVKIT_HOME } else { Join-Path $env:USERPROFILE ".ai-dev-kit" } $RepoDir = Join-Path $InstallDir "repo" $VenvDir = Join-Path $InstallDir ".venv" @@ -64,9 +66,10 @@ $MinSdkVersion = "0.85.0" $script:Profile_ = "DEFAULT" $script:Scope = "project" $script:ScopeExplicit = $false # Track if --global was explicitly passed -$script:InstallMcp = $true +$script:InstallMcp = $false $script:InstallSkills = $true $script:Force = $false +$script:ForceExplicit = $false $script:Silent = $false $script:UserTools = "" $script:Tools = "" @@ -76,6 +79,7 @@ $script:ProfileProvided = $false $script:SkillsProfile = "" $script:UserSkills = "" $script:ListSkills = $false +$script:Channel = if ($env:AIDEVKIT_CHANNEL) { $env:AIDEVKIT_CHANNEL } else { "stable" } # stable or experimental # Databricks skills (bundled in repo) $script:Skills = @( @@ -212,14 +216,17 @@ while ($i -lt $args.Count) { { $_ -in "-p", "--profile" } { $script:Profile_ = $args[$i + 1]; $script:ProfileProvided = $true; $i += 2 } { $_ -in "-g", "--global", "-Global" } { $script:Scope = "global"; $script:ScopeExplicit = $true; $i++ } { $_ -in "--skills-only", "-SkillsOnly" } { $script:InstallMcp = $false; $i++ } + { $_ -in "--mcp", "-Mcp" } { $script:InstallMcp = $true; $i++ } { $_ -in "--mcp-only", "-McpOnly" } { $script:InstallSkills = $false; $i++ } - { $_ -in "--mcp-path", "-McpPath" } { $script:UserMcpPath = $args[$i + 1]; $i += 2 } + { $_ -in "--mcp-path", "-McpPath" } { $script:UserMcpPath = $args[$i + 1]; $script:InstallMcp = $true; $i += 2 } { $_ -in "--silent", "-Silent" } { $script:Silent = $true; $i++ } { $_ -in "--tools", "-Tools" } { $script:UserTools = $args[$i + 1]; $i += 2 } { $_ -in "--skills-profile", "-SkillsProfile" } { $script:SkillsProfile = $args[$i + 1]; $i += 2 } { $_ -in "--skills", "-Skills" } { $script:UserSkills = $args[$i + 1]; $i += 2 } { $_ -in "--list-skills", "-ListSkills" } { $script:ListSkills = $true; $i++ } - { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $i++ } + { $_ -in "--experimental", "-Experimental" } { $script:Channel = "experimental"; $i++ } + { $_ -in "-b", "--branch", "-Branch" } { $Branch = $args[$i + 1]; $script:BranchExplicit = $true; $i += 2 } + { $_ -in "-f", "--force", "-Force" } { $script:Force = $true; $script:ForceExplicit = $true; $i++ } { $_ -in "-h", "--help", "-Help" } { Write-Host "Databricks AI Dev Kit Installer (Windows)" Write-Host "" @@ -232,17 +239,21 @@ while ($i -lt $args.Count) { Write-Host " --skills-only Skip MCP server setup" Write-Host " --mcp-only Skip skills installation" Write-Host " --mcp-path PATH Path to MCP server installation" + Write-Host " --mcp Install deprecated MCP server (default: no)" Write-Host " --silent Silent mode (no output except errors)" - Write-Host " --tools LIST Comma-separated: 
claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode" + Write-Host " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode,kiro" Write-Host " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" Write-Host " --skills LIST Comma-separated skill names to install (overrides profile)" Write-Host " --list-skills List available skills and profiles, then exit" + Write-Host " --experimental Install from experimental branch (early access features)" + Write-Host " -b, --branch NAME Git branch/tag to install (default: latest release)" Write-Host " -f, --force Force reinstall" Write-Host " -h, --help Show this help" Write-Host "" Write-Host "Environment Variables:" Write-Host " AIDEVKIT_BRANCH Branch or tag to install (default: latest release)" Write-Host " AIDEVKIT_HOME Installation directory (default: ~/.ai-dev-kit)" + Write-Host " AIDEVKIT_CHANNEL 'stable' (default) or 'experimental'" Write-Host "" Write-Host "Examples:" Write-Host " # Basic installation" @@ -260,6 +271,27 @@ while ($i -lt $args.Count) { } } +# If experimental channel is selected and branch wasn't explicitly overridden, +# install skills from the experimental branch instead of the latest release. +if ($script:Channel -eq "experimental" -and -not $script:BranchExplicit) { + $Branch = "experimental" +} + +# Experimental installs default to Force=true (always refresh the cached repo) +# unless the user explicitly passed --force. +if ($script:Channel -eq "experimental" -and -not $script:ForceExplicit) { + $script:Force = $true +} + +# Set raw URL after branch resolution +$RawUrl = "https://raw.githubusercontent.com/$Owner/$Repo/$Branch" + +# Keep stable and experimental clones in separate directories so they don't clobber each other +if ($script:Channel -eq "experimental") { + $RepoDir = Join-Path $InstallDir "experimental-repo" + $McpEntry = Join-Path $RepoDir "databricks-mcp-server\run_server.py" +} + # โ”€โ”€โ”€ Interactive helpers โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ function Test-Interactive { @@ -572,6 +604,8 @@ function Invoke-DetectTools { $hasWindsurf = ($null -ne (Get-Command windsurf -ErrorAction SilentlyContinue)) -or (Test-Path "$env:LOCALAPPDATA\Programs\Windsurf\Windsurf.exe") $hasOpencode = $null -ne (Get-Command opencode -ErrorAction SilentlyContinue) + $hasKiro = ($null -ne (Get-Command kiro -ErrorAction SilentlyContinue)) -or + (Test-Path "$env:LOCALAPPDATA\Programs\Kiro\Kiro.exe") $claudeState = $hasClaude; $claudeHint = if ($hasClaude) { "detected" } else { "not found" } $cursorState = $hasCursor; $cursorHint = if ($hasCursor) { "detected" } else { "not found" } @@ -581,9 +615,10 @@ function Invoke-DetectTools { $antigravityState = $hasAntigravity; $antigravityHint = if ($hasAntigravity) { "detected" } else { "not found" } $windsurfState = $hasWindsurf; $windsurfHint = if ($hasWindsurf) { "detected" } else { "not found" } $opencodeState = $hasOpencode; $opencodeHint = if ($hasOpencode) { "detected" } else { "not found" } + $kiroState = $hasKiro; $kiroHint = if ($hasKiro) { "detected" } else { "not found" } # If nothing detected, default to claude - if (-not $hasClaude -and -not $hasCursor -and -not $hasCodex -and -not $hasCopilot -and -not $hasGemini -and -not $hasAntigravity -and -not $hasWindsurf -and -not $hasOpencode) { + if (-not $hasClaude -and -not $hasCursor -and -not $hasCodex -and -not $hasCopilot -and -not $hasGemini -and 
-not $hasAntigravity -and -not $hasWindsurf -and -not $hasOpencode -and -not $hasKiro) { $claudeState = $true $claudeHint = "default" } @@ -602,6 +637,7 @@ function Invoke-DetectTools { @{ Label = "Antigravity"; Value = "antigravity"; State = $antigravityState; Hint = $antigravityHint } @{ Label = "Windsurf"; Value = "windsurf"; State = $windsurfState; Hint = $windsurfHint } @{ Label = "OpenCode"; Value = "opencode"; State = $opencodeState; Hint = $opencodeHint } + @{ Label = "Kiro"; Value = "kiro"; State = $kiroState; Hint = $kiroHint } ) $result = Select-Checkbox -Items $items @@ -683,12 +719,35 @@ function Invoke-PromptMcpPath { } # Update derived paths - $script:RepoDir = Join-Path $script:InstallDir "repo" + $repoSubdir = if ($script:Channel -eq "experimental") { "experimental-repo" } else { "repo" } + $script:RepoDir = Join-Path $script:InstallDir $repoSubdir $script:VenvDir = Join-Path $script:InstallDir ".venv" $script:VenvPython = Join-Path $script:VenvDir "Scripts\python.exe" $script:McpEntry = Join-Path $script:RepoDir "databricks-mcp-server\run_server.py" } +# โ”€โ”€โ”€ MCP install prompt โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +function Invoke-PromptMcpInstall { + if ($script:InstallMcp) { return } + if ($script:Silent -or -not (Test-Interactive)) { return } + + Write-Host "" + Write-Host " Deprecated MCP Server" -ForegroundColor White + Write-Host " Skills now work via CLI for better performance. MCP server is optional for backwards compatibility." -ForegroundColor DarkGray + + $items = @( + @{ Label = "Do not install"; Value = "no"; Selected = $true; Hint = "Recommended - skills work without MCP" } + @{ Label = "Install MCP server"; Value = "yes"; Selected = $false; Hint = "Legacy - requires Python venv setup" } + ) + + $selected = Select-Radio -Items $items + + if ($selected -eq "yes") { + $script:InstallMcp = $true + Invoke-PromptMcpPath + } +} + # โ”€โ”€โ”€ Check prerequisites โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ function Test-Dependencies { # Git @@ -1185,6 +1244,13 @@ function Install-Skills { $dirs += Join-Path $BaseDir ".opencode\skills" } } + "kiro" { + if ($script:Scope -eq "global") { + $dirs += Join-Path $env:USERPROFILE ".kiro\skills" + } else { + $dirs += Join-Path $BaseDir ".kiro\skills" + } + } } } $dirs = $dirs | Select-Object -Unique @@ -1685,6 +1751,16 @@ function Write-McpConfigs { } Write-Ok "OpenCode MCP config" } + "kiro" { + if ($script:Scope -eq "global") { + $kiroSettings = Join-Path $env:USERPROFILE ".kiro\settings" + } else { + $kiroSettings = Join-Path $BaseDir ".kiro\settings" + } + if (-not (Test-Path $kiroSettings)) { New-Item -ItemType Directory -Path $kiroSettings -Force | Out-Null } + Write-McpJson (Join-Path $kiroSettings "mcp.json") + Write-Ok "Kiro MCP config" + } } } } @@ -1719,7 +1795,19 @@ function Show-Summary { Write-Msg "Location: $($script:InstallDir)" Write-Msg "Scope: $($script:Scope)" Write-Msg "Tools: $(($script:Tools -split ' ') -join ', ')" + if ($script:Channel -eq "experimental") { + Write-Msg "Channel: experimental ๐Ÿงช" + } Write-Host "" + if ($script:Channel -eq "experimental") { + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host " ๐Ÿงช You're using the experimental channel" -ForegroundColor White + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host "" 
+ Write-Msg "Thank you for testing early features! Your feedback helps us improve." + Write-Msg "Report issues: https://github.com/databricks-solutions/ai-dev-kit/issues" + Write-Host "" + } Write-Msg "Next steps:" $step = 1 if ($script:Tools -match 'cursor') { @@ -1748,6 +1836,10 @@ function Show-Summary { Write-Msg "$step. Launch OpenCode in your project: opencode" $step++ } + if ($script:Tools -match 'kiro') { + Write-Msg "$step. Open your project in Kiro to use Databricks skills and MCP tools" + $step++ + } Write-Msg "$step. Open your project in your tool of choice" $step++ Write-Msg "$step. Try: `"List my SQL warehouses`"" @@ -1893,6 +1985,72 @@ function Invoke-PromptAuth { } } +# โ”€โ”€โ”€ Release channel prompt โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +function Invoke-PromptChannel { + # Skip if already set via --experimental flag or env var + if ($script:Channel -eq "experimental") { return } + + # Skip in silent mode or non-interactive + if ($script:Silent) { return } + if (-not (Test-Interactive)) { return } + + Write-Host "" + Write-Host " Select release channel" -ForegroundColor White + + $items = @( + @{ Label = "Stable"; Value = "stable"; Selected = $true; Hint = "Latest stable release (recommended)" } + @{ Label = "Experimental"; Value = "experimental"; Selected = $false; Hint = "Early access to new features -- help us test!" } + ) + + $script:Channel = Select-Radio -Items $items + + # If experimental was selected, re-download and re-exec from experimental branch + if ($script:Channel -eq "experimental") { + Write-Host "" + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host " ๐Ÿงช Experimental Channel" -ForegroundColor White + Write-Host " ============================================================" -ForegroundColor Yellow + Write-Host "" + Write-Host " You're about to install the " -NoNewline + Write-Host "experimental" -ForegroundColor White -NoNewline + Write-Host " version of AI Dev Kit." + Write-Host " This includes early access features that may change or break." + Write-Host "" + Write-Host " We'd love your feedback!" -ForegroundColor White + Write-Host " Report issues: https://github.com/databricks-solutions/ai-dev-kit/issues" -ForegroundColor Blue + Write-Host " Discussions: https://github.com/databricks-solutions/ai-dev-kit/discussions" -ForegroundColor Blue + Write-Host "" + Write-Host " Downloading installer from experimental branch..." 
-ForegroundColor DarkGray + Write-Host "" + + # Build argument list preserving current flags + $newArgs = @("--experimental") + if ($script:Force) { $newArgs += "--force" } + if ($script:UserTools) { $newArgs += "--tools"; $newArgs += $script:UserTools } + if ($script:UserMcpPath) { $newArgs += "--mcp-path"; $newArgs += $script:UserMcpPath } + if ($script:SkillsProfile) { $newArgs += "--skills-profile"; $newArgs += $script:SkillsProfile } + if ($script:UserSkills) { $newArgs += "--skills"; $newArgs += $script:UserSkills } + if ($script:ScopeExplicit -and $script:Scope -eq "global") { $newArgs += "--global" } + if ($script:Profile_ -ne "DEFAULT") { $newArgs += "--profile"; $newArgs += $script:Profile_ } + if ($script:InstallMcp) { $newArgs += "--mcp" } + if (-not $script:InstallSkills) { $newArgs += "--mcp-only" } + if ($script:BranchExplicit) { $newArgs += "--branch"; $newArgs += $Branch } + + # Download experimental installer to a temp file and execute + $expUrl = "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.ps1" + $tempScript = Join-Path $env:TEMP "ai-dev-kit-install-experimental.ps1" + try { + Invoke-WebRequest -Uri $expUrl -OutFile $tempScript -UseBasicParsing -ErrorAction Stop + } catch { + Write-Err "Failed to download experimental installer from ${expUrl}: $($_.Exception.Message)" + } + + # Execute the experimental installer with preserved args, then exit + & $tempScript @newArgs + exit $LASTEXITCODE + } +} + # โ”€โ”€โ”€ Main โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ function Invoke-Main { if (-not $script:Silent) { @@ -1901,6 +2059,9 @@ function Invoke-Main { Write-Host "--------------------------------" } + # โ”€โ”€ Step 1: Release channel selection (may re-exec from experimental branch) โ”€โ”€ + Invoke-PromptChannel + # Check dependencies Write-Step "Checking prerequisites" Test-Dependencies @@ -1942,9 +2103,9 @@ function Invoke-Main { } } - # MCP path + # MCP server + Invoke-PromptMcpInstall if ($script:InstallMcp) { - Invoke-PromptMcpPath Write-Ok "MCP path: $($script:InstallDir)" } @@ -1953,6 +2114,9 @@ function Invoke-Main { Write-Host "" Write-Host " Summary" -ForegroundColor White Write-Host " ------------------------------------" + if ($script:Channel -eq "experimental") { + Write-Host " Channel: " -NoNewline; Write-Host "experimental ๐Ÿงช" -ForegroundColor Yellow + } Write-Host " Tools: " -NoNewline; Write-Host "$(($script:Tools -split ' ') -join ', ')" -ForegroundColor Green Write-Host " Profile: " -NoNewline; Write-Host $script:Profile_ -ForegroundColor Green Write-Host " Scope: " -NoNewline; Write-Host $script:Scope -ForegroundColor Green @@ -1996,7 +2160,21 @@ function Invoke-Main { # Setup MCP server if ($script:InstallMcp) { Install-McpServer - } elseif (-not (Test-Path $script:RepoDir)) { + } elseif (Test-Path (Join-Path $script:RepoDir ".git")) { + # Repo already exists โ€” refresh it when Force is true, otherwise leave as-is + if ($script:Force) { + Write-Step "Refreshing sources" + $prevEAP = $ErrorActionPreference; $ErrorActionPreference = "Continue" + & git -C $script:RepoDir fetch -q --depth 1 origin $Branch 2>&1 | Out-Null + & git -C $script:RepoDir reset --hard FETCH_HEAD 2>&1 | Out-Null + if ($LASTEXITCODE -ne 0) { + Remove-Item -Recurse -Force $script:RepoDir -ErrorAction SilentlyContinue + & git -c advice.detachedHead=false clone -q --depth 1 --branch $Branch $RepoUrl $script:RepoDir 2>&1 | 
Out-Null + } + $ErrorActionPreference = $prevEAP + Write-Ok "Repository refreshed ($Branch)" + } + } else { Write-Step "Downloading sources" if (-not (Test-Path $script:InstallDir)) { New-Item -ItemType Directory -Path $script:InstallDir -Force | Out-Null diff --git a/install.sh b/install.sh index 51189d0f..e027eea8 100644 --- a/install.sh +++ b/install.sh @@ -2,7 +2,7 @@ # # Databricks AI Dev Kit - Unified Installer # -# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, and Windsurf. +# Installs skills, MCP server, and configuration for Claude Code, Cursor, OpenAI Codex, GitHub Copilot, Gemini CLI, Antigravity, Windsurf, OpenCode, and Kiro. # # Usage: bash <(curl -sL https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/main/install.sh) [OPTIONS] # @@ -46,6 +46,8 @@ PROFILE="${DEVKIT_PROFILE:-DEFAULT}" SCOPE="${DEVKIT_SCOPE:-project}" SCOPE_EXPLICIT=false # Track if --global was explicitly passed FORCE="${DEVKIT_FORCE:-false}" +FORCE_EXPLICIT=false +[ -n "${DEVKIT_FORCE:-}" ] && FORCE_EXPLICIT=true IS_UPDATE=false SILENT="${DEVKIT_SILENT:-false}" TOOLS="${DEVKIT_TOOLS:-}" @@ -53,6 +55,7 @@ USER_TOOLS="" USER_MCP_PATH="${DEVKIT_MCP_PATH:-}" SKILLS_PROFILE="${DEVKIT_SKILLS_PROFILE:-}" USER_SKILLS="${DEVKIT_SKILLS:-}" +CHANNEL="${DEVKIT_CHANNEL:-stable}" # stable or experimental # Convert string booleans from env vars to actual booleans [ "$FORCE" = "true" ] || [ "$FORCE" = "1" ] && FORCE=true || FORCE=false @@ -64,8 +67,10 @@ USER_SKILLS="${DEVKIT_SKILLS:-}" OWNER="databricks-solutions" REPO="ai-dev-kit" +BRANCH_EXPLICIT=false if [ -n "${DEVKIT_BRANCH:-}" ]; then BRANCH="$DEVKIT_BRANCH" + BRANCH_EXPLICIT=true else BRANCH="$( curl -s "https://api.github.com/repos/${OWNER}/${REPO}/releases/latest" \ @@ -77,8 +82,17 @@ else fi # Installation mode defaults -INSTALL_MCP=true INSTALL_SKILLS=true +INSTALL_MCP="${DEVKIT_INSTALL_MCP:-false}" +MCP_INSTALL_PATH="${DEVKIT_MCP_PATH:-$HOME/.ai-dev-kit}" + +# Required config fields - if any new field is added here, saved configs become stale +# (hash is computed automatically, no manual version bump needed) +REQUIRED_CONFIG_FIELDS="SAVED_TOOLS SAVED_PROFILE SAVED_SCOPE SAVED_SKILLS_PROFILE SAVED_INSTALL_MCP" + +# Flags to track config state +USE_PREVIOUS_CONFIG=false +HAS_PREVIOUS_CONFIG=false # True if previous config exists (for pre-selecting defaults) # Minimum required versions MIN_CLI_VERSION="0.278.0" @@ -126,16 +140,18 @@ while [ $# -gt 0 ]; do case $1 in -p|--profile) PROFILE="$2"; shift 2 ;; -g|--global) SCOPE="global"; SCOPE_EXPLICIT=true; shift ;; - -b|--branch) BRANCH="$2"; shift 2 ;; + -b|--branch) BRANCH="$2"; BRANCH_EXPLICIT=true; shift 2 ;; --skills-only) INSTALL_MCP=false; shift ;; --mcp-only) INSTALL_SKILLS=false; shift ;; - --mcp-path) USER_MCP_PATH="$2"; shift 2 ;; + --mcp-path) USER_MCP_PATH="$2"; MCP_INSTALL_PATH="$2"; INSTALL_MCP=true; shift 2 ;; --skills-profile) SKILLS_PROFILE="$2"; shift 2 ;; --skills) USER_SKILLS="$2"; shift 2 ;; --list-skills) LIST_SKILLS=true; shift ;; --silent) SILENT=true; shift ;; + --mcp) INSTALL_MCP=true; shift ;; --tools) USER_TOOLS="$2"; shift 2 ;; - -f|--force) FORCE=true; shift ;; + --experimental) CHANNEL="experimental"; shift ;; + -f|--force) FORCE=true; FORCE_EXPLICIT=true; shift ;; -h|--help) echo "Databricks AI Dev Kit Installer" echo "" @@ -149,10 +165,13 @@ while [ $# -gt 0 ]; do echo " --mcp-only Skip skills installation" echo " --mcp-path PATH Path to MCP server installation (default: ~/.ai-dev-kit)" echo 
" --silent Silent mode (no output except errors)" - echo " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode" + echo " --tools LIST Comma-separated: claude,cursor,copilot,codex,gemini,antigravity,windsurf,opencode,kiro" echo " --skills-profile LIST Comma-separated profiles: all,data-engineer,analyst,ai-ml-engineer,app-developer" echo " --skills LIST Comma-separated skill names to install (overrides profile)" echo " --list-skills List available skills and profiles, then exit" + echo " --experimental Install from experimental branch (early access features)" + echo " --mcp Install deprecated MCP server (default: no)" + echo " --mcp-path PATH MCP server install path (default: ~/.ai-dev-kit)" echo " -f, --force Force reinstall" echo " -h, --help Show this help" echo "" @@ -166,6 +185,9 @@ while [ $# -gt 0 ]; do echo " DEVKIT_SKILLS_PROFILE Comma-separated skill profiles" echo " DEVKIT_SKILLS Comma-separated skill names" echo " DEVKIT_SILENT Set to 'true' for silent mode" + echo " DEVKIT_CHANNEL 'stable' (default) or 'experimental'" + echo " DEVKIT_INSTALL_MCP Set to 'true' to install MCP server" + echo " DEVKIT_MCP_PATH MCP server install path" echo " AIDEVKIT_HOME Installation directory (default: ~/.ai-dev-kit)" echo "" echo "Examples:" @@ -241,11 +263,28 @@ if [ "${LIST_SKILLS:-false}" = true ]; then exit 0 fi +# If experimental channel is selected and branch wasn't explicitly overridden, +# install skills from the experimental branch instead of the latest release. +if [ "$CHANNEL" = "experimental" ] && [ "$BRANCH_EXPLICIT" != true ]; then + BRANCH="experimental" +fi + +# Experimental installs default to FORCE=true (always refresh the cached repo) +# unless the user explicitly set DEVKIT_FORCE or passed --force. 
+if [ "$CHANNEL" = "experimental" ] && [ "$FORCE_EXPLICIT" != true ]; then + FORCE=true +fi + # Set configuration URLs after parsing branch argument REPO_URL="https://github.com/databricks-solutions/ai-dev-kit.git" RAW_URL="https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/${BRANCH}" INSTALL_DIR="${AIDEVKIT_HOME:-$HOME/.ai-dev-kit}" -REPO_DIR="$INSTALL_DIR/repo" +# Keep stable and experimental clones in separate directories so they don't clobber each other +if [ "$CHANNEL" = "experimental" ]; then + REPO_DIR="$INSTALL_DIR/experimental-repo" +else + REPO_DIR="$INSTALL_DIR/repo" +fi VENV_DIR="$INSTALL_DIR/.venv" VENV_PYTHON="$VENV_DIR/bin/python" MCP_ENTRY="$REPO_DIR/databricks-mcp-server/run_server.py" @@ -494,6 +533,150 @@ radio_select() { echo "${values[$selected]}" } +# โ”€โ”€โ”€ Configuration persistence โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +# Saves all user choices to allow quick reinstalls + +# Compute hash of required fields list (auto-detects schema changes) +get_config_schema_hash() { + # Use md5/md5sum depending on platform, truncate for readability + if command -v md5 >/dev/null 2>&1; then + echo "$REQUIRED_CONFIG_FIELDS" | md5 | cut -c1-8 + elif command -v md5sum >/dev/null 2>&1; then + echo "$REQUIRED_CONFIG_FIELDS" | md5sum | cut -c1-8 + else + # Fallback: simple checksum + echo "$REQUIRED_CONFIG_FIELDS" | cksum | cut -d' ' -f1 + fi +} + +# Get config file path (scope-aware) +get_config_file() { + local state_dir + if [ "$SCOPE" = "global" ]; then + state_dir="$INSTALL_DIR" + else + state_dir="$(pwd)/.ai-dev-kit" + fi + echo "$state_dir/.ai-dev-kit-install-config" +} + +# Save current configuration to file +save_config() { + local config_file + config_file=$(get_config_file) + mkdir -p "$(dirname "$config_file")" + + local schema_hash + schema_hash=$(get_config_schema_hash) + + cat > "$config_file" << EOF +# AI Dev Kit installation configuration +# Generated on $(date -u +"%Y-%m-%d %H:%M:%S UTC") +CONFIG_SCHEMA_HASH="$schema_hash" +SAVED_TOOLS="$TOOLS" +SAVED_PROFILE="$PROFILE" +SAVED_SCOPE="$SCOPE" +SAVED_SKILLS_PROFILE="${SKILLS_PROFILE:-all}" +SAVED_USER_SKILLS="$USER_SKILLS" +SAVED_INSTALL_MCP="$INSTALL_MCP" +SAVED_MCP_INSTALL_PATH="$MCP_INSTALL_PATH" +EOF +} + +# Load and validate previous configuration +# Returns 0 if valid config exists, 1 otherwise +# Sets SAVED_* variables if successful +# Robust: any error silently falls back to fresh install +load_previous_config() { + local config_file="" + + # First try project-local config, then global + if [ -f "$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" ]; then + config_file="$(pwd)/.ai-dev-kit/.ai-dev-kit-install-config" + elif [ -f "$INSTALL_DIR/.ai-dev-kit-install-config" ]; then + config_file="$INSTALL_DIR/.ai-dev-kit-install-config" + fi + [ -z "$config_file" ] && return 1 + + # Safely read config using grep instead of source (avoids code execution) + CONFIG_SCHEMA_HASH=$(grep -E '^CONFIG_SCHEMA_HASH=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_TOOLS=$(grep -E '^SAVED_TOOLS=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_PROFILE=$(grep -E '^SAVED_PROFILE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_SCOPE=$(grep -E '^SAVED_SCOPE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_SKILLS_PROFILE=$(grep -E '^SAVED_SKILLS_PROFILE=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + 
SAVED_INSTALL_MCP=$(grep -E '^SAVED_INSTALL_MCP=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || return 1 + SAVED_USER_SKILLS=$(grep -E '^SAVED_USER_SKILLS=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || true + SAVED_MCP_INSTALL_PATH=$(grep -E '^SAVED_MCP_INSTALL_PATH=' "$config_file" 2>/dev/null | cut -d'=' -f2- | tr -d '"') || true + + # Validate schema hash matches + local expected_hash + expected_hash=$(get_config_schema_hash) + [ "${CONFIG_SCHEMA_HASH:-}" != "$expected_hash" ] && return 1 + + # Validate required fields are present + [ -z "$SAVED_TOOLS" ] && return 1 + [ -z "$SAVED_PROFILE" ] && return 1 + [ -z "$SAVED_SCOPE" ] && return 1 + [ -z "$SAVED_SKILLS_PROFILE" ] && return 1 + [ -z "$SAVED_INSTALL_MCP" ] && return 1 + + return 0 +} + +# Apply loaded config to current session variables +apply_previous_config() { + TOOLS="$SAVED_TOOLS" + PROFILE="$SAVED_PROFILE" + SCOPE="$SAVED_SCOPE" + SCOPE_EXPLICIT=true + + if [ "$SAVED_SKILLS_PROFILE" = "custom" ] || [[ "$SAVED_USER_SKILLS" == *","* ]] || [[ "$SAVED_USER_SKILLS" == *" "* ]]; then + USER_SKILLS="$SAVED_USER_SKILLS" + else + SKILLS_PROFILE="$SAVED_SKILLS_PROFILE" + fi + + INSTALL_MCP="$SAVED_INSTALL_MCP" + MCP_INSTALL_PATH="${SAVED_MCP_INSTALL_PATH:-$HOME/.ai-dev-kit}" + + USE_PREVIOUS_CONFIG=true +} + +# Display previous config and ask if user wants to use it +# Returns 0 if user wants to keep previous config (skip prompts) +# Returns 1 if user wants to reconfigure (but SAVED_* are set as defaults) +prompt_use_previous_config() { + if ! load_previous_config; then + return 1 # No valid config, proceed with prompts + fi + + echo "" + echo -e " ${B}Previous installation${N}" + echo -e " Tools: ${G}$(echo "$SAVED_TOOLS" | tr ' ' ', ')${N}, Profile: ${G}$SAVED_PROFILE${N}, Scope: ${G}$SAVED_SCOPE${N}" + if [ -n "$SAVED_USER_SKILLS" ]; then + echo -e " Skills: ${G}custom${N}, MCP: ${G}${SAVED_INSTALL_MCP}${N}" + else + echo -e " Skills: ${G}${SAVED_SKILLS_PROFILE:-all}${N}, MCP: ${G}${SAVED_INSTALL_MCP}${N}" + fi + + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + apply_previous_config + return 0 + fi + + local keep + keep=$(prompt "Keep this configuration? 
${D}(Y/n)${N}" "y") + + if [ "$keep" = "y" ] || [ "$keep" = "Y" ] || [ "$keep" = "yes" ] || [ -z "$keep" ]; then + apply_previous_config + return 0 + else + # User wants to reconfigure - SAVED_* values remain set as defaults for prompts + HAS_PREVIOUS_CONFIG=true + return 1 + fi +} + # โ”€โ”€โ”€ Tool detection & selection โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ detect_tools() { # If provided via --tools flag or TOOLS env var, skip detection and prompts @@ -515,6 +698,7 @@ detect_tools() { local has_antigravity=false local has_windsurf=false local has_opencode=false + local has_kiro=false command -v claude >/dev/null 2>&1 && has_claude=true { [ -d "/Applications/Cursor.app" ] || command -v cursor >/dev/null 2>&1; } && has_cursor=true @@ -524,23 +708,39 @@ detect_tools() { { [ -d "/Applications/Antigravity.app" ] || command -v antigravity >/dev/null 2>&1; } && has_antigravity=true { [ -d "/Applications/Windsurf.app" ] || command -v windsurf >/dev/null 2>&1; } && has_windsurf=true command -v opencode >/dev/null 2>&1 && has_opencode=true + { [ -d "/Applications/Kiro.app" ] || command -v kiro >/dev/null 2>&1; } && has_kiro=true # Build checkbox items: "Label|value|on_or_off|hint" - local claude_state="off" cursor_state="off" codex_state="off" copilot_state="off" gemini_state="off" antigravity_state="off" windsurf_state="off" opencode_state="off" - local claude_hint="not found" cursor_hint="not found" codex_hint="not found" copilot_hint="not found" gemini_hint="not found" antigravity_hint="not found" windsurf_hint="not found" opencode_hint="not found" - [ "$has_claude" = true ] && claude_state="on" && claude_hint="detected" - [ "$has_cursor" = true ] && cursor_state="on" && cursor_hint="detected" - [ "$has_codex" = true ] && codex_state="on" && codex_hint="detected" - [ "$has_copilot" = true ] && copilot_state="on" && copilot_hint="detected" - [ "$has_gemini" = true ] && gemini_state="on" && gemini_hint="detected" - [ "$has_antigravity" = true ] && antigravity_state="on" && antigravity_hint="detected" - [ "$has_windsurf" = true ] && windsurf_state="on" && windsurf_hint="detected" - [ "$has_opencode" = true ] && opencode_state="on" && opencode_hint="detected" - - # If nothing detected, pre-select claude as default - if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ] && [ "$has_windsurf" = false ] && [ "$has_opencode" = false ]; then - claude_state="on" - claude_hint="default" + local claude_state="off" cursor_state="off" codex_state="off" copilot_state="off" gemini_state="off" antigravity_state="off" windsurf_state="off" opencode_state="off" kiro_state="off" + local claude_hint="not found" cursor_hint="not found" codex_hint="not found" copilot_hint="not found" gemini_hint="not found" antigravity_hint="not found" windsurf_hint="not found" opencode_hint="not found" kiro_hint="not found" + + # If previous config exists, use those selections; otherwise use auto-detection + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_TOOLS" ]; then + [[ " $SAVED_TOOLS " == *" claude "* ]] && claude_state="on" && claude_hint="previous" + [[ " $SAVED_TOOLS " == *" cursor "* ]] && cursor_state="on" && cursor_hint="previous" + [[ " $SAVED_TOOLS " == *" codex "* ]] && codex_state="on" && codex_hint="previous" + [[ " $SAVED_TOOLS " == *" copilot "* ]] && copilot_state="on" && copilot_hint="previous" + [[ " $SAVED_TOOLS " == *" 
gemini "* ]] && gemini_state="on" && gemini_hint="previous" + [[ " $SAVED_TOOLS " == *" antigravity "* ]] && antigravity_state="on" && antigravity_hint="previous" + [[ " $SAVED_TOOLS " == *" windsurf "* ]] && windsurf_state="on" && windsurf_hint="previous" + [[ " $SAVED_TOOLS " == *" opencode "* ]] && opencode_state="on" && opencode_hint="previous" + [[ " $SAVED_TOOLS " == *" kiro "* ]] && kiro_state="on" && kiro_hint="previous" + else + [ "$has_claude" = true ] && claude_state="on" && claude_hint="detected" + [ "$has_cursor" = true ] && cursor_state="on" && cursor_hint="detected" + [ "$has_codex" = true ] && codex_state="on" && codex_hint="detected" + [ "$has_copilot" = true ] && copilot_state="on" && copilot_hint="detected" + [ "$has_gemini" = true ] && gemini_state="on" && gemini_hint="detected" + [ "$has_antigravity" = true ] && antigravity_state="on" && antigravity_hint="detected" + [ "$has_windsurf" = true ] && windsurf_state="on" && windsurf_hint="detected" + [ "$has_opencode" = true ] && opencode_state="on" && opencode_hint="detected" + [ "$has_kiro" = true ] && kiro_state="on" && kiro_hint="detected" + + # If nothing detected, pre-select claude as default + if [ "$has_claude" = false ] && [ "$has_cursor" = false ] && [ "$has_codex" = false ] && [ "$has_copilot" = false ] && [ "$has_gemini" = false ] && [ "$has_antigravity" = false ] && [ "$has_windsurf" = false ] && [ "$has_opencode" = false ] && [ "$has_kiro" = false ]; then + claude_state="on" + claude_hint="default" + fi fi # Interactive or fallback @@ -557,6 +757,7 @@ detect_tools() { "Antigravity|antigravity|${antigravity_state}|${antigravity_hint}" \ "Windsurf|windsurf|${windsurf_state}|${windsurf_hint}" \ "OpenCode|opencode|${opencode_state}|${opencode_hint}" \ + "Kiro|kiro|${kiro_state}|${kiro_hint}" \ ) else # Silent: use detected defaults @@ -569,6 +770,7 @@ detect_tools() { [ "$has_antigravity" = true ] && tools="${tools:+$tools }antigravity" [ "$has_windsurf" = true ] && tools="${tools:+$tools }windsurf" [ "$has_opencode" = true ] && tools="${tools:+$tools }opencode" + [ "$has_kiro" = true ] && tools="${tools:+$tools }kiro" [ -z "$tools" ] && tools="claude" TOOLS="$tools" fi @@ -589,6 +791,7 @@ prompt_profile() { # Skip in silent mode or non-interactive if [ "$SILENT" = true ] || ! 
is_interactive; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_PROFILE" ] && PROFILE="$SAVED_PROFILE" return fi @@ -606,27 +809,33 @@ prompt_profile() { fi echo "" - echo -e " ${B}Select Databricks profile${N}" + echo -e " ${B}Which Databricks profile for this project?${N}" + echo -e " ${D}This will be set in .claude/settings.json for Claude Code to use.${N}" if [ ${#profiles[@]} -gt 0 ] && is_interactive; then + # Determine which profile to pre-select + local preselect="DEFAULT" + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_PROFILE" ] && preselect="$SAVED_PROFILE" + # Build radio items: "Label|value|on_or_off|hint" local -a items=() + local found_preselect=false for p in "${profiles[@]}"; do local state="off" local hint="" - [ "$p" = "DEFAULT" ] && state="on" && hint="default" + if [ "$p" = "$preselect" ]; then + state="on" + hint="previous" + found_preselect=true + fi items+=("${p}|${p}|${state}|${hint}") done - + # Add custom profile option at the end items+=("Custom profile name...|__CUSTOM__|off|Enter a custom profile name") - # If no DEFAULT profile exists, pre-select the first one - local has_default=false - for p in "${profiles[@]}"; do - [ "$p" = "DEFAULT" ] && has_default=true - done - if [ "$has_default" = false ]; then + # If preselect not found, select first one + if [ "$found_preselect" = false ]; then items[0]=$(echo "${items[0]}" | sed 's/|off|/|on|/') fi @@ -671,7 +880,11 @@ prompt_mcp_path() { fi # Update derived paths - REPO_DIR="$INSTALL_DIR/repo" + if [ "$CHANNEL" = "experimental" ]; then + REPO_DIR="$INSTALL_DIR/experimental-repo" + else + REPO_DIR="$INSTALL_DIR/repo" + fi VENV_DIR="$INSTALL_DIR/.venv" VENV_PYTHON="$VENV_DIR/bin/python" MCP_ENTRY="$REPO_DIR/databricks-mcp-server/run_server.py" @@ -762,29 +975,10 @@ prompt_skills_profile() { # Skip in silent mode or non-interactive if [ "$SILENT" = true ] || ! is_interactive; then - SKILLS_PROFILE="all" + SKILLS_PROFILE="${SAVED_SKILLS_PROFILE:-all}" return fi - # Check for previous selection (scope-local first, then global fallback for upgrades) - local profile_file="$STATE_DIR/.skills-profile" - [ ! -f "$profile_file" ] && [ "$SCOPE" = "project" ] && profile_file="$INSTALL_DIR/.skills-profile" - if [ -f "$profile_file" ]; then - local prev_profile - prev_profile=$(cat "$profile_file") - if [ "$FORCE" != true ]; then - echo "" - local display_profile - display_profile=$(echo "$prev_profile" | tr ',' ', ') - local keep - keep=$(prompt "Previous skill profile: ${B}${display_profile}${N}. Keep? 
${D}(Y/n)${N}" "y") - if [ "$keep" = "y" ] || [ "$keep" = "Y" ] || [ "$keep" = "yes" ] || [ -z "$keep" ]; then - SKILLS_PROFILE="$prev_profile" - return - fi - fi - fi - echo "" echo -e " ${B}Select skill profile(s)${N}" @@ -792,7 +986,23 @@ prompt_skills_profile() { local -a p_labels=("All Skills" "Data Engineer" "Business Analyst" "AI/ML Engineer" "App Developer" "Custom") local -a p_values=("all" "data-engineer" "analyst" "ai-ml-engineer" "app-developer" "custom") local -a p_hints=("Install everything (34 skills)" "Pipelines, Spark, Jobs, Streaming (14 skills)" "Dashboards, SQL, Genie, Metrics (8 skills)" "Agents, RAG, Vector Search, MLflow (17 skills)" "Apps, Lakebase, Deployment (10 skills)" "Pick individual skills") - local -a p_states=(1 0 0 0 0 0) # "All" selected by default + + # Pre-select based on previous config if available and add "previous" hint + local -a p_states=(0 0 0 0 0 0) + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SKILLS_PROFILE" ]; then + # Parse comma-separated profiles and set states + hints + IFS=',' read -ra prev_profiles <<< "$SAVED_SKILLS_PROFILE" + for prev in "${prev_profiles[@]}"; do + for i in "${!p_values[@]}"; do + if [ "${p_values[$i]}" = "$prev" ]; then + p_states[$i]=1 + p_hints[$i]="previous" + fi + done + done + else + p_states[0]=1 # Default to "All" + fi local p_count=6 local p_cursor=0 local p_total_rows=$((p_count + 2)) @@ -958,6 +1168,51 @@ prompt_custom_skills() { USER_SKILLS=$(echo "$selected" | tr ' ' ',') } +# โ”€โ”€โ”€ MCP Server installation prompt โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ +prompt_mcp_install() { + # Skip if already set via env var or flag + if [ "$INSTALL_MCP" = true ]; then + return + fi + + # Skip in silent mode or non-interactive + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && [ "$SAVED_INSTALL_MCP" = "true" ] && INSTALL_MCP=true + return + fi + + echo "" + echo -e " ${B}Deprecated MCP Server${N}" + echo -e " ${D}Skills now work via CLI for better performance. 
MCP server is optional for backwards compatibility.${N}" + + # Build radio items with previous config pre-selection + local skip_state="on" skip_hint="Recommended - skills work without MCP" + local install_state="off" install_hint="Legacy - requires Python venv setup" + + if [ "$HAS_PREVIOUS_CONFIG" = true ]; then + if [ "$SAVED_INSTALL_MCP" = "true" ]; then + skip_state="off" + install_state="on" + install_hint="previous" + else + skip_hint="previous" + fi + fi + + local selected + selected=$(radio_select \ + "Do not install|no|${skip_state}|${skip_hint}" \ + "Install MCP server|yes|${install_state}|${install_hint}" \ + ) + + if [ "$selected" = "yes" ]; then + INSTALL_MCP=true + # Prompt for install path + echo "" + MCP_INSTALL_PATH=$(prompt "MCP server install path" "$MCP_INSTALL_PATH") + fi +} + # Compare semantic versions (returns 0 if $1 >= $2) version_gte() { printf '%s\n%s' "$2" "$1" | sort -V -C @@ -969,93 +1224,54 @@ check_cli_version() { cli_version=$(databricks --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' | head -1) if [ -z "$cli_version" ]; then - warn "Could not determine Databricks CLI version" - return + PREREQ_WARNINGS+=("Could not determine Databricks CLI version") + return 1 fi if version_gte "$cli_version" "$MIN_CLI_VERSION"; then - ok "Databricks CLI v${cli_version}" + PREREQS+=("Databricks CLI v${cli_version}") + return 0 else - warn "Databricks CLI v${cli_version} is outdated (minimum: v${MIN_CLI_VERSION})" - msg " ${B}Upgrade:${N} curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh" + PREREQ_WARNINGS+=("Databricks CLI v${cli_version} outdated (min: v${MIN_CLI_VERSION}). Upgrade: curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh") + return 1 fi } -# Check Databricks SDK version in the MCP venv -check_sdk_version() { - local sdk_version - sdk_version=$("$VENV_PYTHON" -c "from databricks.sdk.version import __version__; print(__version__)" 2>/dev/null) - - if [ -z "$sdk_version" ]; then - warn "Could not determine Databricks SDK version" - return - fi - - if version_gte "$sdk_version" "$MIN_SDK_VERSION"; then - ok "Databricks SDK v${sdk_version}" - else - warn "Databricks SDK v${sdk_version} is outdated (minimum: v${MIN_SDK_VERSION})" - msg " ${B}Upgrade:${N} $VENV_PYTHON -m pip install --upgrade databricks-sdk" - fi -} - -# Check prerequisites +# Check prerequisites (prints inline) check_deps() { + PREREQS=() + PREREQ_WARNINGS=() + command -v git >/dev/null 2>&1 || die "git required" - ok "git" + PREREQS+=("git") if command -v databricks >/dev/null 2>&1; then check_cli_version else - warn "Databricks CLI not found. Install: ${B}curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh${N}" - msg "${D}You can still install, but authentication will require the CLI later.${N}" + PREREQ_WARNINGS+=("Databricks CLI not found. Install: curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh") fi if [ "$INSTALL_MCP" = true ]; then if command -v uv >/dev/null 2>&1; then - PKG="uv" - ok "$PKG ($(uv --version 2>/dev/null || echo 'unknown version'))" + PREREQS+=("uv $(uv --version 2>/dev/null | grep -oE '[0-9]+\.[0-9]+\.[0-9]+' || echo '')") else die "uv is required but not found on your PATH. Install it with: ${B}curl -LsSf https://astral.sh/uv/install.sh | sh${N} Then re-run this installer." 
fi fi -} -# Check if update needed -check_version() { - local ver_file="$INSTALL_DIR/version" - [ "$SCOPE" = "project" ] && ver_file=".ai-dev-kit/version" - - [ ! -f "$ver_file" ] && return - [ "$FORCE" = true ] && return - - # Skip version gate if user explicitly wants a different skill profile - if [ -n "$SKILLS_PROFILE" ] || [ -n "$USER_SKILLS" ]; then - local saved_profile_file="$STATE_DIR/.skills-profile" - [ ! -f "$saved_profile_file" ] && [ "$SCOPE" = "project" ] && saved_profile_file="$INSTALL_DIR/.skills-profile" - if [ -f "$saved_profile_file" ]; then - local saved_profile - saved_profile=$(cat "$saved_profile_file") - local requested="${USER_SKILLS:+custom:$USER_SKILLS}" - [ -z "$requested" ] && requested="$SKILLS_PROFILE" - [ "$saved_profile" != "$requested" ] && return - fi + # Print inline + if [ "$SILENT" = false ] && [ ${#PREREQS[@]} -gt 0 ]; then + local prereq_list + prereq_list=$(printf '%s, ' "${PREREQS[@]}" | sed 's/, $//') + echo -e "${G}โœ“${N} ${prereq_list}" fi - local local_ver=$(cat "$ver_file") - # Use -f to fail on HTTP errors (like 404) - local remote_ver=$(curl -fsSL "$RAW_URL/VERSION" 2>/dev/null || echo "") - - # Validate remote version format (should not contain "404" or other error text) - if [ -n "$remote_ver" ] && [[ ! "$remote_ver" =~ (404|Not Found|error) ]]; then - if [ "$local_ver" = "$remote_ver" ]; then - ok "Already up to date (v${local_ver})" - msg "${D}Use --force to reinstall or --skills-profile to change profiles${N}" - exit 0 - fi - fi + # Print warnings on separate lines + for w in "${PREREQ_WARNINGS[@]}"; do + warn "$w" + done } # Setup MCP server @@ -1087,7 +1303,7 @@ setup_mcp() { fi msg "Installing Python dependencies..." - $arch_prefix uv venv --python 3.11 --allow-existing "$VENV_DIR" -q 2>/dev/null || $arch_prefix uv venv --allow-existing "$VENV_DIR" -q + $arch_prefix uv venv --python 3.12 --allow-existing "$VENV_DIR" -q 2>/dev/null || $arch_prefix uv venv --allow-existing "$VENV_DIR" -q $arch_prefix uv pip install --python "$VENV_PYTHON" -e "$REPO_DIR/databricks-tools-core" -e "$REPO_DIR/databricks-mcp-server" -q "$VENV_PYTHON" -c "import databricks_mcp_server" 2>/dev/null || die "MCP server install failed" @@ -1130,6 +1346,13 @@ install_skills() { dirs+=("$base_dir/.opencode/skills") fi ;; + kiro) + if [ "$SCOPE" = "global" ]; then + dirs+=("$HOME/.kiro/skills") + else + dirs+=("$base_dir/.kiro/skills") + fi + ;; esac done @@ -1228,13 +1451,6 @@ install_skills() { # Save manifest of installed skills (for cleanup on profile change) mv "$manifest.tmp" "$manifest" - - # Save selected profile for future reinstalls (scope-local) - if [ -n "$USER_SKILLS" ]; then - echo "custom:$USER_SKILLS" > "$STATE_DIR/.skills-profile" - else - echo "${SKILLS_PROFILE:-all}" > "$STATE_DIR/.skills-profile" - fi } # Write MCP configs @@ -1448,6 +1664,57 @@ GEMINIEOF ok "GEMINI.md" } +# Write DATABRICKS_CONFIG_PROFILE to Claude settings.json env section +# Safely merges with existing settings using Python or jq +write_claude_env() { + local path=$1 + local profile=$2 + mkdir -p "$(dirname "$path")" + + # Try Python first (most reliable for JSON manipulation) + if command -v python3 >/dev/null 2>&1; then + python3 -c " +import json +path = '$path' +profile = '$profile' +try: + with open(path) as f: cfg = json.load(f) +except: cfg = {} +env = cfg.setdefault('env', {}) +env['DATABRICKS_CONFIG_PROFILE'] = profile +with open(path, 'w') as f: json.dump(cfg, f, indent=2); f.write('\n') +" 2>/dev/null && return 0 + fi + + # Fallback: jq if available + if command 
-v jq >/dev/null 2>&1; then + if [ -f "$path" ]; then + local tmp="${path}.tmp" + jq --arg p "$profile" '.env = (.env // {}) | .env.DATABRICKS_CONFIG_PROFILE = $p' "$path" > "$tmp" && mv "$tmp" "$path" + else + echo "{\"env\":{\"DATABRICKS_CONFIG_PROFILE\":\"$profile\"}}" | jq '.' > "$path" + fi + return 0 + fi + + # Last resort: create new file only if it doesn't exist + if [ ! -f "$path" ]; then + cat > "$path" << EOF +{ + "env": { + "DATABRICKS_CONFIG_PROFILE": "$profile" + } +} +EOF + return 0 + fi + + # Can't safely merge without Python or jq + warn "Cannot update $path without python3 or jq. Add manually:" + msg " \"env\": {\"DATABRICKS_CONFIG_PROFILE\": \"$profile\"}" + return 1 +} + write_claude_hook() { local path=$1 local script=$2 @@ -1584,6 +1851,16 @@ write_mcp_configs() { fi ok "OpenCode MCP config" ;; + kiro) + if [ "$SCOPE" = "global" ]; then + mkdir -p "$HOME/.kiro/settings" + write_mcp_json "$HOME/.kiro/settings/mcp.json" + else + mkdir -p "$base_dir/.kiro/settings" + write_mcp_json "$base_dir/.kiro/settings/mcp.json" + fi + ok "Kiro MCP config" + ;; esac done } @@ -1607,6 +1884,7 @@ summary() { echo "" echo -e "${G}${B}Installation complete!${N}" echo "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€" + [ "$CHANNEL" = "experimental" ] && msg "Channel: ${Y}experimental ๐Ÿงช${N}" msg "Location: $INSTALL_DIR" msg "Scope: $SCOPE" msg "Tools: $(echo "$TOOLS" | tr ' ' ', ')" @@ -1639,84 +1917,121 @@ summary() { msg "${step}. Launch OpenCode in your project: ${B}opencode${N}" step=$((step + 1)) fi + if echo "$TOOLS" | grep -q kiro; then + msg "${step}. Open your project in Kiro to use Databricks skills and MCP tools" + step=$((step + 1)) + fi msg "${step}. Open your project in your tool of choice" step=$((step + 1)) msg "${step}. Try: \"List my SQL warehouses\"" echo "" + if [ "$CHANNEL" = "experimental" ]; then + echo -e " ${Y}โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”${N}" + echo -e " ${B}๐Ÿงช You're using the experimental channel${N}" + echo -e " ${Y}โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”${N}" + echo "" + msg "Thank you for testing early features! Your feedback helps us improve." + msg "Report issues: ${BL}https://github.com/databricks-solutions/ai-dev-kit/issues${N}" + echo "" + fi fi } # Prompt for installation scope prompt_scope() { if [ "$SILENT" = true ] || ! 
is_interactive; then + [ "$HAS_PREVIOUS_CONFIG" = true ] && SCOPE="${SAVED_SCOPE:-project}" return fi echo "" echo -e " ${B}Select installation scope${N}" - - # Simple radio selector without Confirm button - local -a labels=("Project" "Global") - local -a values=("project" "global") - local -a hints=("Install in current directory (.cursor/, .claude/, .gemini/)" "Install in home directory (~/.cursor/, ~/.claude/, ~/.gemini/)") - local count=2 - local selected=0 - local cursor=0 - - _scope_draw() { - for i in 0 1; do - local dot="โ—‹" - local dot_color="\033[2m" - [ "$i" = "$selected" ] && dot="โ—" && dot_color="\033[0;32m" - local arrow=" " - [ "$i" = "$cursor" ] && arrow="\033[0;34mโฏ\033[0m " - local hint_style="\033[2m" - [ "$i" = "$selected" ] && hint_style="\033[0;32m" - printf "\033[2K %b%b%b %-20s %b%s\033[0m\n" "$arrow" "$dot_color" "$dot" "${labels[$i]}" "$hint_style" "${hints[$i]}" > /dev/tty - done - } - - printf "\n \033[2mโ†‘/โ†“ navigate ยท enter select\033[0m\n\n" > /dev/tty - printf "\033[?25l" > /dev/tty - trap 'printf "\033[?25h" > /dev/tty 2>/dev/null' EXIT - - _scope_draw - - while true; do - printf "\033[%dA" "$count" > /dev/tty - _scope_draw - - local key="" - IFS= read -rsn1 key < /dev/tty 2>/dev/null - - if [ "$key" = $'\x1b' ]; then - local s1="" s2="" - read -rsn1 s1 < /dev/tty 2>/dev/null - read -rsn1 s2 < /dev/tty 2>/dev/null - if [ "$s1" = "[" ]; then - case "$s2" in - A) [ "$cursor" -gt 0 ] && cursor=$((cursor - 1)) ;; - B) [ "$cursor" -lt 1 ] && cursor=$((cursor + 1)) ;; - esac - fi - elif [ "$key" = "" ]; then - selected=$cursor - printf "\033[%dA" "$count" > /dev/tty - _scope_draw - break - elif [ "$key" = " " ]; then - selected=$cursor + + # Build radio items with previous config pre-selection + local project_state="on" project_hint="Install in current directory" + local global_state="off" global_hint="Install in home directory" + + if [ "$HAS_PREVIOUS_CONFIG" = true ] && [ -n "$SAVED_SCOPE" ]; then + if [ "$SAVED_SCOPE" = "global" ]; then + project_state="off" + global_state="on" + global_hint="previous" + else + project_hint="previous" fi - done - - printf "\033[?25h" > /dev/tty - trap - EXIT - - SCOPE="${values[$selected]}" + fi + + SCOPE=$(radio_select \ + "Project|project|${project_state}|${project_hint}" \ + "Global|global|${global_state}|${global_hint}" \ + ) +} + +# Prompt for release channel (stable vs experimental) +prompt_channel() { + # Skip if already set via --experimental flag or env var + if [ "$CHANNEL" = "experimental" ]; then + return + fi + + # Skip in silent mode or non-interactive + if [ "$SILENT" = true ] || [ ! -e /dev/tty ]; then + return + fi + + echo "" + echo -e " ${B}Select release channel${N}" + + local selected + selected=$(radio_select \ + "Stable|stable|on|Latest stable release (recommended)" \ + "Experimental|experimental|off|Early access to new features โ€” help us test!" 
\ + ) + + CHANNEL="$selected" + + # If experimental was selected, re-download and re-exec from experimental branch + if [ "$CHANNEL" = "experimental" ]; then + echo "" + echo -e " ${Y}โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”${N}" + echo -e " ${B}๐Ÿงช Experimental Channel${N}" + echo -e " ${Y}โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”${N}" + echo "" + echo -e " You're about to install the ${B}experimental${N} version of AI Dev Kit." + echo -e " This includes early access features that may change or break." + echo "" + echo -e " ${B}We'd love your feedback!${N}" + echo -e " Report issues: ${BL}https://github.com/databricks-solutions/ai-dev-kit/issues${N}" + echo -e " Discussions: ${BL}https://github.com/databricks-solutions/ai-dev-kit/discussions${N}" + echo "" + echo -e " ${D}Downloading installer from experimental branch...${N}" + + # Build the command with all current flags preserved + local args="--experimental" + [ "$FORCE" = true ] && args="$args --force" + [ "$SILENT" = true ] && args="$args --silent" + [ -n "$USER_TOOLS" ] && args="$args --tools $USER_TOOLS" + [ -n "$USER_MCP_PATH" ] && args="$args --mcp-path $USER_MCP_PATH" + [ -n "$SKILLS_PROFILE" ] && args="$args --skills-profile $SKILLS_PROFILE" + [ -n "$USER_SKILLS" ] && args="$args --skills $USER_SKILLS" + [ "$SCOPE_EXPLICIT" = true ] && [ "$SCOPE" = "global" ] && args="$args --global" + [ "$PROFILE" != "DEFAULT" ] && args="$args --profile $PROFILE" + [ "$INSTALL_MCP" = false ] && args="$args --skills-only" + [ "$INSTALL_SKILLS" = false ] && args="$args --mcp-only" + [ "$BRANCH_EXPLICIT" = true ] && args="$args --branch $BRANCH" + + # Download and execute the experimental installer + exec bash <(curl -fsSL "https://raw.githubusercontent.com/databricks-solutions/ai-dev-kit/experimental/install.sh") $args + fi } -# Prompt to run auth +# Prompt to run auth (only for Claude + project scope) prompt_auth() { + # Skip if not Claude or if global scope + if ! echo "$TOOLS" | grep -qw "claude" || [ "$SCOPE" = "global" ]; then + return + fi + if [ "$SILENT" = true ] || ! 
is_interactive; then return fi @@ -1764,69 +2079,101 @@ prompt_auth() { # Main main() { - if [ "$SILENT" = false ]; then - echo "" - echo -e "${B}Databricks AI Dev Kit Installer${N}" - echo "โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€" - fi - + [ "$SILENT" = false ] && echo -e "\n${B}Databricks AI Dev Kit Installer${N}" + + # โ”€โ”€ Step 1: Release channel selection (may re-exec from experimental branch) โ”€โ”€ + prompt_channel + # Check dependencies - step "Checking prerequisites" check_deps - # โ”€โ”€ Step 2: Interactive tool selection โ”€โ”€ - step "Selecting tools" - detect_tools - ok "Selected: $(echo "$TOOLS" | tr ' ' ', ')" + # โ”€โ”€ Step 2: Check for previous configuration โ”€โ”€ + # Only prompt if running interactively and no explicit flags were provided + if [ "$SILENT" = false ] && [ -z "$USER_TOOLS" ] && [ "$SCOPE_EXPLICIT" = false ]; then + if prompt_use_previous_config; then + # Config loaded and user chose to use it - skip to skill resolution + # Set state directory based on loaded scope + if [ "$SCOPE" = "global" ]; then + STATE_DIR="$INSTALL_DIR" + else + STATE_DIR="$(pwd)/.ai-dev-kit" + fi + # Resolve skills from loaded config + if [ "$INSTALL_SKILLS" = true ]; then + resolve_skills + fi + # Skip to confirmation (USE_PREVIOUS_CONFIG is true) + fi + fi - # โ”€โ”€ Step 3: Interactive profile selection โ”€โ”€ - step "Databricks profile" - prompt_profile - ok "Profile: $PROFILE" + # โ”€โ”€ Interactive configuration (skip if using previous config) โ”€โ”€ + if [ "$USE_PREVIOUS_CONFIG" = false ]; then + # โ”€โ”€ Step 2: Interactive tool selection โ”€โ”€ + step "Selecting tools" + detect_tools + ok "Selected: $(echo "$TOOLS" | tr ' ' ', ')" + + # โ”€โ”€ Step 3: Interactive scope selection โ”€โ”€ + if [ "$SCOPE_EXPLICIT" = false ]; then + prompt_scope + ok "Scope: $SCOPE" + fi - # โ”€โ”€ Step 3.5: Interactive scope selection โ”€โ”€ - if [ "$SCOPE_EXPLICIT" = false ]; then - prompt_scope - ok "Scope: $SCOPE" + # โ”€โ”€ Step 4: Interactive profile selection (only if Claude + project scope) โ”€โ”€ + # Profile is set in .claude/settings.json env, so only for project-scoped installs + # to avoid messing with global settings that affect all projects + if echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ]; then + step "Databricks profile for this project" + prompt_profile + ok "Profile: $PROFILE" + fi fi # Set state directory based on scope (for profile/manifest storage) + # (Also set when using previous config, but doesn't hurt to ensure it's set) if [ "$SCOPE" = "global" ]; then STATE_DIR="$INSTALL_DIR" else STATE_DIR="$(pwd)/.ai-dev-kit" fi - # โ”€โ”€ Step 4: Skill profile selection โ”€โ”€ - if [ "$INSTALL_SKILLS" = true ]; then - step "Skill profiles" - prompt_skills_profile - resolve_skills - # Count for display - local sk_count=0 - for _ in $SELECTED_SKILLS $SELECTED_MLFLOW_SKILLS $SELECTED_APX_SKILLS; do sk_count=$((sk_count + 1)); done - if [ -n "$USER_SKILLS" ]; then - ok "Custom selection ($sk_count skills)" - else - ok "Profile: ${SKILLS_PROFILE:-all} ($sk_count skills)" + # โ”€โ”€ Continue interactive configuration (skip if using previous config) โ”€โ”€ + if [ "$USE_PREVIOUS_CONFIG" = false ]; then + # โ”€โ”€ Step 4: Skill profile selection โ”€โ”€ + if [ "$INSTALL_SKILLS" = true ]; then + step "Skill profiles" + prompt_skills_profile + resolve_skills + # Count for display + local sk_count=0 + for _ in $SELECTED_SKILLS $SELECTED_MLFLOW_SKILLS $SELECTED_APX_SKILLS; do sk_count=$((sk_count + 1)); done + if [ -n 
"$USER_SKILLS" ]; then + ok "Custom selection ($sk_count skills)" + else + ok "Profile: ${SKILLS_PROFILE:-all} ($sk_count skills)" + fi fi - fi - # โ”€โ”€ Step 5: Interactive MCP path โ”€โ”€ - if [ "$INSTALL_MCP" = true ]; then - prompt_mcp_path - ok "MCP path: $INSTALL_DIR" + # โ”€โ”€ Step 4.5: MCP server installation prompt โ”€โ”€ + step "MCP server (deprecated)" + prompt_mcp_install + if [ "$INSTALL_MCP" = true ]; then + ok "Will install MCP server to: $MCP_INSTALL_PATH" + else + ok "Skipping MCP server (recommended)" + fi fi - # โ”€โ”€ Step 6: Confirm before proceeding โ”€โ”€ + # โ”€โ”€ Step 5: Confirm before proceeding โ”€โ”€ if [ "$SILENT" = false ]; then echo "" echo -e " ${B}Summary${N}" echo -e " โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€" + [ "$CHANNEL" = "experimental" ] && echo -e " Channel: ${Y}experimental ๐Ÿงช${N}" echo -e " Tools: ${G}$(echo "$TOOLS" | tr ' ' ', ')${N}" - echo -e " Profile: ${G}${PROFILE}${N}" echo -e " Scope: ${G}${SCOPE}${N}" - [ "$INSTALL_MCP" = true ] && echo -e " MCP server: ${G}${INSTALL_DIR}${N}" + # Only show profile for Claude + project scope (where it's actually used) + echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ] && echo -e " Profile: ${G}${PROFILE}${N}" if [ "$INSTALL_SKILLS" = true ]; then if [ -n "$USER_SKILLS" ]; then echo -e " Skills: ${G}custom selection${N}" @@ -1836,7 +2183,11 @@ main() { echo -e " Skills: ${G}${SKILLS_PROFILE:-all} ($sk_total skills)${N}" fi fi - [ "$INSTALL_MCP" = true ] && echo -e " MCP config: ${G}yes${N}" + if [ "$INSTALL_MCP" = true ]; then + echo -e " MCP server: ${Y}Yes${N} (legacy) โ†’ $MCP_INSTALL_PATH" + else + echo -e " MCP server: ${G}No${N} (recommended)" + fi echo "" fi @@ -1850,9 +2201,6 @@ main() { fi fi - # โ”€โ”€ Step 7: Version check (may exit early if up to date) โ”€โ”€ - check_version - # Determine base directory local base_dir [ "$SCOPE" = "global" ] && base_dir="$HOME" || base_dir="$(pwd)" @@ -1860,7 +2208,18 @@ main() { # Setup MCP server if [ "$INSTALL_MCP" = true ]; then setup_mcp - elif [ ! -d "$REPO_DIR" ]; then + elif [ -d "$REPO_DIR/.git" ]; then + # Repo already exists โ€” refresh it when FORCE is true, otherwise leave as-is + if [ "$FORCE" = true ]; then + step "Refreshing sources" + git -C "$REPO_DIR" fetch -q --depth 1 origin "$BRANCH" 2>/dev/null || true + git -C "$REPO_DIR" reset --hard FETCH_HEAD 2>/dev/null || { + rm -rf "$REPO_DIR" + git -c advice.detachedHead=false clone -q --depth 1 --branch "$BRANCH" "$REPO_URL" "$REPO_DIR" + } + ok "Repository refreshed ($BRANCH)" + fi + else step "Downloading sources" mkdir -p "$INSTALL_DIR" git -c advice.detachedHead=false clone -q --depth 1 --branch "$BRANCH" "$REPO_URL" "$REPO_DIR" @@ -1870,6 +2229,14 @@ main() { # Install skills [ "$INSTALL_SKILLS" = true ] && install_skills "$base_dir" + # Write Databricks profile to Claude settings.json (project scope only) + if echo "$TOOLS" | grep -qw "claude" && [ "$SCOPE" != "global" ]; then + local claude_settings="$base_dir/.claude/settings.json" + if write_claude_env "$claude_settings" "$PROFILE"; then + ok "Claude env: DATABRICKS_CONFIG_PROFILE=$PROFILE" + fi + fi + # Write GEMINI.md if gemini is selected if echo "$TOOLS" | grep -q gemini; then if [ "$SCOPE" = "global" ]; then @@ -1884,10 +2251,13 @@ main() { # Save version save_version - + + # Save configuration for quick reinstalls + save_config + # Prompt to run auth prompt_auth - + # Done summary }