From 569c0e4b3d57faa517564048e8e503d35499dea1 Mon Sep 17 00:00:00 2001 From: mpkrass7 Date: Fri, 8 May 2026 11:07:59 -0400 Subject: [PATCH] Remove databricks-lakebase-provisioned skill (#516) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Provisioned Lakebase is being converted to Autoscaling, and the duplicate skill was confusing the LLM agent. Removes the skill files, test fixtures, installer entries, and cross-references — but leaves the underlying MCP tools and builder-app DB code that still support provisioned instances customers may have today. Co-authored-by: Isaac --- .test/scripts/generate_ground_truth.py | 1 - .test/skills/_routing/ground_truth.yaml | 4 +- .../ground_truth.yaml | 387 ------------------ .../manifest.yaml | 30 -- .test/src/skill_test/scorers/routing.py | 2 +- .test/tests/test_scorers.py | 8 +- .../server/services/skills_manager.py | 6 - .../databricks_mcp_server/tools/lakebase.py | 4 +- databricks-skills/README.md | 2 +- .../databricks-app-python/SKILL.md | 2 +- .../databricks-lakebase-autoscale/SKILL.md | 1 - .../databricks-lakebase-provisioned/SKILL.md | 352 ---------------- .../connection-patterns.md | 279 ------------- .../reverse-etl.md | 171 -------- .../databricks-python-sdk/SKILL.md | 2 +- databricks-skills/install_skills.sh | 4 +- install.ps1 | 5 +- install.sh | 5 +- 18 files changed, 17 insertions(+), 1248 deletions(-) delete mode 100644 .test/skills/databricks-lakebase-provisioned/ground_truth.yaml delete mode 100644 .test/skills/databricks-lakebase-provisioned/manifest.yaml delete mode 100644 databricks-skills/databricks-lakebase-provisioned/SKILL.md delete mode 100644 databricks-skills/databricks-lakebase-provisioned/connection-patterns.md delete mode 100644 databricks-skills/databricks-lakebase-provisioned/reverse-etl.md diff --git a/.test/scripts/generate_ground_truth.py b/.test/scripts/generate_ground_truth.py index 083b08a2..a492fe39 100644 --- a/.test/scripts/generate_ground_truth.py +++ b/.test/scripts/generate_ground_truth.py @@ -551,7 +551,6 @@ def str_representer(dumper, data): "databricks-docs", "databricks-jobs", "databricks-lakebase-autoscale", - "databricks-lakebase-provisioned", "databricks-metric-views", "databricks-mlflow-evaluation", "databricks-python-sdk", diff --git a/.test/skills/_routing/ground_truth.yaml b/.test/skills/_routing/ground_truth.yaml index d89e1fd9..3bb51833 100644 --- a/.test/skills/_routing/ground_truth.yaml +++ b/.test/skills/_routing/ground_truth.yaml @@ -219,12 +219,12 @@ test_cases: expectations: expected_skills: - "databricks-app-python" - - "databricks-lakebase-provisioned" + - "databricks-lakebase-autoscale" is_multi_skill: true metadata: category: "multi_skill" difficulty: "medium" - reasoning: "Combines 'Streamlit' (app-python) with 'Lakebase' (databricks-lakebase-provisioned)" + reasoning: "Combines 'Streamlit' (app-python) with 'Lakebase' (databricks-lakebase-autoscale)" - id: "routing_multi_app_serving_001" inputs: diff --git a/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml b/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml deleted file mode 100644 index e840846c..00000000 --- a/.test/skills/databricks-lakebase-provisioned/ground_truth.yaml +++ /dev/null @@ -1,387 +0,0 @@ -metadata: - skill_name: databricks-lakebase-provisioned - version: 0.1.0 - created_at: '2026-02-23T08:00:00.000000' - -test_cases: -- id: lakebase_prov_create_instance_001 - inputs: - prompt: "How do I create a new Lakebase Provisioned database instance?" - outputs: - response: | - ```python - from databricks.sdk import WorkspaceClient - import uuid - - w = WorkspaceClient() - - # Create a database instance - instance = w.database.create_database_instance( - name="my-lakebase-instance", - capacity="CU_1", # CU_1, CU_2, CU_4, CU_8 - stopped=False - ) - print(f"Instance created: {instance.name}") - print(f"DNS endpoint: {instance.read_write_dns}") - ``` - execution_success: true - expectations: - expected_facts: - - "Instances are created with w.database.create_database_instance()" - - "Capacity options are CU_1, CU_2, CU_4, and CU_8" - - "The instance provides a read_write_dns endpoint for connections" - expected_patterns: - - pattern: "create_database_instance" - min_count: 1 - description: "Uses create_database_instance() method" - - pattern: "capacity" - min_count: 1 - description: "Specifies instance capacity" - - pattern: "read_write_dns" - min_count: 1 - description: "References the DNS endpoint" - - pattern: "WorkspaceClient" - min_count: 1 - description: "Uses WorkspaceClient for SDK access" - guidelines: - - "Must show available capacity options (CU_1 through CU_8)" - - "Must reference the read_write_dns endpoint" - metadata: - category: happy_path - difficulty: easy - source: auto_generated - -- id: lakebase_prov_connect_notebook_002 - inputs: - prompt: "Show me how to connect to Lakebase Provisioned from a Databricks notebook using psycopg." - outputs: - response: | - ```python - import psycopg - from databricks.sdk import WorkspaceClient - import uuid - - # Get instance details - w = WorkspaceClient() - instance = w.database.get_database_instance(name="my-lakebase-instance") - - # Generate OAuth token - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] - ) - - # Connect using psycopg3 - conn_string = ( - f"host={instance.read_write_dns} " - f"dbname=postgres " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" - ) - with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) - ``` - execution_success: true - expectations: - expected_facts: - - "Connections use psycopg3 (psycopg, not psycopg2)" - - "OAuth tokens require a unique request_id via uuid.uuid4()" - - "generate_database_credential() takes instance_names as a list" - - "The token is used as the password in the connection string" - expected_patterns: - - pattern: "psycopg\\.connect" - min_count: 1 - description: "Connects using psycopg3" - - pattern: "generate_database_credential" - min_count: 1 - description: "Generates OAuth token for authentication" - - pattern: "uuid\\.uuid4" - min_count: 1 - description: "Uses uuid4 for unique request_id" - - pattern: "sslmode=require" - min_count: 1 - description: "Requires SSL for the connection" - guidelines: - - "Must use psycopg (not psycopg2)" - - "Must include request_id with uuid4" - - "Connection must include sslmode=require" - metadata: - category: happy_path - difficulty: medium - source: auto_generated - -- id: lakebase_prov_oauth_token_003 - inputs: - prompt: "How do I generate an OAuth token for Lakebase Provisioned?" - outputs: - response: | - ```python - from databricks.sdk import WorkspaceClient - import uuid - - w = WorkspaceClient() - - # Generate OAuth token for database connection - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] - ) - token = cred.token # Use this as password in connection string - ``` - - Tokens expire after 1 hour. For long-running applications, implement a token refresh loop. - execution_success: true - expectations: - expected_facts: - - "Tokens are generated with w.database.generate_database_credential()" - - "Each request requires a unique request_id" - - "instance_names is a list of instance names to scope the credential" - - "Tokens expire after 1 hour" - expected_patterns: - - pattern: "generate_database_credential" - min_count: 1 - description: "Uses generate_database_credential() method" - - pattern: "uuid4" - min_count: 1 - description: "Generates unique request ID" - - pattern: "instance_names" - min_count: 1 - description: "Specifies target instances" - guidelines: - - "Must mention the 1-hour expiration" - - "Must show the request_id requirement with uuid4" - metadata: - category: happy_path - difficulty: easy - source: auto_generated - -- id: lakebase_prov_sqlalchemy_token_refresh_004 - inputs: - prompt: "How do I set up SQLAlchemy with automatic token refresh for Lakebase Provisioned?" - outputs: - response: | - ```python - import asyncio - import uuid - from sqlalchemy import event - from sqlalchemy.ext.asyncio import create_async_engine - from databricks.sdk import WorkspaceClient - - _current_token = None - TOKEN_REFRESH_INTERVAL = 50 * 60 # 50 minutes (before 1-hour expiry) - - def _generate_token(instance_name: str) -> str: - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - return cred.token - - async def _token_refresh_loop(instance_name: str): - global _current_token - while True: - await asyncio.sleep(TOKEN_REFRESH_INTERVAL) - _current_token = await asyncio.to_thread( - _generate_token, instance_name - ) - - def init_database(instance_name, database_name, username): - global _current_token - w = WorkspaceClient() - instance = w.database.get_database_instance(name=instance_name) - _current_token = _generate_token(instance_name) - - url = f"postgresql+psycopg://{username}@{instance.read_write_dns}:5432/{database_name}" - engine = create_async_engine( - url, - pool_size=5, - max_overflow=10, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - @event.listens_for(engine.sync_engine, "do_connect") - def provide_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = _current_token - - return engine - ``` - execution_success: true - expectations: - expected_facts: - - "Token refresh should happen every 50 minutes, before the 1-hour expiry" - - "SQLAlchemy do_connect event injects the current token on each connection" - - "create_async_engine is used for async SQLAlchemy with psycopg" - - "pool_recycle=3600 prevents stale connections" - expected_patterns: - - pattern: "do_connect" - min_count: 1 - description: "Uses SQLAlchemy do_connect event for token injection" - - pattern: "create_async_engine" - min_count: 1 - description: "Creates async SQLAlchemy engine" - - pattern: "_token_refresh_loop" - min_count: 1 - description: "Implements background token refresh" - - pattern: "generate_database_credential" - min_count: 1 - description: "Generates OAuth tokens for refresh" - guidelines: - - "Must implement background token refresh before expiry" - - "Must use SQLAlchemy event to inject tokens on each connection" - - "Must use async engine with psycopg driver" - metadata: - category: happy_path - difficulty: hard - source: auto_generated - -- id: lakebase_prov_apps_integration_005 - inputs: - prompt: "How do I integrate Lakebase Provisioned with a Databricks App?" - outputs: - response: | - Databricks Apps use environment variables for Lakebase configuration: - - ```python - import os - - def is_lakebase_configured() -> bool: - """Check if Lakebase is configured for this app.""" - return bool( - os.environ.get("LAKEBASE_PG_URL") or - (os.environ.get("LAKEBASE_INSTANCE_NAME") and - os.environ.get("LAKEBASE_DATABASE_NAME")) - ) - ``` - - Register the database resource with your app: - ```bash - databricks apps add-resource $APP_NAME \ - --resource-type database \ - --resource-name lakebase \ - --database-instance my-lakebase-instance - ``` - - Environment variables set by Databricks Apps: `LAKEBASE_INSTANCE_NAME`, `LAKEBASE_DATABASE_NAME`, `LAKEBASE_USERNAME`, `LAKEBASE_PG_URL`. - execution_success: true - expectations: - expected_facts: - - "Databricks Apps set LAKEBASE_INSTANCE_NAME and LAKEBASE_DATABASE_NAME environment variables" - - "Database resources are registered with databricks apps add-resource" - - "LAKEBASE_PG_URL provides a ready-to-use connection string" - expected_patterns: - - pattern: "LAKEBASE_INSTANCE_NAME" - min_count: 1 - description: "References the instance name environment variable" - - pattern: "LAKEBASE_DATABASE_NAME" - min_count: 1 - description: "References the database name environment variable" - - pattern: "databricks apps add-resource" - min_count: 1 - description: "Shows the CLI command to register database resource" - guidelines: - - "Must list the environment variables set by Databricks Apps" - - "Must show the CLI command to register the database resource" - metadata: - category: happy_path - difficulty: medium - source: auto_generated - -- id: lakebase_prov_cli_reference_006 - inputs: - prompt: "What Databricks CLI commands are available for managing Lakebase Provisioned instances?" - outputs: - response: | - ```bash - # Create instance - databricks database create-database-instance \ - --name my-lakebase-instance \ - --capacity CU_1 - - # Get instance details - databricks database get-database-instance --name my-lakebase-instance - - # Generate credentials - databricks database generate-database-credential \ - --request-id $(uuidgen) \ - --json '{"instance_names": ["my-lakebase-instance"]}' - - # List instances - databricks database list-database-instances - - # Stop instance (saves cost) - databricks database stop-database-instance --name my-lakebase-instance - - # Start instance - databricks database start-database-instance --name my-lakebase-instance - ``` - execution_success: true - expectations: - expected_facts: - - "Lakebase Provisioned CLI commands are under databricks database" - - "Instances can be stopped and started to save costs" - - "Credential generation requires a unique request-id" - expected_patterns: - - pattern: "databricks database create-database-instance" - min_count: 1 - description: "Shows instance creation command" - - pattern: "databricks database stop-database-instance" - min_count: 1 - description: "Shows instance stop command" - - pattern: "databricks database generate-database-credential" - min_count: 1 - description: "Shows credential generation command" - guidelines: - - "Must show at least 5 different CLI subcommands" - - "Must include stop/start commands for cost management" - metadata: - category: happy_path - difficulty: easy - source: auto_generated - -- id: lakebase_prov_register_unity_catalog_007 - inputs: - prompt: "How do I register a Lakebase Provisioned instance with Unity Catalog?" - outputs: - response: | - ```python - from databricks.sdk import WorkspaceClient - - w = WorkspaceClient() - - # Register database in Unity Catalog - w.database.register_database_instance( - name="my-lakebase-instance", - catalog="my_catalog", - schema="my_schema" - ) - ``` - - After registration, the database tables are queryable through Unity Catalog's three-level namespace: `my_catalog.my_schema.table_name`. - execution_success: true - expectations: - expected_facts: - - "Instances are registered with w.database.register_database_instance()" - - "Registration requires a catalog and schema name" - - "After registration, tables are accessible via Unity Catalog namespace" - expected_patterns: - - pattern: "register_database_instance" - min_count: 1 - description: "Uses register_database_instance() method" - - pattern: "catalog" - min_count: 1 - description: "Specifies Unity Catalog catalog name" - - pattern: "schema" - min_count: 1 - description: "Specifies Unity Catalog schema name" - guidelines: - - "Must show the registration method with catalog and schema" - - "Must explain the three-level namespace after registration" - metadata: - category: happy_path - difficulty: medium - source: auto_generated diff --git a/.test/skills/databricks-lakebase-provisioned/manifest.yaml b/.test/skills/databricks-lakebase-provisioned/manifest.yaml deleted file mode 100644 index 33c4c861..00000000 --- a/.test/skills/databricks-lakebase-provisioned/manifest.yaml +++ /dev/null @@ -1,30 +0,0 @@ -skill_name: databricks-lakebase-provisioned -tool_modules: [lakebase, sql] -description: Patterns and best practices for using Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. -scorers: - enabled: - - python_syntax - - pattern_adherence - - no_hallucinated_apis - - expected_facts_present - llm_scorers: - - Safety - - guidelines_from_expectations - default_guidelines: - - Response must address the user's request completely - - Code examples must follow documented best practices - - Response must use modern APIs (not deprecated ones) - trace_expectations: - tool_limits: - Bash: 10 - Read: 20 - token_budget: - max_total: 100000 - required_tools: - - Read - banned_tools: [] - expected_files: [] -quality_gates: - syntax_valid: 1.0 - pattern_adherence: 0.9 - execution_success: 0.8 diff --git a/.test/src/skill_test/scorers/routing.py b/.test/src/skill_test/scorers/routing.py index fa32c073..76575370 100644 --- a/.test/src/skill_test/scorers/routing.py +++ b/.test/src/skill_test/scorers/routing.py @@ -73,7 +73,7 @@ "multi-agent", "supervisor", ], - "databricks-lakebase-provisioned": ["lakebase", "postgresql", "postgres"], + "databricks-lakebase-autoscale": ["lakebase", "postgresql", "postgres"], "databricks-model-serving": ["model serving", "serving endpoint", "inference endpoint"], } diff --git a/.test/tests/test_scorers.py b/.test/tests/test_scorers.py index 63de0125..edc2f10c 100644 --- a/.test/tests/test_scorers.py +++ b/.test/tests/test_scorers.py @@ -118,10 +118,10 @@ def test_detect_fastapi_react_matches_both(self): assert "databricks-app-python" in skills def test_detect_lakebase(self): - """Test detection of databricks-lakebase-provisioned skill.""" + """Test detection of databricks-lakebase-autoscale skill.""" prompt = "Create an app that stores data in Lakebase" skills = detect_skills_from_prompt(prompt) - assert "databricks-lakebase-provisioned" in skills + assert "databricks-lakebase-autoscale" in skills def test_detect_model_serving(self): """Test detection of databricks-model-serving skill.""" @@ -141,7 +141,7 @@ def test_detect_multi_app_lakebase(self): prompt = "Create a Streamlit app that stores data in Lakebase" skills = detect_skills_from_prompt(prompt) assert "databricks-app-python" in skills - assert "databricks-lakebase-provisioned" in skills + assert "databricks-lakebase-autoscale" in skills def test_detect_multi_app_serving(self): """Test detection of app + model serving.""" @@ -178,7 +178,7 @@ def test_all_skills_have_triggers(self): "databricks-synthetic-data-gen", "databricks-mlflow-evaluation", "databricks-agent-bricks", - "databricks-lakebase-provisioned", + "databricks-lakebase-autoscale", "databricks-model-serving", ] for skill in expected_skills: diff --git a/databricks-builder-app/server/services/skills_manager.py b/databricks-builder-app/server/services/skills_manager.py index 0732d5db..36bd9b8c 100644 --- a/databricks-builder-app/server/services/skills_manager.py +++ b/databricks-builder-app/server/services/skills_manager.py @@ -35,12 +35,6 @@ 'manage_vs_endpoint', 'manage_vs_index', 'manage_vs_data', 'query_vs_index', ], 'databricks-metric-views': ['manage_metric_views'], - # Provisioned and Autoscale Lakebase share the core database/sync/credential - # tools. Autoscale additionally claims branch tools. If either skill is - # enabled, the shared tools are available. - 'databricks-lakebase-provisioned': [ - 'manage_lakebase_database', 'manage_lakebase_sync', 'generate_lakebase_credential', - ], 'databricks-lakebase-autoscale': [ 'manage_lakebase_database', 'manage_lakebase_sync', 'generate_lakebase_credential', 'manage_lakebase_branch', diff --git a/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py b/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py index c82667bd..51771fef 100644 --- a/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py +++ b/databricks-mcp-server/databricks_mcp_server/tools/lakebase.py @@ -116,7 +116,7 @@ def manage_lakebase_database( force=True cascades to children (provisioned). Autoscale deletes all branches/computes/data. Returns: {status, ...}. - See databricks-lakebase-provisioned or databricks-lakebase-autoscale skill for details.""" + See databricks-lakebase-autoscale skill for details.""" act = action.lower() if act == "create_or_update": @@ -235,7 +235,7 @@ def manage_lakebase_sync( Requires table_name. Optional catalog_name to also delete catalog. Returns: {synced_table, catalog (if deleted)}. - See databricks-lakebase-provisioned skill for sync workflows.""" + See databricks-lakebase-autoscale skill for sync workflows.""" act = action.lower() if act == "create_or_update": diff --git a/databricks-skills/README.md b/databricks-skills/README.md index a81730a2..a64cd137 100644 --- a/databricks-skills/README.md +++ b/databricks-skills/README.md @@ -105,7 +105,7 @@ cp -r ai-dev-kit/databricks-skills/databricks-agent-bricks .claude/skills/ - **databricks-app-python** - Python web apps (Dash, Streamlit, Flask) with foundation model integration - **databricks-python-sdk** - Python SDK, Connect, CLI, REST API - **databricks-config** - Profile authentication setup -- **databricks-lakebase-provisioned** - Managed PostgreSQL for OLTP workloads +- **databricks-lakebase-autoscale** - Managed PostgreSQL with autoscaling for OLTP workloads ### 📚 Reference - **databricks-docs** - Documentation index via llms.txt diff --git a/databricks-skills/databricks-app-python/SKILL.md b/databricks-skills/databricks-app-python/SKILL.md index 777d3377..56f76d4b 100644 --- a/databricks-skills/databricks-app-python/SKILL.md +++ b/databricks-skills/databricks-app-python/SKILL.md @@ -207,5 +207,5 @@ class EntityIn(BaseModel): - **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps with FastAPI + React - **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps via DABs - **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - backend SDK integration -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - adding persistent PostgreSQL state +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - adding persistent PostgreSQL state - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving ML models for app integration diff --git a/databricks-skills/databricks-lakebase-autoscale/SKILL.md b/databricks-skills/databricks-lakebase-autoscale/SKILL.md index f471765c..be737be3 100644 --- a/databricks-skills/databricks-lakebase-autoscale/SKILL.md +++ b/databricks-skills/databricks-lakebase-autoscale/SKILL.md @@ -326,7 +326,6 @@ These features are NOT yet supported in Lakebase Autoscaling: ## Related Skills -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - fixed-capacity managed PostgreSQL (predecessor) - **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence - **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend - **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for project management and token generation diff --git a/databricks-skills/databricks-lakebase-provisioned/SKILL.md b/databricks-skills/databricks-lakebase-provisioned/SKILL.md deleted file mode 100644 index 7548219c..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/SKILL.md +++ /dev/null @@ -1,352 +0,0 @@ ---- -name: databricks-lakebase-provisioned -description: "Patterns and best practices for Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. Use when creating Lakebase instances, connecting applications or Databricks Apps to PostgreSQL, implementing reverse ETL via synced tables, storing agent or chat memory, or configuring OAuth authentication for Lakebase." ---- - -# Lakebase Provisioned - -Patterns and best practices for using Lakebase Provisioned (Databricks managed PostgreSQL) for OLTP workloads. - -## When to Use - -Use this skill when: -- Building applications that need a PostgreSQL database for transactional workloads -- Adding persistent state to Databricks Apps -- Implementing reverse ETL from Delta Lake to an operational database -- Storing chat/agent memory for LangChain applications - -## Overview - -Lakebase Provisioned is Databricks' managed PostgreSQL database service for OLTP (Online Transaction Processing) workloads. It provides a fully managed PostgreSQL-compatible database that integrates with Unity Catalog and supports OAuth token-based authentication. - -| Feature | Description | -|---------|-------------| -| **Managed PostgreSQL** | Fully managed instances with automatic provisioning | -| **OAuth Authentication** | Token-based auth via Databricks SDK (1-hour expiry) | -| **Unity Catalog** | Register databases for governance | -| **Reverse ETL** | Sync data from Delta tables to PostgreSQL | -| **Apps Integration** | First-class support in Databricks Apps | - -**Available Regions (AWS):** us-east-1, us-east-2, us-west-2, eu-central-1, eu-west-1, ap-south-1, ap-southeast-1, ap-southeast-2 - -## Quick Start - -Create and connect to a Lakebase Provisioned instance: - -```python -from databricks.sdk import WorkspaceClient -import uuid - -# Initialize client -w = WorkspaceClient() - -# Create a database instance -instance = w.database.create_database_instance( - name="my-lakebase-instance", - capacity="CU_1", # CU_1, CU_2, CU_4, CU_8 - stopped=False -) -print(f"Instance created: {instance.name}") -print(f"DNS endpoint: {instance.read_write_dns}") -``` - -## Common Patterns - -### Generate OAuth Token - -```python -from databricks.sdk import WorkspaceClient -import uuid - -w = WorkspaceClient() - -# Generate OAuth token for database connection -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) -token = cred.token # Use this as password in connection string -``` - -### Connect from Notebook - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -# Get instance details -w = WorkspaceClient() -instance = w.database.get_database_instance(name="my-lakebase-instance") - -# Generate token -cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=["my-lakebase-instance"] -) - -# Connect using psycopg3 -conn_string = f"host={instance.read_write_dns} dbname=postgres user={w.current_user.me().user_name} password={cred.token} sslmode=require" -with psycopg.connect(conn_string) as conn: - with conn.cursor() as cur: - cur.execute("SELECT version()") - print(cur.fetchone()) -``` - -### SQLAlchemy with Token Refresh (Production) - -For long-running applications, tokens must be refreshed (expire after 1 hour): - -```python -import asyncio -import os -import uuid -from sqlalchemy import event -from sqlalchemy.ext.asyncio import create_async_engine, AsyncSession -from sqlalchemy.orm import sessionmaker -from databricks.sdk import WorkspaceClient - -# Token refresh state -_current_token = None -_token_refresh_task = None -TOKEN_REFRESH_INTERVAL = 50 * 60 # 50 minutes (before 1-hour expiry) - -def _generate_token(instance_name: str) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - return cred.token - -async def _token_refresh_loop(instance_name: str): - """Background task to refresh token every 50 minutes.""" - global _current_token - while True: - await asyncio.sleep(TOKEN_REFRESH_INTERVAL) - _current_token = await asyncio.to_thread(_generate_token, instance_name) - -def init_database(instance_name: str, database_name: str, username: str) -> AsyncEngine: - """Initialize database with OAuth token injection.""" - global _current_token - - w = WorkspaceClient() - instance = w.database.get_database_instance(name=instance_name) - - # Generate initial token - _current_token = _generate_token(instance_name) - - # Build URL (password injected via do_connect) - url = f"postgresql+psycopg://{username}@{instance.read_write_dns}:5432/{database_name}" - - engine = create_async_engine( - url, - pool_size=5, - max_overflow=10, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on each connection - @event.listens_for(engine.sync_engine, "do_connect") - def provide_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = _current_token - - return engine -``` - -### Databricks Apps Integration - -For Databricks Apps, use environment variables for configuration: - -```python -# Environment variables set by Databricks Apps: -# - LAKEBASE_INSTANCE_NAME: Instance name -# - LAKEBASE_DATABASE_NAME: Database name -# - LAKEBASE_USERNAME: Username (optional, defaults to service principal) - -import os - -def is_lakebase_configured() -> bool: - """Check if Lakebase is configured for this app.""" - return bool( - os.environ.get("LAKEBASE_PG_URL") or - (os.environ.get("LAKEBASE_INSTANCE_NAME") and - os.environ.get("LAKEBASE_DATABASE_NAME")) - ) -``` - -Add Lakebase as an app resource via CLI: - -```bash -databricks apps add-resource $APP_NAME \ - --resource-type database \ - --resource-name lakebase \ - --database-instance my-lakebase-instance -``` - -### Register with Unity Catalog - -```python -from databricks.sdk import WorkspaceClient - -w = WorkspaceClient() - -# Register database in Unity Catalog -w.database.register_database_instance( - name="my-lakebase-instance", - catalog="my_catalog", - schema="my_schema" -) -``` - -### MLflow Model Resources - -Declare Lakebase as a model resource for automatic credential provisioning: - -```python -from mlflow.models.resources import DatabricksLakebase - -resources = [ - DatabricksLakebase(database_instance_name="my-lakebase-instance"), -] - -# When logging model -mlflow.langchain.log_model( - model, - artifact_path="model", - resources=resources, - pip_requirements=["databricks-langchain[memory]"] -) -``` - -## MCP Tools - -The following MCP tools are available for managing Lakebase infrastructure. Use `type="provisioned"` for Lakebase Provisioned. - -### manage_lakebase_database - Database Management - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Create or update a database | name | -| `get` | Get database details | name | -| `list` | List all databases | (none, optional type filter) | -| `delete` | Delete database and resources | name | - -**Example usage:** -```python -# Create a provisioned database -manage_lakebase_database( - action="create_or_update", - name="my-lakebase-instance", - type="provisioned", - capacity="CU_1" -) - -# Get database details -manage_lakebase_database(action="get", name="my-lakebase-instance", type="provisioned") - -# List all databases -manage_lakebase_database(action="list") - -# Delete with cascade -manage_lakebase_database(action="delete", name="my-lakebase-instance", type="provisioned", force=True) -``` - -### manage_lakebase_sync - Reverse ETL - -| Action | Description | Required Params | -|--------|-------------|-----------------| -| `create_or_update` | Set up reverse ETL from Delta to Lakebase | instance_name, source_table_name, target_table_name | -| `delete` | Remove synced table (and optionally catalog) | table_name | - -**Example usage:** -```python -# Set up reverse ETL -manage_lakebase_sync( - action="create_or_update", - instance_name="my-lakebase-instance", - source_table_name="catalog.schema.delta_table", - target_table_name="lakebase_catalog.schema.postgres_table", - scheduling_policy="TRIGGERED" # or SNAPSHOT, CONTINUOUS -) - -# Delete synced table -manage_lakebase_sync(action="delete", table_name="lakebase_catalog.schema.postgres_table") -``` - -### generate_lakebase_credential - OAuth Tokens - -Generate OAuth token (~1hr) for PostgreSQL connections. Use as password with `sslmode=require`. - -```python -# For provisioned instances -generate_lakebase_credential(instance_names=["my-lakebase-instance"]) -``` - -## Reference Files - -- [connection-patterns.md](connection-patterns.md) - Detailed connection patterns for different use cases -- [reverse-etl.md](reverse-etl.md) - Syncing data from Delta Lake to Lakebase - -## CLI Quick Reference - -```bash -# Create instance -databricks database create-database-instance \ - --name my-lakebase-instance \ - --capacity CU_1 - -# Get instance details -databricks database get-database-instance --name my-lakebase-instance - -# Generate credentials -databricks database generate-database-credential \ - --request-id $(uuidgen) \ - --json '{"instance_names": ["my-lakebase-instance"]}' - -# List instances -databricks database list-database-instances - -# Stop instance (saves cost) -databricks database stop-database-instance --name my-lakebase-instance - -# Start instance -databricks database start-database-instance --name my-lakebase-instance -``` - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Token expired during long query** | Implement token refresh loop (see SQLAlchemy with Token Refresh section); tokens expire after 1 hour | -| **DNS resolution fails on macOS** | Use `dig` command to resolve hostname, pass `hostaddr` to psycopg | -| **Connection refused** | Ensure instance is not stopped; check `instance.state` | -| **Permission denied** | User must be granted access to the Lakebase instance | -| **SSL required error** | Always use `sslmode=require` in connection string | - -## SDK Version Requirements - -- **Databricks SDK for Python**: >= 0.61.0 (0.81.0+ recommended for full API support) -- **psycopg**: 3.x (supports `hostaddr` parameter for DNS workaround) -- **SQLAlchemy**: 2.x with `postgresql+psycopg` driver - -```python -%pip install -U "databricks-sdk>=0.81.0" "psycopg[binary]>=3.0" sqlalchemy -``` - -## Notes - -- **Capacity values** use compute unit sizing: `CU_1`, `CU_2`, `CU_4`, `CU_8`. -- **Lakebase Autoscaling** is a newer offering with automatic scaling but limited regional availability. This skill focuses on **Lakebase Provisioned** which is more widely available. -- For memory/state in LangChain agents, use `databricks-langchain[memory]` which includes Lakebase support. -- Tokens are short-lived (1 hour) - production apps MUST implement token refresh. - -## Related Skills - -- **[databricks-app-apx](../databricks-app-apx/SKILL.md)** - full-stack apps that can use Lakebase for persistence -- **[databricks-app-python](../databricks-app-python/SKILL.md)** - Python apps with Lakebase backend -- **[databricks-python-sdk](../databricks-python-sdk/SKILL.md)** - SDK used for instance management and token generation -- **[databricks-bundles](../databricks-bundles/SKILL.md)** - deploying apps with Lakebase resources -- **[databricks-jobs](../databricks-jobs/SKILL.md)** - scheduling reverse ETL sync jobs diff --git a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md b/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md deleted file mode 100644 index e6843548..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/connection-patterns.md +++ /dev/null @@ -1,279 +0,0 @@ -# Lakebase Connection Patterns - -## Overview - -This document covers different connection patterns for Lakebase Provisioned, from simple scripts to production applications with token refresh. - -## Connection Methods - -### 1. Direct psycopg Connection (Simple Scripts) - -For one-off scripts or notebooks: - -```python -import psycopg -from databricks.sdk import WorkspaceClient -import uuid - -def get_connection(instance_name: str, database_name: str = "postgres"): - """Get a database connection with fresh OAuth token.""" - w = WorkspaceClient() - - # Get instance details - instance = w.database.get_database_instance(name=instance_name) - - # Generate OAuth token (valid for 1 hour) - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[instance_name] - ) - - # Build connection string - conn_string = ( - f"host={instance.read_write_dns} " - f"dbname={database_name} " - f"user={w.current_user.me().user_name} " - f"password={cred.token} " - f"sslmode=require" - ) - - return psycopg.connect(conn_string) - -# Usage -with get_connection("my-instance") as conn: - with conn.cursor() as cur: - cur.execute("SELECT NOW()") - print(cur.fetchone()) -``` - -### 2. Connection Pool with Token Refresh (Production) - -For long-running applications that need connection pooling: - -```python -import asyncio -import uuid -from contextlib import asynccontextmanager -from typing import AsyncGenerator, Optional - -from sqlalchemy import event -from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker -from databricks.sdk import WorkspaceClient - -class LakebaseConnectionManager: - """Manages Lakebase connections with automatic token refresh.""" - - def __init__( - self, - instance_name: str, - database_name: str, - pool_size: int = 5, - max_overflow: int = 10, - token_refresh_seconds: int = 3000 # 50 minutes - ): - self.instance_name = instance_name - self.database_name = database_name - self.pool_size = pool_size - self.max_overflow = max_overflow - self.token_refresh_seconds = token_refresh_seconds - - self._current_token: Optional[str] = None - self._refresh_task: Optional[asyncio.Task] = None - self._engine = None - self._session_maker = None - - def _generate_token(self) -> str: - """Generate fresh OAuth token.""" - w = WorkspaceClient() - cred = w.database.generate_database_credential( - request_id=str(uuid.uuid4()), - instance_names=[self.instance_name] - ) - return cred.token - - async def _refresh_loop(self): - """Background task to refresh token periodically.""" - while True: - await asyncio.sleep(self.token_refresh_seconds) - try: - self._current_token = await asyncio.to_thread(self._generate_token) - except Exception as e: - print(f"Token refresh failed: {e}") - - def initialize(self): - """Initialize database engine and start token refresh.""" - w = WorkspaceClient() - - # Get instance info - instance = w.database.get_database_instance(name=self.instance_name) - username = w.current_user.me().user_name - - # Generate initial token - self._current_token = self._generate_token() - - # Create engine (password injected via event) - url = ( - f"postgresql+psycopg://{username}@" - f"{instance.read_write_dns}:5432/{self.database_name}" - ) - - self._engine = create_async_engine( - url, - pool_size=self.pool_size, - max_overflow=self.max_overflow, - pool_recycle=3600, - connect_args={"sslmode": "require"} - ) - - # Inject token on connect - @event.listens_for(self._engine.sync_engine, "do_connect") - def inject_token(dialect, conn_rec, cargs, cparams): - cparams["password"] = self._current_token - - self._session_maker = async_sessionmaker( - self._engine, - class_=AsyncSession, - expire_on_commit=False - ) - - def start_refresh(self): - """Start background token refresh task.""" - if not self._refresh_task: - self._refresh_task = asyncio.create_task(self._refresh_loop()) - - async def stop_refresh(self): - """Stop token refresh task.""" - if self._refresh_task: - self._refresh_task.cancel() - try: - await self._refresh_task - except asyncio.CancelledError: - pass - self._refresh_task = None - - @asynccontextmanager - async def session(self) -> AsyncGenerator[AsyncSession, None]: - """Get a database session.""" - async with self._session_maker() as session: - yield session - - async def close(self): - """Close all connections.""" - await self.stop_refresh() - if self._engine: - await self._engine.dispose() - -# Usage in FastAPI -from fastapi import FastAPI - -app = FastAPI() -db_manager = LakebaseConnectionManager("my-instance", "my_database") - -@app.on_event("startup") -async def startup(): - db_manager.initialize() - db_manager.start_refresh() - -@app.on_event("shutdown") -async def shutdown(): - await db_manager.close() - -@app.get("/data") -async def get_data(): - async with db_manager.session() as session: - result = await session.execute("SELECT * FROM my_table") - return result.fetchall() -``` - -### 3. Static URL Mode (Local Development) - -For local development, use a static connection URL: - -```python -import os -from sqlalchemy.ext.asyncio import create_async_engine - -# Set environment variable with full connection URL -# LAKEBASE_PG_URL=postgresql://user:password@host:5432/database - -def get_database_url() -> str: - """Get database URL from environment.""" - url = os.environ.get("LAKEBASE_PG_URL") - if url and url.startswith("postgresql://"): - # Convert to psycopg3 async driver - url = url.replace("postgresql://", "postgresql+psycopg://", 1) - return url - -engine = create_async_engine( - get_database_url(), - pool_size=5, - connect_args={"sslmode": "require"} -) -``` - -### 4. DNS Resolution Workaround (macOS) - -Python's `socket.getaddrinfo()` fails with long hostnames on macOS. Use `dig` as fallback: - -```python -import subprocess -import socket - -def resolve_hostname(hostname: str) -> str: - """Resolve hostname using dig command (macOS workaround).""" - try: - # Try Python's resolver first - return socket.gethostbyname(hostname) - except socket.gaierror: - pass - - # Fallback to dig command - try: - result = subprocess.run( - ["dig", "+short", hostname], - capture_output=True, - text=True, - timeout=5 - ) - ips = result.stdout.strip().split('\n') - for ip in ips: - if ip and not ip.startswith(';'): - return ip - except Exception: - pass - - raise RuntimeError(f"Could not resolve hostname: {hostname}") - -# Use with psycopg -conn_params = { - "host": hostname, # For TLS SNI - "hostaddr": resolve_hostname(hostname), # Actual IP - "dbname": database_name, - "user": username, - "password": token, - "sslmode": "require" -} -conn = psycopg.connect(**conn_params) -``` - -## Environment Variables - -| Variable | Description | Required | -|----------|-------------|----------| -| `LAKEBASE_PG_URL` | Static PostgreSQL URL (local dev) | Either this OR instance/database | -| `LAKEBASE_INSTANCE_NAME` | Lakebase instance name | With DATABASE_NAME | -| `LAKEBASE_DATABASE_NAME` | Database name | With INSTANCE_NAME | -| `LAKEBASE_USERNAME` | Override username | No | -| `LAKEBASE_HOST` | Override host | No | -| `DB_POOL_SIZE` | Connection pool size | No (default: 5) | -| `DB_MAX_OVERFLOW` | Max pool overflow | No (default: 10) | -| `DB_POOL_RECYCLE_INTERVAL` | Pool recycle seconds | No (default: 3600) | - -## Best Practices - -1. **Always use SSL**: Set `sslmode=require` in all connections -2. **Implement token refresh**: Tokens expire after 1 hour; refresh at 50 minutes -3. **Use connection pooling**: Avoid creating new connections per request -4. **Handle DNS issues on macOS**: Use the `hostaddr` workaround if needed -5. **Close connections properly**: Use context managers or explicit cleanup -6. **Log token refresh events**: Helps debug authentication issues diff --git a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md b/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md deleted file mode 100644 index 5b5caef4..00000000 --- a/databricks-skills/databricks-lakebase-provisioned/reverse-etl.md +++ /dev/null @@ -1,171 +0,0 @@ -# Reverse ETL with Lakebase Provisioned - -## Overview - -Reverse ETL allows you to sync data from Unity Catalog Delta tables into Lakebase Provisioned as PostgreSQL tables. This enables OLTP access patterns on data processed in the Lakehouse. - -## Sync Modes - -| Mode | Description | Best For | Notes | -|------|-------------|----------|-------| -| **Snapshot** | One-time full copy | Initial setup, small tables | 10x more efficient if modifying >10% of data | -| **Triggered** | Scheduled updates on demand | Dashboards updated hourly/daily | Requires CDF on source table | -| **Continuous** | Real-time streaming (seconds of latency) | Live applications | Highest cost, minimum 15s intervals, requires CDF | - -**Note:** Triggered and Continuous modes require Change Data Feed (CDF) enabled on the source table: - -```sql -ALTER TABLE your_catalog.your_schema.your_table -SET TBLPROPERTIES (delta.enableChangeDataFeed = true) -``` - -## Creating Synced Tables - -### Using Python SDK - -```python -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.database import ( - SyncedDatabaseTable, - SyncedTableSpec, - SyncedTableSchedulingPolicy, -) - -w = WorkspaceClient() - -# Create a synced table from Unity Catalog to Lakebase Provisioned -synced_table = w.database.create_synced_database_table( - SyncedDatabaseTable( - name="lakebase_catalog.schema.synced_table", - database_instance_name="my-lakebase-instance", - spec=SyncedTableSpec( - source_table_full_name="analytics.gold.user_profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -print(f"Created synced table: {synced_table.name}") -``` - -**Key parameters:** - -| Parameter | Description | -|-----------|-------------| -| `name` | Fully qualified target table name (catalog.schema.table) | -| `database_instance_name` | Lakebase Provisioned instance name | -| `source_table_full_name` | Fully qualified source Delta table (catalog.schema.table) | -| `primary_key_columns` | List of primary key columns from the source table | -| `scheduling_policy` | `SNAPSHOT`, `TRIGGERED`, or `CONTINUOUS` | - -### Using CLI - -```bash -databricks database create-synced-database-table \ - --json '{ - "name": "lakebase_catalog.schema.synced_table", - "database_instance_name": "my-lakebase-instance", - "spec": { - "source_table_full_name": "analytics.gold.user_profiles", - "primary_key_columns": ["user_id"], - "scheduling_policy": "TRIGGERED" - } - }' -``` - -**Note:** There is no SQL syntax for creating synced tables. Use the Python SDK, CLI, or Catalog Explorer UI. - -## Checking Synced Table Status - -```python -status = w.database.get_synced_database_table(name="lakebase_catalog.schema.synced_table") -print(f"State: {status.data_synchronization_status.detailed_state}") -print(f"Message: {status.data_synchronization_status.message}") -``` - -## Deleting a Synced Table - -Delete from both Unity Catalog and Postgres: - -1. **Unity Catalog:** Delete via Catalog Explorer or SDK -2. **Postgres:** Drop the table to free storage - -```python -# Delete the synced table via SDK -w.database.delete_synced_database_table(name="lakebase_catalog.schema.synced_table") -``` - -```sql --- Drop the Postgres table to free storage -DROP TABLE your_database.your_schema.your_table; -``` - -## Use Cases - -### 1. Product Catalog for Web App - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ecommerce_catalog.public.products", - database_instance_name="ecommerce-db", - spec=SyncedTableSpec( - source_table_full_name="gold.products.catalog", - primary_key_columns=["product_id"], - scheduling_policy=SyncedTableSchedulingPolicy.TRIGGERED, - ), - ) -) -# Application queries PostgreSQL directly with low-latency point lookups -``` - -### 2. User Profiles for Authentication - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="auth_catalog.public.user_profiles", - database_instance_name="auth-db", - spec=SyncedTableSpec( - source_table_full_name="gold.users.profiles", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -``` - -### 3. Feature Store for Real-time ML - -```python -w.database.create_synced_database_table( - SyncedDatabaseTable( - name="ml_catalog.public.user_features", - database_instance_name="feature-store-db", - spec=SyncedTableSpec( - source_table_full_name="ml.features.user_features", - primary_key_columns=["user_id"], - scheduling_policy=SyncedTableSchedulingPolicy.CONTINUOUS, - ), - ) -) -# ML model queries features with low latency -``` - -## Best Practices - -1. **Enable CDF** on source tables before creating Triggered or Continuous synced tables -2. **Choose appropriate sync mode**: Snapshot for small tables or one-time loads, Triggered for hourly/daily refreshes, Continuous for real-time -3. **Monitor sync status**: Check for failures and latency via Catalog Explorer or `get_synced_database_table()` -4. **Index target tables**: Create appropriate indexes in PostgreSQL for your query patterns -5. **Handle schema changes**: Only additive changes (e.g., adding columns) are supported for Triggered/Continuous modes -6. **Account for connection limits**: Each synced table uses up to 16 connections - -## Common Issues - -| Issue | Solution | -|-------|----------| -| **Sync fails with CDF error** | Enable Change Data Feed on source table before using Triggered or Continuous mode | -| **Schema mismatch** | Only additive schema changes are supported; for breaking changes, delete and recreate the synced table | -| **Sync takes too long** | Switch to Triggered mode for scheduled updates; use Snapshot for initial bulk loads | -| **Target table locked** | Avoid DDL on target during sync operations | diff --git a/databricks-skills/databricks-python-sdk/SKILL.md b/databricks-skills/databricks-python-sdk/SKILL.md index eaf7cd66..b322c00e 100644 --- a/databricks-skills/databricks-python-sdk/SKILL.md +++ b/databricks-skills/databricks-python-sdk/SKILL.md @@ -622,4 +622,4 @@ If I'm unsure about a method, I should: - **[databricks-unity-catalog](../databricks-unity-catalog/SKILL.md)** - catalog governance - **[databricks-model-serving](../databricks-model-serving/SKILL.md)** - serving endpoint management - **[databricks-vector-search](../databricks-vector-search/SKILL.md)** - vector index operations -- **[databricks-lakebase-provisioned](../databricks-lakebase-provisioned/SKILL.md)** - managed PostgreSQL via SDK +- **[databricks-lakebase-autoscale](../databricks-lakebase-autoscale/SKILL.md)** - managed PostgreSQL via SDK diff --git a/databricks-skills/install_skills.sh b/databricks-skills/install_skills.sh index 0fc2e1d2..563328a5 100755 --- a/databricks-skills/install_skills.sh +++ b/databricks-skills/install_skills.sh @@ -47,7 +47,7 @@ MLFLOW_REPO_RAW_URL="https://raw.githubusercontent.com/mlflow/skills" MLFLOW_REPO_REF="main" # Databricks skills (hosted in this repo) -DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +DATABRICKS_SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-bundles databricks-app-python databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-execution-compute databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -83,7 +83,6 @@ get_skill_description() { "databricks-execution-compute") echo "Execute code and manage compute on Databricks - serverless, clusters, and SQL warehouses" ;; "databricks-unity-catalog") echo "System tables for lineage, audit, billing" ;; "databricks-lakebase-autoscale") echo "Lakebase Autoscale - managed PostgreSQL with autoscaling" ;; - "databricks-lakebase-provisioned") echo "Lakebase Provisioned - data connections and reverse ETL" ;; "databricks-metric-views") echo "Unity Catalog Metric Views - governed business metrics in YAML" ;; "databricks-model-serving") echo "Model Serving - deploy MLflow models and AI agents" ;; "databricks-mlflow-evaluation") echo "MLflow evaluation and trace analysis" ;; @@ -124,7 +123,6 @@ get_skill_extra_files() { "databricks-python-sdk") echo "doc-index.md examples/1-authentication.py examples/2-clusters-and-jobs.py examples/3-sql-and-warehouses.py examples/4-unity-catalog.py examples/5-serving-and-vector-search.py" ;; "databricks-unity-catalog") echo "5-system-tables.md" ;; "databricks-lakebase-autoscale") echo "projects.md branches.md computes.md connection-patterns.md reverse-etl.md" ;; - "databricks-lakebase-provisioned") echo "connection-patterns.md reverse-etl.md" ;; "databricks-metric-views") echo "yaml-reference.md patterns.md" ;; "databricks-model-serving") echo "1-classical-ml.md 2-custom-pyfunc.md 3-genai-agents.md 4-tools-integration.md 5-development-testing.md 6-logging-registration.md 7-deployment.md 8-querying-endpoints.md 9-package-requirements.md" ;; "databricks-mlflow-evaluation") echo "references/CRITICAL-interfaces.md references/GOTCHAS.md references/patterns-context-optimization.md references/patterns-datasets.md references/patterns-evaluation.md references/patterns-scorers.md references/patterns-trace-analysis.md references/user-journeys.md" ;; diff --git a/install.ps1 b/install.ps1 index 892c8854..9d5f99eb 100644 --- a/install.ps1 +++ b/install.ps1 @@ -82,7 +82,7 @@ $script:Channel = if ($env:DEVKIT_CHANNEL) { $env:DEVKIT_CHANNEL } else { " $script:Skills = @( "databricks-agent-bricks", "databricks-aibi-dashboards", "databricks-app-python", "databricks-bundles", "databricks-config", "databricks-dbsql", "databricks-docs", "databricks-genie", - "databricks-iceberg", "databricks-jobs", "databricks-lakebase-autoscale", "databricks-lakebase-provisioned", + "databricks-iceberg", "databricks-jobs", "databricks-lakebase-autoscale", "databricks-metric-views", "databricks-mlflow-evaluation", "databricks-model-serving", "databricks-ai-functions", "databricks-python-sdk", "databricks-spark-declarative-pipelines", "databricks-spark-structured-streaming", "databricks-synthetic-data-gen", "databricks-unity-catalog", "databricks-unstructured-pdf-generation", @@ -125,7 +125,7 @@ $script:ProfileAiMlMlflow = @( ) $script:ProfileAppDeveloper = @( "databricks-app-python", "databricks-app-apx", "databricks-lakebase-autoscale", - "databricks-lakebase-provisioned", "databricks-model-serving", "databricks-dbsql", + "databricks-model-serving", "databricks-dbsql", "databricks-jobs", "databricks-bundles" ) @@ -1137,7 +1137,6 @@ function Invoke-PromptCustomSkills { @{ Label = "Unstructured PDF"; Value = "databricks-unstructured-pdf-generation"; State = ($preselected -contains "databricks-unstructured-pdf-generation"); Hint = "Synthetic PDFs for RAG" } @{ Label = "Synthetic Data"; Value = "databricks-synthetic-data-gen"; State = ($preselected -contains "databricks-synthetic-data-gen"); Hint = "Generate test data" } @{ Label = "Lakebase Autoscale"; Value = "databricks-lakebase-autoscale"; State = ($preselected -contains "databricks-lakebase-autoscale"); Hint = "Managed PostgreSQL" } - @{ Label = "Lakebase Provisioned"; Value = "databricks-lakebase-provisioned"; State = ($preselected -contains "databricks-lakebase-provisioned"); Hint = "Provisioned PostgreSQL" } @{ Label = "App Python"; Value = "databricks-app-python"; State = ($preselected -contains "databricks-app-python"); Hint = "Dash, Streamlit, Flask" } @{ Label = "App APX"; Value = "databricks-app-apx"; State = ($preselected -contains "databricks-app-apx"); Hint = "FastAPI + React" } @{ Label = "MLflow Onboarding"; Value = "mlflow-onboarding"; State = ($preselected -contains "mlflow-onboarding"); Hint = "Getting started" } diff --git a/install.sh b/install.sh index 935f4810..c3d2d266 100644 --- a/install.sh +++ b/install.sh @@ -89,7 +89,7 @@ MIN_SDK_VERSION="0.85.0" G='\033[0;32m' Y='\033[1;33m' R='\033[0;31m' BL='\033[0;34m' B='\033[1m' D='\033[2m' N='\033[0m' # Databricks skills (bundled in repo) -SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-app-python databricks-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" +SKILLS="databricks-agent-bricks databricks-ai-functions databricks-aibi-dashboards databricks-app-python databricks-bundles databricks-config databricks-dbsql databricks-docs databricks-genie databricks-iceberg databricks-jobs databricks-lakebase-autoscale databricks-metric-views databricks-mlflow-evaluation databricks-model-serving databricks-python-sdk databricks-spark-declarative-pipelines databricks-spark-structured-streaming databricks-synthetic-data-gen databricks-unity-catalog databricks-unstructured-pdf-generation databricks-vector-search databricks-zerobus-ingest spark-python-data-source" # MLflow skills (fetched from mlflow/skills repo) MLFLOW_SKILLS="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" @@ -108,7 +108,7 @@ PROFILE_DATA_ENGINEER="databricks-spark-declarative-pipelines databricks-spark-s PROFILE_ANALYST="databricks-aibi-dashboards databricks-dbsql databricks-genie databricks-metric-views" PROFILE_AIML_ENGINEER="databricks-agent-bricks databricks-ai-functions databricks-vector-search databricks-model-serving databricks-genie databricks-unstructured-pdf-generation databricks-mlflow-evaluation databricks-synthetic-data-gen databricks-jobs" PROFILE_AIML_MLFLOW="agent-evaluation analyze-mlflow-chat-session analyze-mlflow-trace instrumenting-with-mlflow-tracing mlflow-onboarding querying-mlflow-metrics retrieving-mlflow-traces searching-mlflow-docs" -PROFILE_APP_DEVELOPER="databricks-app-python databricks-app-apx databricks-lakebase-autoscale databricks-lakebase-provisioned databricks-model-serving databricks-dbsql databricks-jobs databricks-bundles" +PROFILE_APP_DEVELOPER="databricks-app-python databricks-app-apx databricks-lakebase-autoscale databricks-model-serving databricks-dbsql databricks-jobs databricks-bundles" # Selected skills (populated during profile selection) SELECTED_SKILLS="" @@ -950,7 +950,6 @@ prompt_custom_skills() { "Unstructured PDF|databricks-unstructured-pdf-generation|$(_is_preselected databricks-unstructured-pdf-generation)|Synthetic PDFs for RAG" \ "Synthetic Data|databricks-synthetic-data-gen|$(_is_preselected databricks-synthetic-data-gen)|Generate test data" \ "Lakebase Autoscale|databricks-lakebase-autoscale|$(_is_preselected databricks-lakebase-autoscale)|Managed PostgreSQL" \ - "Lakebase Provisioned|databricks-lakebase-provisioned|$(_is_preselected databricks-lakebase-provisioned)|Provisioned PostgreSQL" \ "App Python|databricks-app-python|$(_is_preselected databricks-app-python)|Dash, Streamlit, Flask" \ "App APX|databricks-app-apx|$(_is_preselected databricks-app-apx)|FastAPI + React" \ "MLflow Onboarding|mlflow-onboarding|$(_is_preselected mlflow-onboarding)|Getting started" \