diff --git a/docs/_snippets/_users-and-roles-common.md b/docs/_snippets/_users-and-roles-common.md index 29726229be9..964f9a8b40d 100644 --- a/docs/_snippets/_users-and-roles-common.md +++ b/docs/_snippets/_users-and-roles-common.md @@ -42,64 +42,73 @@ Create these tables and users to be used in the examples. #### Creating a sample database, table, and rows {#creating-a-sample-database-table-and-rows} -1. Create a test database + - ```sql - CREATE DATABASE db1; - ``` +##### Create a test database {#create-a-test-database} -2. Create a table +```sql +CREATE DATABASE db1; +``` - ```sql - CREATE TABLE db1.table1 ( - id UInt64, - column1 String, - column2 String - ) - ENGINE MergeTree - ORDER BY id; - ``` +##### Create a table {#create-a-table} -3. Populate the table with sample rows +```sql +CREATE TABLE db1.table1 ( + id UInt64, + column1 String, + column2 String +) +ENGINE MergeTree +ORDER BY id; +``` - ```sql - INSERT INTO db1.table1 - (id, column1, column2) - VALUES - (1, 'A', 'abc'), - (2, 'A', 'def'), - (3, 'B', 'abc'), - (4, 'B', 'def'); - ``` +##### Populate the table with sample rows {#populate} -4. Verify the table: +```sql +INSERT INTO db1.table1 + (id, column1, column2) +VALUES + (1, 'A', 'abc'), + (2, 'A', 'def'), + (3, 'B', 'abc'), + (4, 'B', 'def'); +``` - ```sql - SELECT * - FROM db1.table1 - ``` +##### Verify the table {#verify} - ```response - Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 +```sql title="Query" +SELECT * +FROM db1.table1 +``` - ┌─id─┬─column1─┬─column2─┐ - │ 1 │ A │ abc │ - │ 2 │ A │ def │ - │ 3 │ B │ abc │ - │ 4 │ B │ def │ - └────┴─────────┴─────────┘ - ``` +```response title="Response" +Query id: 475015cc-6f51-4b20-bda2-3c9c41404e49 -5. Create a regular user that will be used to demonstrate restrict access to certain columns: +┌─id─┬─column1─┬─column2─┐ +│ 1 │ A │ abc │ +│ 2 │ A │ def │ +│ 3 │ B │ abc │ +│ 4 │ B │ def │ +└────┴─────────┴─────────┘ +``` - ```sql - CREATE USER column_user IDENTIFIED BY 'password'; - ``` +##### Create `column_user` {#create-a-user-with-restricted-access-to-columns} -6. Create a regular user that will be used to demonstrate restricting access to rows with certain values: - ```sql - CREATE USER row_user IDENTIFIED BY 'password'; - ``` +Create a regular user that will be used to demonstrate restrict access to certain columns: + +```sql +CREATE USER column_user IDENTIFIED BY 'password'; +``` + +##### Create `row_user` {#create-a-user-with-restricted-access-to-rows-with-certain-values} + +Create a regular user that will be used to demonstrate restricting access to rows with certain values: + +```sql +CREATE USER row_user IDENTIFIED BY 'password'; +``` + + #### Creating roles {#creating-roles} diff --git a/docs/best-practices/_snippets/_table_of_contents.md b/docs/best-practices/_snippets/_table_of_contents.md new file mode 100644 index 00000000000..9e0d34ef2d1 --- /dev/null +++ b/docs/best-practices/_snippets/_table_of_contents.md @@ -0,0 +1,12 @@ +| Page | Description | +|--------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------| +| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | How to select primary keys that maximize query performance and minimize storage overhead. | +| [Select Data Types](/best-practices/select-data-types) | Choose optimal data types to reduce memory usage, improve compression, and accelerate queries. 
| +| [Use Materialized Views](/best-practices/use-materialized-views) | Leverage materialized views to pre-aggregate data and dramatically speed up analytical queries. | +| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins) | Best practices for using ClickHouse's `JOIN` capabilities efficiently. | +| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | Select partitioning strategies that enable efficient data pruning and faster query execution. | +| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Optimize data ingestion throughput and reduce resource consumption with proper insert patterns. | +| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | Apply secondary indices strategically to skip irrelevant data blocks and accelerate filtered queries. | +| [Avoid Mutations](/best-practices/avoid-mutations) | Design schemas and workflows that eliminate costly `UPDATE`/`DELETE` operations for better performance. | +| [Avoid OPTIMIZE FINAL](/best-practices/avoid-optimize-final) | Prevent performance bottlenecks by understanding when `OPTIMIZE FINAL` hurts more than it helps. | +| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Balance flexibility and performance when working with semi-structured JSON data in ClickHouse. | \ No newline at end of file diff --git a/docs/best-practices/index.md b/docs/best-practices/index.md index 5a3ae78ab5f..b4721106510 100644 --- a/docs/best-practices/index.md +++ b/docs/best-practices/index.md @@ -6,19 +6,10 @@ hide_title: true description: 'Landing page for Best Practices section in ClickHouse' --- +import TableOfContents from '@site/docs/best-practices/_snippets/_table_of_contents.md'; + # Best Practices in ClickHouse {#best-practices-in-clickhouse} This section provides the best practices you will want to follow to get the most out of ClickHouse. -| Page | Description | -|----------------------------------------------------------------------|--------------------------------------------------------------------------| -| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | Guidance on selecting an effective Primary Key in ClickHouse. | -| [Select Data Types](/best-practices/select-data-types) | Recommendations for choosing appropriate data types. | -| [Use Materialized Views](/best-practices/use-materialized-views) | When and how to benefit from materialized views. | -| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins)| Best practices for minimizing and optimizing JOIN operations. | -| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | How to choose and apply partitioning keys effectively. | -| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Strategies for efficient data insertion in ClickHouse. | -| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | When to apply data skipping indices for performance gains. | -| [Avoid Mutations](/best-practices/avoid-mutations) | Reasons to avoid mutations and how to design without them. | -| [Avoid OPTIMIZE FINAL](/best-practices/avoid-optimize-final) | Why `OPTIMIZE FINAL` can be costly and how to work around it. | -| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Considerations for using JSON columns in ClickHouse. 
| + \ No newline at end of file diff --git a/docs/cloud-index.md b/docs/cloud-index.md deleted file mode 100644 index 911b6d139ff..00000000000 --- a/docs/cloud-index.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -slug: /cloud/overview -keywords: ['AWS', 'Cloud', 'serverless'] -title: 'Overview' -hide_title: true -description: 'Overview page for Cloud' ---- - -import Content from '@site/docs/about-us/cloud.md'; - - diff --git a/docs/cloud/_snippets/_clickpipes_faq.md b/docs/cloud/_snippets/_clickpipes_faq.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docs/cloud/_snippets/_security_table_of_contents.md b/docs/cloud/_snippets/_security_table_of_contents.md new file mode 100644 index 00000000000..9ff837bb8a9 --- /dev/null +++ b/docs/cloud/_snippets/_security_table_of_contents.md @@ -0,0 +1,8 @@ +| Page | Description | +|---------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------| +| [Shared Responsibility Model](/cloud/security/shared-responsibility-model) | Understand how security responsibilities are divided between ClickHouse Cloud and your organization for different service types. | +| [Cloud Access Management](/cloud/security/cloud-access-management) | Manage user access with authentication, single sign-on (SSO), role-based permissions, and team invitations. | +| [Connectivity](/cloud/security/connectivity) | Configure secure network access including IP allow-lists, private networking, S3 data access, and Cloud IP address management. | +| [Enhanced Encryption](/cloud/security/cmek) | Learn about default AES 256 encryption and how to enable Transparent Data Encryption (TDE) for additional data protection at rest. | +| [Audit Logging](/cloud/security/audit-logging) | Set up and use audit logging to track and monitor activities in your ClickHouse Cloud environment. | +| [Privacy and Compliance](/cloud/security/privacy-compliance-overview) | Review security certifications, compliance standards, and learn how to manage your personal information and data rights. | \ No newline at end of file diff --git a/docs/cloud/manage/api/api-overview.md b/docs/cloud/api/api-overview.md similarity index 97% rename from docs/cloud/manage/api/api-overview.md rename to docs/cloud/api/api-overview.md index ab0484d0c5c..95a75a1c886 100644 --- a/docs/cloud/manage/api/api-overview.md +++ b/docs/cloud/api/api-overview.md @@ -15,7 +15,7 @@ organizations and services on ClickHouse Cloud. Using our Cloud API, you can create and manage services, provision API keys, add or remove members in your organization, and more. -[Learn how to create your first API key and start using the ClickHouse Cloud API.](/cloud/manage/openapi.md) +[Learn how to create your first API key and start using the ClickHouse Cloud API.](/cloud/manage/openapi) ## Swagger (OpenAPI) Endpoint and UI {#swagger-openapi-endpoint-and-ui} @@ -56,7 +56,8 @@ If your organization has been migrated to one of the [new pricing plans](https:/ You will now also be able to specify the `num_replicas` field as a property of the service resource. ::: -## Terraform and OpenAPI New Pricing: Replica Settings Explained +## Terraform and OpenAPI New Pricing: Replica Settings Explained {#terraform-and-openapi-new-pricing---replica-settings-explained} + The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. 
For the Scale and the Enterprise tiers it is possible to adjust it by passing a `numReplicas` field in the service creation request. The value of the `numReplicas` field must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. diff --git a/docs/cloud/manage/api/index.md b/docs/cloud/api/index.md similarity index 100% rename from docs/cloud/manage/api/index.md rename to docs/cloud/api/index.md diff --git a/docs/cloud/manage/openapi.md b/docs/cloud/api/openapi.md similarity index 99% rename from docs/cloud/manage/openapi.md rename to docs/cloud/api/openapi.md index 919cb38cc48..6e9b0d4fad3 100644 --- a/docs/cloud/manage/openapi.md +++ b/docs/cloud/api/openapi.md @@ -17,7 +17,7 @@ import Image from '@theme/IdealImage'; ClickHouse Cloud provides an API utilizing OpenAPI that allows you to programmatically manage your account and aspects of your services. :::note -This document covers the ClickHouse Cloud API. For database API endpoints, please see [Cloud Endpoints API](/cloud/get-started/query-endpoints.md) +This document covers the ClickHouse Cloud API. For database API endpoints, please see [Cloud Endpoints API](/cloud/get-started/query-endpoints) ::: 1. You can use the **API Keys** tab on the left menu to create and manage your API keys. diff --git a/docs/cloud/manage/postman.md b/docs/cloud/api/postman.md similarity index 100% rename from docs/cloud/manage/postman.md rename to docs/cloud/api/postman.md diff --git a/docs/cloud/bestpractices/index.md b/docs/cloud/bestpractices/index.md deleted file mode 100644 index 550f2901bc4..00000000000 --- a/docs/cloud/bestpractices/index.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -slug: /cloud/bestpractices -keywords: ['Cloud', 'Best Practices', 'Bulk Inserts', 'Asynchronous Inserts', 'Avoid mutations', 'Avoid nullable columns', 'Avoid Optimize Final', 'Low Cardinality Partitioning Key', 'Multi Tenancy', 'Usage Limits'] -title: 'Overview' -hide_title: true -description: 'Landing page for Best Practices section in ClickHouse Cloud' ---- - -# Best Practices in ClickHouse Cloud {#best-practices-in-clickhouse-cloud} - -This section provides best practices you will want to follow to get the most out of ClickHouse Cloud. - -| Page | Description | -|----------------------------------------------------------|----------------------------------------------------------------------------| -| [Usage Limits](/cloud/bestpractices/usage-limits)| Explore the limits of ClickHouse. | -| [Multi tenancy](/cloud/bestpractices/multi-tenancy)| Learn about different strategies to implement multi-tenancy. | - -These are in addition to the standard best practices which apply to all deployments of ClickHouse. - -| Page | Description | -|----------------------------------------------------------------------|--------------------------------------------------------------------------| -| [Choosing a Primary Key](/best-practices/choosing-a-primary-key) | Guidance on selecting an effective Primary Key in ClickHouse. | -| [Select Data Types](/best-practices/select-data-types) | Recommendations for choosing appropriate data types. | -| [Use Materialized Views](/best-practices/use-materialized-views) | When and how to benefit from materialized views. | -| [Minimize and Optimize JOINs](/best-practices/minimize-optimize-joins)| Best practices for minimizing and optimizing JOIN operations. 
| -| [Choosing a Partitioning Key](/best-practices/choosing-a-partitioning-key) | How to choose and apply partitioning keys effectively. | -| [Selecting an Insert Strategy](/best-practices/selecting-an-insert-strategy) | Strategies for efficient data insertion in ClickHouse. | -| [Data Skipping Indices](/best-practices/use-data-skipping-indices-where-appropriate) | When to apply data skipping indices for performance gains. | -| [Avoid Mutations](/best-practices/avoid-mutations) | Reasons to avoid mutations and how to design without them. | -| [Avoid `OPTIMIZE FINAL`](/best-practices/avoid-optimize-final) | Why `OPTIMIZE FINAL` can be costly and how to work around it. | -| [Use JSON where appropriate](/best-practices/use-json-where-appropriate) | Considerations for using JSON columns in ClickHouse. | diff --git a/docs/cloud/bestpractices/usagelimits.md b/docs/cloud/bestpractices/usagelimits.md deleted file mode 100644 index 37ab67b542c..00000000000 --- a/docs/cloud/bestpractices/usagelimits.md +++ /dev/null @@ -1,31 +0,0 @@ ---- -slug: /cloud/bestpractices/usage-limits -sidebar_label: 'Usage Limits' -title: 'Usage limits' -description: 'Describes the recommended usage limits in ClickHouse Cloud' ---- - -While ClickHouse is known for its speed and reliability, optimal performance is achieved within certain operating parameters. For example, having too many tables, databases or parts could negatively impact performance. To avoid this, Clickhouse Cloud has guardrails set up for several types of items. You can find details of these guardrails below. - -:::tip -If you've run up against one of these guardrails, it's possible that you are implementing your use case in an unoptimized way. Contact our support team and we will gladly help you refine your use case to avoid exceeding the guardrails or look together at how we can increase them in a controlled manner. -::: - -| Dimension | Limit | -|-----------|-------| -|**Databases**| 1000| -|**Tables**| 5000| -|**Columns**| ∼1000 (wide format is preferred to compact)| -|**Partitions**| 50k| -|**Parts**| 100k across the entire instance| -|**Part size**| 150gb| -|**Services per organization**| 20 (soft)| -|**Services per warehouse**| 5 (soft)| -|**Low cardinality**| 10k or less| -|**Primary keys in a table**| 4-5 that sufficiently filter down the data| -|**Query concurrency**| 1000| -|**Batch ingest**| anything > 1M will be split by the system in 1M row blocks| - -:::note -For Single Replica Services, the maximum number of databases is restricted to 100, and the maximum number of tables is restricted to 500. In addition, storage for Basic Tier Services is limited to 1 TB. -::: diff --git a/docs/cloud/manage/cloud-tiers.md b/docs/cloud/features/01_cloud_tiers.md similarity index 92% rename from docs/cloud/manage/cloud-tiers.md rename to docs/cloud/features/01_cloud_tiers.md index 244b453f8ab..c8ab9279619 100644 --- a/docs/cloud/manage/cloud-tiers.md +++ b/docs/cloud/features/01_cloud_tiers.md @@ -164,7 +164,7 @@ This page discusses which tiers are right for your specific use case. :::note Services in the basic tier are meant to be fixed in size and do not allow scaling, both automatic and manual. -Users can upgrade to the Scale or Enterprise tier to scale their services. +You can upgrade to the Scale or Enterprise tier to scale their services. ::: ## Scale {#scale} @@ -172,9 +172,9 @@ Users can upgrade to the Scale or Enterprise tier to scale their services. 
Designed for workloads requiring enhanced SLAs (2+ replica deployments), scalability, and advanced security. - Offers support for features such as: - - [Private networking support](../security/private-link-overview.md). + - [Private networking support](/cloud/security/private-link-overview). - [Compute-compute separation](../reference/warehouses#what-is-compute-compute-separation). - - [Flexible scaling](../manage/scaling.md) options (scale up/down, in/out). + - [Flexible scaling](/manage/scaling) options (scale up/down, in/out). ## Enterprise {#enterprise} @@ -186,8 +186,8 @@ Caters to large-scale, mission critical deployments that have stringent security - Supports enterprise-grade security: - Single Sign On (SSO) - Enhanced Encryption: For AWS and GCP services. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). -- Allows Scheduled upgrades: Users can select the day of the week/time window for upgrades, both database and cloud releases. -- Offers [HIPAA](../security/compliance-overview.md/#hipaa-since-2024) Compliance. +- Allows Scheduled upgrades: you can select the day of the week/time window for upgrades, both database and cloud releases. +- Offers [HIPAA](/cloud/security/compliance-overview#hipaa-since-2024) Compliance. - Exports Backups to the user's account. :::note diff --git a/docs/cloud/manage/scaling.md b/docs/cloud/features/02_automatic_scaling.md similarity index 100% rename from docs/cloud/manage/scaling.md rename to docs/cloud/features/02_automatic_scaling.md diff --git a/docs/cloud/get-started/sql-console.md b/docs/cloud/features/02_cloud_console_features/01_sql-console.md similarity index 100% rename from docs/cloud/get-started/sql-console.md rename to docs/cloud/features/02_cloud_console_features/01_sql-console.md diff --git a/docs/cloud/get-started/query-insights.md b/docs/cloud/features/02_cloud_console_features/02_query-insights.md similarity index 100% rename from docs/cloud/get-started/query-insights.md rename to docs/cloud/features/02_cloud_console_features/02_query-insights.md diff --git a/docs/cloud/get-started/query-endpoints.md b/docs/cloud/features/02_cloud_console_features/03_query-endpoints.md similarity index 100% rename from docs/cloud/get-started/query-endpoints.md rename to docs/cloud/features/02_cloud_console_features/03_query-endpoints.md diff --git a/docs/cloud/manage/dashboards.md b/docs/cloud/features/02_cloud_console_features/04_dashboards.md similarity index 100% rename from docs/cloud/manage/dashboards.md rename to docs/cloud/features/02_cloud_console_features/04_dashboards.md diff --git a/docs/cloud/features/02_cloud_console_features/_category_.json b/docs/cloud/features/02_cloud_console_features/_category_.json new file mode 100644 index 00000000000..85ba09bce82 --- /dev/null +++ b/docs/cloud/features/02_cloud_console_features/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Cloud console", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/features/03_infrastructure_and_deploy/_category_.json b/docs/cloud/features/03_infrastructure_and_deploy/_category_.json new file mode 100644 index 00000000000..3e6367dd545 --- /dev/null +++ b/docs/cloud/features/03_infrastructure_and_deploy/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Infrastructure and deploy", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/reference/byoc.md b/docs/cloud/features/03_infrastructure_and_deploy/byoc.md 
similarity index 99% rename from docs/cloud/reference/byoc.md rename to docs/cloud/features/03_infrastructure_and_deploy/byoc.md index 1b07d16e82e..d3a64f34b03 100644 --- a/docs/cloud/reference/byoc.md +++ b/docs/cloud/features/03_infrastructure_and_deploy/byoc.md @@ -128,7 +128,7 @@ Contact ClickHouse Support to enable Private Load Balancer. 2. Select Peering Connections. 3. Click Create Peering Connection 4. Set the VPC Requester to the ClickHouse VPC ID. -5. Set the VPC Acceptor to the target VPC ID. (Select another account if applicable) +5. Set the VPC Accepter to the target VPC ID. (Select another account if applicable) 6. Click Create Peering Connection.
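For teams that script this instead of clicking through the console, a rough AWS CLI equivalent of the peering steps above is sketched here. This is a minimal sketch, not part of the original guide: every VPC, account, and peering-connection ID below is a placeholder, so substitute the ClickHouse BYOC VPC ID and your own target VPC ID and verify the options against your AWS CLI version.

```bash
# Request a peering connection from the ClickHouse BYOC VPC (requester)
# to the target VPC (accepter). All IDs below are placeholders.
aws ec2 create-vpc-peering-connection \
    --vpc-id vpc-0aaaaaaaaaaaaaaaa \
    --peer-vpc-id vpc-0bbbbbbbbbbbbbbbb \
    --peer-owner-id 111122223333

# From the accepter account, accept the pending request.
aws ec2 accept-vpc-peering-connection \
    --vpc-peering-connection-id pcx-0ccccccccccccccccc
```

If both VPCs live in the same account, the `--peer-owner-id` flag can be omitted; either way, the console steps above remain the reference procedure.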
diff --git a/docs/cloud/reference/shared-catalog.md b/docs/cloud/features/03_infrastructure_and_deploy/shared-catalog.md similarity index 100% rename from docs/cloud/reference/shared-catalog.md rename to docs/cloud/features/03_infrastructure_and_deploy/shared-catalog.md diff --git a/docs/cloud/reference/shared-merge-tree.md b/docs/cloud/features/03_infrastructure_and_deploy/shared-merge-tree.md similarity index 100% rename from docs/cloud/reference/shared-merge-tree.md rename to docs/cloud/features/03_infrastructure_and_deploy/shared-merge-tree.md diff --git a/docs/cloud/reference/warehouses.md b/docs/cloud/features/03_infrastructure_and_deploy/warehouses.md similarity index 100% rename from docs/cloud/reference/warehouses.md rename to docs/cloud/features/03_infrastructure_and_deploy/warehouses.md diff --git a/docs/cloud/features/04_monitoring/_category_.json b/docs/cloud/features/04_monitoring/_category_.json new file mode 100644 index 00000000000..ef0bd973e2c --- /dev/null +++ b/docs/cloud/features/04_monitoring/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Monitoring", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/manage/monitoring/advanced_dashboard.md b/docs/cloud/features/04_monitoring/advanced_dashboard.md similarity index 99% rename from docs/cloud/manage/monitoring/advanced_dashboard.md rename to docs/cloud/features/04_monitoring/advanced_dashboard.md index ab320eb6ebe..578a412846e 100644 --- a/docs/cloud/manage/monitoring/advanced_dashboard.md +++ b/docs/cloud/features/04_monitoring/advanced_dashboard.md @@ -110,7 +110,7 @@ interface can help detect issues. | Network receive bytes/sec | Tracks the current speed of outbound network traffic | | Concurrent network connections | Tracks the number of current concurrent network connections | -## Identifying issues with the Advanced dashboard {#identifying-issues-with-the-advanced-dashboard} +## Identifying issues using the advanced dashboard {#identifying-issues-with-the-advanced-dashboard} Having this real-time view of the health of your ClickHouse service greatly helps mitigate issues before they impact your business or help solve them. 
Below are a diff --git a/docs/cloud/manage/monitoring/prometheus.md b/docs/cloud/features/04_monitoring/prometheus.md similarity index 100% rename from docs/cloud/manage/monitoring/prometheus.md rename to docs/cloud/features/04_monitoring/prometheus.md diff --git a/docs/cloud/security/shared-responsibility-model.md b/docs/cloud/features/05_security/01_shared-responsibility-model.md similarity index 98% rename from docs/cloud/security/shared-responsibility-model.md rename to docs/cloud/features/05_security/01_shared-responsibility-model.md index dbb828332eb..08c6a3e073c 100644 --- a/docs/cloud/security/shared-responsibility-model.md +++ b/docs/cloud/features/05_security/01_shared-responsibility-model.md @@ -1,7 +1,7 @@ --- -sidebar_label: 'Shared Responsibility Model' +sidebar_label: 'Shared responsibility model' slug: /cloud/security/shared-responsibility-model -title: 'Security Shared Responsibility Model' +title: 'Shared responsibility model' description: 'Learn more about the security model of ClickHouse Cloud' --- @@ -104,4 +104,4 @@ The model below generally addresses ClickHouse responsibilities and shows respon | HIPAA compliance | Available | AWS, GCP | Enterprise | | PCI compliance | Available | AWS | Enterprise | - For more information on supported compliance frameworks, please review our [Security and Compliance](/cloud/security/security-and-compliance) page. + For more information on supported compliance frameworks, please review our [Security and Compliance](/cloud/security/compliance-overview) page. diff --git a/docs/cloud/features/05_security/02_cloud-access-management/_category_.json b/docs/cloud/features/05_security/02_cloud-access-management/_category_.json new file mode 100644 index 00000000000..784ea5ce006 --- /dev/null +++ b/docs/cloud/features/05_security/02_cloud-access-management/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Cloud access management", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/cloud-access-management/cloud-access-management.md b/docs/cloud/features/05_security/02_cloud-access-management/cloud-access-management.md similarity index 76% rename from docs/cloud/security/cloud-access-management/cloud-access-management.md rename to docs/cloud/features/05_security/02_cloud-access-management/cloud-access-management.md index b0794fccf84..cfab1faad61 100644 --- a/docs/cloud/security/cloud-access-management/cloud-access-management.md +++ b/docs/cloud/features/05_security/02_cloud-access-management/cloud-access-management.md @@ -32,23 +32,31 @@ Users must be assigned an organization level role and may optionally be assigned | SQL console | Custom | Configure using SQL [`GRANT`](/sql-reference/statements/grant) statement; assign the role to a SQL console user by naming the role after the user | To create a custom role for a SQL console user and grant it a general role, run the following commands. The email address must match the user's email address in the console. + + + +#### Create `database_developer` and grant permissions {#create-database_developer-and-grant-permissions} + +Create the `database_developer` role and grant `SHOW`, `CREATE`, `ALTER`, and `DELETE` permissions. -1. Create the database_developer role and grant `SHOW`, `CREATE`, `ALTER`, and `DELETE` permissions. 
- - ```sql - CREATE ROLE OR REPLACE database_developer; - GRANT SHOW ON * TO database_developer; - GRANT CREATE ON * TO database_developer; - GRANT ALTER ON * TO database_developer; - GRANT DELETE ON * TO database_developer; - ``` - -2. Create a role for the SQL console user my.user@domain.com and assign it the database_developer role. +```sql +CREATE ROLE OR REPLACE database_developer; +GRANT SHOW ON * TO database_developer; +GRANT CREATE ON * TO database_developer; +GRANT ALTER ON * TO database_developer; +GRANT DELETE ON * TO database_developer; +``` + +#### Create SQL console user role {#create-sql-console-user-role} + +Create a role for the SQL console user my.user@domain.com and assign it the database_developer role. - ```sql - CREATE ROLE OR REPLACE `sql-console-role:my.user@domain.com`; - GRANT database_developer TO `sql-console-role:my.user@domain.com`; - ``` +```sql +CREATE ROLE OR REPLACE `sql-console-role:my.user@domain.com`; +GRANT database_developer TO `sql-console-role:my.user@domain.com`; +``` + + ### SQL console passwordless authentication {#sql-console-passwordless-authentication} SQL console users are created for each session and authenticated using X.509 certificates that are automatically rotated. The user is removed when the session is terminated. When generating access lists for audits, please navigate to the Settings tab for the service in the console and note the SQL console access in addition to the database users that exist in the database. If custom roles are configured, the user's access is listed in the role ending with the user's username. @@ -88,38 +96,46 @@ Users can use a SHA256 hash generator or code function such as `hashlib` in Pyth ### Database access listings with SQL console users {#database-access-listings-with-sql-console-users} The following process can be used to generate a complete access listing across the SQL console and databases in your organization. -1. Run the following queries to get a list of all grants in the database. - - ```sql - SELECT grants.user_name, - grants.role_name, - users.name AS role_member, - grants.access_type, - grants.database, - grants.table - FROM system.grants LEFT OUTER JOIN system.role_grants ON grants.role_name = role_grants.granted_role_name - LEFT OUTER JOIN system.users ON role_grants.user_name = users.name - - UNION ALL - - SELECT grants.user_name, - grants.role_name, - role_grants.role_name AS role_member, - grants.access_type, - grants.database, - grants.table - FROM system.role_grants LEFT OUTER JOIN system.grants ON role_grants.granted_role_name = grants.role_name - WHERE role_grants.user_name is null; - ``` - -2. Associate this list to Console users with access to SQL console. + + +#### Get a list of all database grants {#get-a-list-of-all-database-grants} + +Run the following queries to get a list of all grants in the database. 
+ +```sql +SELECT grants.user_name, +grants.role_name, +users.name AS role_member, +grants.access_type, +grants.database, +grants.table +FROM system.grants LEFT OUTER JOIN system.role_grants ON grants.role_name = role_grants.granted_role_name +LEFT OUTER JOIN system.users ON role_grants.user_name = users.name + +UNION ALL + +SELECT grants.user_name, +grants.role_name, +role_grants.role_name AS role_member, +grants.access_type, +grants.database, +grants.table +FROM system.role_grants LEFT OUTER JOIN system.grants ON role_grants.granted_role_name = grants.role_name +WHERE role_grants.user_name is null; +``` + +#### Associate grant list to Console users with access to SQL console {#associate-grant-list-to-console-users-with-access-to-sql-console} + +Associate this list with Console users that have access to SQL console. - a. Go to the Console. +a. Go to the Console. + +b. Select the relevant service. - b. Select the relevant service. +c. Select Settings on the left. - c. Select Settings on the left. +d. Scroll to the SQL console access section. - d. Scroll to the SQL console access section. +e. Click the link for the number of users with access to the database `There are # users with access to this service.` to see the user listing. - e. Click the link for the number of users with access to the database `There are # users with access to this service.` to see the user listing. + \ No newline at end of file diff --git a/docs/cloud/security/cloud-access-management/cloud-authentication.md b/docs/cloud/features/05_security/02_cloud-access-management/cloud-authentication.md similarity index 100% rename from docs/cloud/security/cloud-access-management/cloud-authentication.md rename to docs/cloud/features/05_security/02_cloud-access-management/cloud-authentication.md diff --git a/docs/cloud/security/cloud-access-management/index.md b/docs/cloud/features/05_security/02_cloud-access-management/index.md similarity index 100% rename from docs/cloud/security/cloud-access-management/index.md rename to docs/cloud/features/05_security/02_cloud-access-management/index.md diff --git a/docs/cloud/security/inviting-new-users.md b/docs/cloud/features/05_security/02_cloud-access-management/inviting-new-users.md similarity index 100% rename from docs/cloud/security/inviting-new-users.md rename to docs/cloud/features/05_security/02_cloud-access-management/inviting-new-users.md diff --git a/docs/cloud/features/05_security/03_connectivity/_category_.json b/docs/cloud/features/05_security/03_connectivity/_category_.json new file mode 100644 index 00000000000..6e137e0592d --- /dev/null +++ b/docs/cloud/features/05_security/03_connectivity/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Connectivity", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/cloud-endpoints-api.md b/docs/cloud/features/05_security/03_connectivity/cloud-endpoints-api.md similarity index 100% rename from docs/cloud/security/cloud-endpoints-api.md rename to docs/cloud/features/05_security/03_connectivity/cloud-endpoints-api.md diff --git a/docs/cloud/security/connectivity-overview.md b/docs/cloud/features/05_security/03_connectivity/connectivity-overview.md similarity index 100% rename from docs/cloud/security/connectivity-overview.md rename to docs/cloud/features/05_security/03_connectivity/connectivity-overview.md diff --git a/docs/cloud/security/aws-privatelink.md b/docs/cloud/features/05_security/03_connectivity/private_networking/aws-privatelink.md similarity index 96% rename from 
docs/cloud/security/aws-privatelink.md rename to docs/cloud/features/05_security/03_connectivity/private_networking/aws-privatelink.md index 4a7c9644eb7..720a3a88852 100644 --- a/docs/cloud/security/aws-privatelink.md +++ b/docs/cloud/features/05_security/03_connectivity/private_networking/aws-privatelink.md @@ -375,7 +375,7 @@ jq .result.privateEndpointIds ### Connecting to a remote database {#connecting-to-a-remote-database} -Let's say you are trying to use [MySQL](../../sql-reference/table-functions/mysql.md) or [PostgreSQL](../../sql-reference/table-functions/postgresql.md) table functions in ClickHouse Cloud and connect to your database hosted in an Amazon Web Services (AWS) VPC. AWS PrivateLink cannot be used to enable this connection securely. PrivateLink is a one-way, unidirectional connection. It allows your internal network or Amazon VPC to connect securely to ClickHouse Cloud, but it does not allow ClickHouse Cloud to connect to your internal network. +Let's say you are trying to use [MySQL](/sql-reference/table-functions/mysql) or [PostgreSQL](/sql-reference/table-functions/postgresql) table functions in ClickHouse Cloud and connect to your database hosted in an Amazon Web Services (AWS) VPC. AWS PrivateLink cannot be used to enable this connection securely. PrivateLink is a one-way, unidirectional connection. It allows your internal network or Amazon VPC to connect securely to ClickHouse Cloud, but it does not allow ClickHouse Cloud to connect to your internal network. According to the [AWS PrivateLink documentation](https://docs.aws.amazon.com/whitepapers/latest/building-scalable-secure-multi-vpc-network-infrastructure/aws-privatelink.html): diff --git a/docs/cloud/security/azure-privatelink.md b/docs/cloud/features/05_security/03_connectivity/private_networking/azure-privatelink.md similarity index 100% rename from docs/cloud/security/azure-privatelink.md rename to docs/cloud/features/05_security/03_connectivity/private_networking/azure-privatelink.md diff --git a/docs/cloud/security/gcp-private-service-connect.md b/docs/cloud/features/05_security/03_connectivity/private_networking/gcp-private-service-connect.md similarity index 97% rename from docs/cloud/security/gcp-private-service-connect.md rename to docs/cloud/features/05_security/03_connectivity/private_networking/gcp-private-service-connect.md index 8fcd99e06fd..cc573e09b6c 100644 --- a/docs/cloud/security/gcp-private-service-connect.md +++ b/docs/cloud/features/05_security/03_connectivity/private_networking/gcp-private-service-connect.md @@ -421,7 +421,7 @@ curl --silent --user "${KEY_ID:?}:${KEY_SECRET:?}" -X GET -H "Content-Type: appl ### Connecting to a remote database {#connecting-to-a-remote-database} -Let's say you are trying to use the [MySQL](../../sql-reference/table-functions/mysql.md) or [PostgreSQL](../../sql-reference/table-functions/postgresql.md) table functions in ClickHouse Cloud and connect to your database hosted in GCP. GCP PSC cannot be used to enable this connection securely. PSC is a one-way, unidirectional connection. It allows your internal network or GCP VPC to connect securely to ClickHouse Cloud, but it does not allow ClickHouse Cloud to connect to your internal network. +Let's say you are trying to use the [MySQL](/sql-reference/table-functions/mysql) or [PostgreSQL](/sql-reference/table-functions/postgresql) table functions in ClickHouse Cloud and connect to your database hosted in GCP. GCP PSC cannot be used to enable this connection securely. 
PSC is a one-way, unidirectional connection. It allows your internal network or GCP VPC to connect securely to ClickHouse Cloud, but it does not allow ClickHouse Cloud to connect to your internal network. According to the [GCP Private Service Connect documentation](https://cloud.google.com/vpc/docs/private-service-connect): diff --git a/docs/cloud/security/private-link-overview.md b/docs/cloud/features/05_security/03_connectivity/private_networking/private-link-overview.md similarity index 62% rename from docs/cloud/security/private-link-overview.md rename to docs/cloud/features/05_security/03_connectivity/private_networking/private-link-overview.md index 183362a8e58..8d6be0c413e 100644 --- a/docs/cloud/security/private-link-overview.md +++ b/docs/cloud/features/05_security/03_connectivity/private_networking/private-link-overview.md @@ -9,6 +9,6 @@ description: 'Landing page for private link' ClickHouse Cloud provides the ability to connect your services to your cloud virtual network. Refer to the guides below for your provider: -- [AWS private Link](/cloud/security/aws-privatelink.md) -- [GCP private service connect](/cloud/security/gcp-private-service-connect.md) -- [Azure private link](/cloud/security/azure-privatelink.md) +- [AWS private Link](/manage/security/aws-privatelink) +- [GCP private service connect](/manage/security/gcp-private-service-connect) +- [Azure private link](/cloud/security/azure-privatelink) diff --git a/docs/cloud/security/setting-ip-filters.md b/docs/cloud/features/05_security/03_connectivity/setting-ip-filters.md similarity index 100% rename from docs/cloud/security/setting-ip-filters.md rename to docs/cloud/features/05_security/03_connectivity/setting-ip-filters.md diff --git a/docs/cloud/features/05_security/_category_.json b/docs/cloud/features/05_security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/features/05_security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/cmek.md b/docs/cloud/features/05_security/cmek.md similarity index 100% rename from docs/cloud/security/cmek.md rename to docs/cloud/features/05_security/cmek.md diff --git a/docs/cloud/support.md b/docs/cloud/features/06_support.md similarity index 88% rename from docs/cloud/support.md rename to docs/cloud/features/06_support.md index 836382cd3c5..e6b73fc87a0 100644 --- a/docs/cloud/support.md +++ b/docs/cloud/features/06_support.md @@ -1,6 +1,6 @@ --- sidebar_label: 'Cloud Support' -title: 'Cloud Support' +title: 'Support' slug: /cloud/support description: 'Learn about Cloud Support' hide_title: true diff --git a/docs/cloud/manage/notifications.md b/docs/cloud/features/07_notifications.md similarity index 100% rename from docs/cloud/manage/notifications.md rename to docs/cloud/features/07_notifications.md diff --git a/docs/cloud/features/_category_.json b/docs/cloud/features/_category_.json new file mode 100644 index 00000000000..383c8150644 --- /dev/null +++ b/docs/cloud/features/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Features", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/manage/backups/configurable-backups.md b/docs/cloud/features/backups/configurable-backups.md similarity index 100% rename from docs/cloud/manage/backups/configurable-backups.md rename to docs/cloud/features/backups/configurable-backups.md diff --git 
a/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md b/docs/cloud/features/backups/export-backups-to-own-cloud-account.md similarity index 97% rename from docs/cloud/manage/backups/export-backups-to-own-cloud-account.md rename to docs/cloud/features/backups/export-backups-to-own-cloud-account.md index 4cd5ea78b62..0bb2be7cda7 100644 --- a/docs/cloud/manage/backups/export-backups-to-own-cloud-account.md +++ b/docs/cloud/features/backups/export-backups-to-own-cloud-account.md @@ -15,7 +15,7 @@ For details of how ClickHouse Cloud backups work, including "full" vs. "incremen Here we show examples of how to take full and incremental backups to AWS, GCP, Azure object storage as well as how to restore from the backups. :::note -Users should be aware that any usage where backups are being exported to a different region in the same cloud provider, will incur [data transfer](../network-data-transfer.mdx) charges. Currently we do not support cross cloud backups. +Users should be aware that any usage where backups are being exported to a different region in the same cloud provider, will incur [data transfer](/cloud/manage/network-data-transfer) charges. Currently we do not support cross cloud backups. ::: ## Requirements {#requirements} diff --git a/docs/cloud/manage/backups/index.md b/docs/cloud/features/backups/index.md similarity index 100% rename from docs/cloud/manage/backups/index.md rename to docs/cloud/features/backups/index.md diff --git a/docs/cloud/manage/backups/overview.md b/docs/cloud/features/backups/overview.md similarity index 100% rename from docs/cloud/manage/backups/overview.md rename to docs/cloud/features/backups/overview.md diff --git a/docs/cloud/manage/hyperdx.md b/docs/cloud/features/hyperdx.md similarity index 98% rename from docs/cloud/manage/hyperdx.md rename to docs/cloud/features/hyperdx.md index 7e56e90d279..71e5cee6102 100644 --- a/docs/cloud/manage/hyperdx.md +++ b/docs/cloud/features/hyperdx.md @@ -15,7 +15,7 @@ HyperDX is the user interface for [**ClickStack**](/use-cases/observability/clic HyperDX is a purpose-built frontend for exploring and visualizing observability data, supporting both Lucene-style and SQL queries, interactive dashboards, alerting, trace exploration, and more—all optimized for ClickHouse as the backend. -HyperDX in ClickHouse Cloud allows users to enjoy a more turnkey ClickStack experience - no infrastructure to manage, no separate authentication to configure. +HyperDX in ClickHouse Cloud allows users to enjoy a more turnkey ClickStack experience - no infrastructure to manage, no separate authentication to configure. HyperDX can be launched with a single click and connected to your data - fully integrated into the ClickHouse Cloud authentication system for seamless, secure access to your observability insights. 
## Deployment {#main-concepts} diff --git a/docs/cloud/manage/integrations.md b/docs/cloud/features/integrations.md similarity index 100% rename from docs/cloud/manage/integrations.md rename to docs/cloud/features/integrations.md diff --git a/docs/cloud/manage/replica-aware-routing.md b/docs/cloud/features/replica-aware-routing.md similarity index 95% rename from docs/cloud/manage/replica-aware-routing.md rename to docs/cloud/features/replica-aware-routing.md index 8b8376b8667..370e46737cd 100644 --- a/docs/cloud/manage/replica-aware-routing.md +++ b/docs/cloud/features/replica-aware-routing.md @@ -5,7 +5,11 @@ description: 'How to use Replica-aware routing to increase cache re-use' keywords: ['cloud', 'sticky endpoints', 'sticky', 'endpoints', 'sticky routing', 'routing', 'replica aware routing'] --- -# Replica-aware routing (private preview) +import PrivatePreviewBadge from '@theme/badges/PrivatePreviewBadge'; + +# Replica-aware routing + + Replica-aware routing (also known as sticky sessions, sticky routing, or session affinity) utilizes [Envoy proxy's ring hash load balancing](https://www.envoyproxy.io/docs/envoy/latest/intro/arch_overview/upstream/load_balancing/load_balancers#ring-hash). The main purpose of replica-aware routing is to increase the chance of cache reuse. It does not guarantee isolation. diff --git a/docs/cloud/manage/upgrades.md b/docs/cloud/features/upgrades.md similarity index 99% rename from docs/cloud/manage/upgrades.md rename to docs/cloud/features/upgrades.md index f52cb9d8bbd..ad28955ffe6 100644 --- a/docs/cloud/manage/upgrades.md +++ b/docs/cloud/features/upgrades.md @@ -15,7 +15,7 @@ import scheduled_upgrade_window from '@site/static/images/cloud/manage/scheduled # Upgrades -With ClickHouse Cloud you never have to worry about patching and upgrades. We roll out upgrades that include fixes, new features and performance improvements on a periodic basis. For the full list of what is new in ClickHouse refer to our [Cloud changelog](/cloud/reference/changelog.md). +With ClickHouse Cloud you never have to worry about patching and upgrades. We roll out upgrades that include fixes, new features and performance improvements on a periodic basis. For the full list of what is new in ClickHouse refer to our [Cloud changelog](/whats-new/cloud). :::note We are introducing a new upgrade mechanism, a concept we call "make before break" (or MBB). With this new approach, we add updated replica(s) before removing the old one(s) during the upgrade operation. This results in more seamless upgrades that are less disruptive to running workloads. diff --git a/docs/cloud/get-started/index.md b/docs/cloud/get-started/index.md deleted file mode 100644 index 3c30f63f149..00000000000 --- a/docs/cloud/get-started/index.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -slug: /cloud/get-started -title: 'Get Started' -description: 'Get Started Table Of Contents' -keywords: ['Cloud Quick Start', 'SQL Console', 'Query Insights', 'Query API Endpoints', 'Dashboards', 'Cloud Support'] ---- - -Welcome to ClickHouse Cloud! Explore the pages below to learn more about what ClickHouse Cloud has to offer. - -| Page | Description | -|--------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Overview](/cloud/overview) | Overview of the benefits of using ClickHouse Cloud and what version of ClickHouse is used for it. 
| -| [SQL Console](/cloud/get-started/sql-console) | Learn about the interactive SQL console available in Cloud | -| [Query Insights](/cloud/get-started/query-insights) | Learn about how Cloud's Query Insights feature makes ClickHouse's built-in query log easier to use through various visualizations and tables. | -| [Query Endpoints](/cloud/get-started/query-endpoints) | Learn about the Query API Endpoints feature which allows you to create an API endpoint directly from any saved SQL query in the ClickHouse Cloud console. | -| [Dashboards](/cloud/manage/dashboards) | Learn about how SQL Console's dashboards feature allows you to collect and share visualizations from saved queries. | -| [Cloud Support](/cloud/support) | Learn more about Support Services for ClickHouse Cloud users and customers. | diff --git a/docs/cloud/bestpractices/_category_.yml b/docs/cloud/guides/_category_.yml similarity index 83% rename from docs/cloud/bestpractices/_category_.yml rename to docs/cloud/guides/_category_.yml index 1648e8a79cb..747e5fb1796 100644 --- a/docs/cloud/bestpractices/_category_.yml +++ b/docs/cloud/guides/_category_.yml @@ -1,4 +1,4 @@ -label: 'Best Practices' +label: 'Guides' collapsible: true collapsed: true link: diff --git a/docs/cloud/guides/best_practices/_category_.json b/docs/cloud/guides/best_practices/_category_.json new file mode 100644 index 00000000000..21f95c55bca --- /dev/null +++ b/docs/cloud/guides/best_practices/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Best practices", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/guides/best_practices/index.md b/docs/cloud/guides/best_practices/index.md new file mode 100644 index 00000000000..4719ea2750a --- /dev/null +++ b/docs/cloud/guides/best_practices/index.md @@ -0,0 +1,22 @@ +--- +slug: /cloud/bestpractices +keywords: ['Cloud', 'Best Practices', 'Bulk Inserts', 'Asynchronous Inserts', 'Avoid Mutations', 'Avoid Nullable Columns', 'Avoid Optimize Final', 'Low Cardinality Partitioning Key', 'Multi Tenancy', 'Usage Limits'] +title: 'Overview' +hide_title: true +description: 'Landing page for Best Practices section in ClickHouse Cloud' +--- + +import TableOfContents from '@site/docs/best-practices/_snippets/_table_of_contents.md'; + +# Best Practices in ClickHouse Cloud {#best-practices-in-clickhouse-cloud} + +This section provides best practices you will want to follow to get the most out of ClickHouse Cloud. + +| Page | Description | +|----------------------------------------------------------|----------------------------------------------------------------------------| +| [Usage Limits](/cloud/bestpractices/usage-limits)| Explore the limits of ClickHouse. | +| [Multi tenancy](/cloud/bestpractices/multi-tenancy)| Learn about different strategies to implement multi-tenancy. | + +These are in addition to the standard best practices which apply to all deployments of ClickHouse. 
+ + \ No newline at end of file diff --git a/docs/cloud/bestpractices/multitenancy.md b/docs/cloud/guides/best_practices/multitenancy.md similarity index 99% rename from docs/cloud/bestpractices/multitenancy.md rename to docs/cloud/guides/best_practices/multitenancy.md index 5289a09b067..5f7df65427a 100644 --- a/docs/cloud/bestpractices/multitenancy.md +++ b/docs/cloud/guides/best_practices/multitenancy.md @@ -1,6 +1,6 @@ --- slug: /cloud/bestpractices/multi-tenancy -sidebar_label: 'Implement multi tenancy' +sidebar_label: 'Multi tenancy' title: 'Multi tenancy' description: 'Best practices to implement multi tenancy' --- diff --git a/docs/cloud/guides/best_practices/usagelimits.md b/docs/cloud/guides/best_practices/usagelimits.md new file mode 100644 index 00000000000..af49f5956be --- /dev/null +++ b/docs/cloud/guides/best_practices/usagelimits.md @@ -0,0 +1,40 @@ +--- +slug: /cloud/bestpractices/usage-limits +sidebar_label: 'Service limits' +title: 'Usage limits' +description: 'Describes the recommended usage limits in ClickHouse Cloud' +--- + +While ClickHouse is known for its speed and reliability, optimal performance is +achieved within certain operating parameters. For example, having too many tables, +databases or parts could negatively impact performance. To avoid this, Clickhouse +Cloud has guardrails set up for several types of items. You can find details of +these guardrails below. + +:::tip +If you've run up against one of these guardrails, it's possible that you are +implementing your use case in an unoptimized way. Contact our support team and +we will gladly help you refine your use case to avoid exceeding the guardrails +or look together at how we can increase them in a controlled manner. +::: + +| Dimension | Limit | +|-------------------------------|------------------------------------------------------------| +| **Databases** | 1000 | +| **Tables** | 5000 | +| **Columns** | ∼1000 (wide format is preferred to compact) | +| **Partitions** | 50k | +| **Parts** | 100k across the entire instance | +| **Part size** | 150gb | +| **Services per organization** | 20 (soft) | +| **Services per warehouse** | 5 (soft) | +| **Low cardinality** | 10k or less | +| **Primary keys in a table** | 4-5 that sufficiently filter down the data | +| **Query concurrency** | 1000 | +| **Batch ingest** | anything > 1M will be split by the system in 1M row blocks | + +:::note +For Single Replica Services, the maximum number of databases is restricted to +100, and the maximum number of tables is restricted to 500. In addition, storage +for Basic Tier Services is limited to 1 TB. +::: diff --git a/docs/cloud/reference/cloud-compatibility.md b/docs/cloud/guides/cloud-compatibility.md similarity index 99% rename from docs/cloud/reference/cloud-compatibility.md rename to docs/cloud/guides/cloud-compatibility.md index 86dafbfefd5..59c238c9c08 100644 --- a/docs/cloud/reference/cloud-compatibility.md +++ b/docs/cloud/guides/cloud-compatibility.md @@ -1,6 +1,6 @@ --- slug: /whats-new/cloud-compatibility -sidebar_label: 'Cloud Compatibility' +sidebar_label: 'Cloud compatibility' title: 'Cloud Compatibility' description: 'This guide provides an overview of what to expect functionally and operationally in ClickHouse Cloud.' 
--- diff --git a/docs/cloud/guides/index.md b/docs/cloud/guides/index.md new file mode 100644 index 00000000000..2355ca4370c --- /dev/null +++ b/docs/cloud/guides/index.md @@ -0,0 +1,6 @@ +--- +slug: /cloud/guides +title: 'Guides' +hide_title: true +description: 'Table of contents page for the ClickHouse Cloud guides section' +--- \ No newline at end of file diff --git a/docs/cloud/guides/security/_category_.json b/docs/cloud/guides/security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/guides/security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/guides/security/cloud_access_management/_category_.json b/docs/cloud/guides/security/cloud_access_management/_category_.json new file mode 100644 index 00000000000..abfdcebed27 --- /dev/null +++ b/docs/cloud/guides/security/cloud_access_management/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Cloud Access Management", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/common-access-management-queries.md b/docs/cloud/guides/security/cloud_access_management/common-access-management-queries.md similarity index 100% rename from docs/cloud/security/common-access-management-queries.md rename to docs/cloud/guides/security/cloud_access_management/common-access-management-queries.md diff --git a/docs/cloud/security/saml-sso-setup.md b/docs/cloud/guides/security/cloud_access_management/saml-sso-setup.md similarity index 100% rename from docs/cloud/security/saml-sso-setup.md rename to docs/cloud/guides/security/cloud_access_management/saml-sso-setup.md diff --git a/docs/cloud/guides/security/connectivity/_category_.json b/docs/cloud/guides/security/connectivity/_category_.json new file mode 100644 index 00000000000..6e137e0592d --- /dev/null +++ b/docs/cloud/guides/security/connectivity/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Connectivity", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/accessing-s3-data-securely.md b/docs/cloud/guides/security/connectivity/accessing-s3-data-securely.md similarity index 100% rename from docs/cloud/security/accessing-s3-data-securely.md rename to docs/cloud/guides/security/connectivity/accessing-s3-data-securely.md diff --git a/docs/cloud/manage/_category_.yml b/docs/cloud/manage/_category_.yml deleted file mode 100644 index 59089856c86..00000000000 --- a/docs/cloud/manage/_category_.yml +++ /dev/null @@ -1,6 +0,0 @@ -label: 'Manage Cloud' -collapsible: true -collapsed: true -link: - type: generated-index - title: Manage ClickHouse Cloud diff --git a/docs/cloud/manage/index.md b/docs/cloud/manage/index.md deleted file mode 100644 index 46c407d0c6b..00000000000 --- a/docs/cloud/manage/index.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -slug: /cloud/manage -keywords: ['AWS', 'Cloud', 'serverless', 'management'] -title: 'Overview' -hide_title: true -description: 'Overview page for Managing Cloud' ---- - -# Managing Cloud - -In this section of the docs you will find all the information you may need about managing ClickHouse cloud. 
This section contains the following pages: - -| Page | Description | -|-----------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| -| [ClickHouse Cloud Tiers](/cloud/manage/cloud-tiers) | Describes the different cloud tiers, their features, and considerations for choosing the right one. | -| [Integrations](/manage/integrations) | Covers ClickHouse Cloud's built-in integrations, custom integrations, and integrations that are not supported. | -| [Backups](/cloud/manage/backups) | Describes how backups work in ClickHouse Cloud, what options you have to configure backups for your service, and how to restore from a backup. | -| [Monitoring](/integrations/prometheus) | How to integrate Prometheus as a way to monitor ClickHouse cloud. | -| [Billing](/cloud/manage/billing/overview) | Explains the pricing model for ClickHouse Cloud, including the factors that affect the cost of your service. | -| [Configuring Settings](/manage/settings) | Describes how to configure settings for ClickHouse Cloud. | -| [Replica-aware Routing](/manage/replica-aware-routing) | Explains what Replica-aware Routing in ClickHouse Cloud is, its limitations, and how to configure it. | -| [Automatic Scaling](/manage/scaling) | Explains how ClickHouse Cloud services can be scaled manually or automatically based on your resource needs. | -| [Service Uptime and SLA](/cloud/manage/service-uptime) | Information about service uptime and Service Level Agreements offered for production instances. | -| [Notifications](/cloud/notifications) | Shows how ClickHouse Cloud notifications are received and how they can be customized. | -| [Upgrades](/manage/updates) | Information on how upgrades are rolled out in ClickHouse Cloud. | -| [Delete Account](/cloud/manage/close_account) | Information on how to close or delete your account when necessary. | -| [Programmatic API Access with Postman](/cloud/manage/postman) | A guide to help you test the ClickHouse API using Postman. | -| [Troubleshooting](/faq/troubleshooting) | A collection of commonly encountered issues and how to troubleshoot them. | -| [Data Transfer](./network-data-transfer.mdx) | Learn more about how ClickHouse Cloud meters data transferred ingress and egress. | -| [Jan 2025 Changes FAQ](./jan2025_faq/index.md) | Learn more about changes to Cloud introduced in Jan 2025. | diff --git a/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md b/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md deleted file mode 100644 index 436a1cb705b..00000000000 --- a/docs/cloud/manage/jan2025_faq/_snippets/_clickpipes_faq.md +++ /dev/null @@ -1,145 +0,0 @@ -import Image from '@theme/IdealImage'; -import clickpipesPricingFaq1 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_1.png'; -import clickpipesPricingFaq2 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_2.png'; -import clickpipesPricingFaq3 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_3.png'; - -
- -Why are we introducing a pricing model for ClickPipes now? - -We decided to initially launch ClickPipes for free with the idea to gather -feedback, refine features, and ensure it meets user needs. -As the GA platform has grown, it has effectively stood the test of time by -moving trillions of rows. Introducing a pricing model allows us to continue -improving the service, maintaining the infrastructure, and providing dedicated -support and new connectors. - -
- -
- -What are ClickPipes replicas? - -ClickPipes ingests data from remote data sources via a dedicated infrastructure -that runs and scales independently of the ClickHouse Cloud service. -For this reason, it uses dedicated compute replicas. -The diagrams below show a simplified architecture. - -For streaming ClickPipes, ClickPipes replicas access the remote data sources (e.g., a Kafka broker), -pull the data, process and ingest it into the destination ClickHouse service. - -ClickPipes Replicas - Streaming ClickPipes - -In the case of object storage ClickPipes, -the ClickPipes replica orchestrates the data loading task -(identifying files to copy, maintaining the state, and moving partitions), -while the data is pulled directly from the ClickHouse service. - -ClickPipes Replicas - Object Storage ClickPipes - -
- -
- -What's the default number of replicas and their size? - -Each ClickPipe defaults to 1 replica that's provided with 2 GiB of RAM and 0.5 vCPU. -This corresponds to **0.25** ClickHouse compute units (1 unit = 8 GiB RAM, 2 vCPUs). - -
- -
- -Can ClickPipes replicas be scaled? - -Yes, ClickPipes for streaming can be scaled both horizontally and vertically. -Horizontal scaling adds more replicas to increase throughput, while vertical scaling increases the resources (CPU and RAM) allocated to each replica to handle more intensive workloads. -This can be configured during ClickPipe creation, or at any other point under **Settings** -> **Advanced Settings** -> **Scaling**. - -
- -
- -How many ClickPipes replicas do I need? - -It depends on the workload throughput and latency requirements. -We recommend starting with the default value of 1 replica, measuring your latency, and adding replicas if needed. -Keep in mind that for Kafka ClickPipes, you also have to scale the Kafka broker partitions accordingly. -The scaling controls are available under "settings" for each streaming ClickPipe. - -ClickPipes Replicas - How many ClickPipes replicas do I need? - -
- -
- -What does the ClickPipes pricing structure look like? - -It consists of two dimensions: -- **Compute**: Price per unit per hour - Compute represents the cost of running the ClickPipes replica pods whether they actively ingest data or not. - It applies to all ClickPipes types. -- **Ingested data**: per GB pricing - The ingested data rate applies to all streaming ClickPipes - (Kafka, Confluent, Amazon MSK, Amazon Kinesis, Redpanda, WarpStream, - Azure Event Hubs) for the data transferred via the replica pods. - The ingested data size (GB) is charged based on bytes received from the source (uncompressed or compressed). - -
- -
- -What are the ClickPipes public prices? - -- Compute: \$0.20 per unit per hour ($0.05 per replica per hour) -- Ingested data: $0.04 per GB - -
- -
- -How does it look in an illustrative example? - -For example, ingesting 1 TB of data over 24 hours using the Kafka connector using a single replica (0.25 compute unit) costs: - -$$ -(0.25 \times 0.20 \times 24) + (0.04 \times 1000) = \$41.2 -$$ -
- -For object storage connectors (S3 and GCS), -only the ClickPipes compute cost is incurred since the ClickPipes pod is not processing data -but only orchestrating the transfer, which is operated by the underlying ClickHouse service: - -$$ -0.25 \times 0.20 \times 24 = \$1.2 -$$ -
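Generalizing the two examples, the charge for a single ClickPipe over a period can be sketched as follows; this assumes the default 0.25-unit replica size and the public prices listed above, and the ingested-data term applies to streaming ClickPipes only:

$$
\text{cost} \approx (N_{\text{replicas}} \times 0.25 \times 0.20 \times \text{hours}) + (0.04 \times \text{GB ingested})
$$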
- -
- -When does the new pricing model take effect? - -The new pricing model takes effect for all organizations created after January 27th, 2025. - -
- -
- -What happens to current users? - -Existing users will have a **60-day grace period** where the ClickPipes service continues to be offered for free. -Billing will automatically start for ClickPipes for existing users on **March 24th, 2025.** - -
- -
- -How does ClickPipes pricing compare to the market? - -The philosophy behind ClickPipes pricing is -to cover the operating costs of the platform while offering an easy and reliable way to move data to ClickHouse Cloud. -From that angle, our market analysis revealed that we are positioned competitively. - -
diff --git a/docs/cloud/manage/jan2025_faq/backup.md b/docs/cloud/manage/jan2025_faq/backup.md deleted file mode 100644 index 579788f8dec..00000000000 --- a/docs/cloud/manage/jan2025_faq/backup.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: 'Backup Policy' -slug: /cloud/manage/jan-2025-faq/backup -keywords: ['new tiers', 'plans', 'pricing', 'backups'] -description: 'Backup policy in new tiers' ---- - -## What is the backup policy? {#what-is-the-backup-policy} -In Basic, Scale, and Enterprise tiers backups are metered and billed separately from storage. -All services will default to one daily backup with the ability to configure more, starting with the Scale tier, via the Settings tab of the Cloud console. Each backup will be retained for at least 24 hours. - -## What happens to current configurations that users have set up separate from default backups? {#what-happens-to-current-configurations-that-users-have-set-up-separate-from-default-backups} - -Customer specific backup configurations will carry over. Users can change these as they see fit in the new tiers. - -## Are backups charged differently across tiers? {#are-backups-charged-differently-across-tiers} - -The cost of backups is the same across all tiers. diff --git a/docs/cloud/manage/jan2025_faq/billing.md b/docs/cloud/manage/jan2025_faq/billing.md deleted file mode 100644 index 4147cc9976d..00000000000 --- a/docs/cloud/manage/jan2025_faq/billing.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: 'Billing' -slug: /cloud/manage/jan-2025-faq/billing -keywords: ['new pricing', 'billing'] -description: 'Billing details for new pricing tiers' ---- - -## Billing {#billing} - -### Are there any changes to how usage is metered and charged? {#are-there-any-changes-to-how-usage-is-metered-and-charged} - -The per-dimension unit cost for compute and storage has changed, and there are two additional dimensions to account for data transfer and ClickPipes usage. - -Some notable changes: - -- Storage price per TB will be reduced, and storage cost will no longer include backups (we will charge for them separately and will make only one backup required). Storage costs are the same across tiers and vary by region and cloud service provider. -- Compute costs will vary by tier, region, and cloud service provider. -- The new pricing dimension for data transfer is applicable for data egress across regions and on the public internet only. -- New pricing dimension for ClickPipes usage. - -### What happens to users with existing committed spend contracts? {#what-happens-to-users-with-existing-committed-spend-contracts} - -Users with active committed spend contracts will not be affected by the new per-dimension unit cost prices for compute and storage until their contract expires. However, the new pricing dimensions for data transfer and ClickPipes will be applicable starting March 24, 2025. Most customers will not see a significant increase in their monthly bill from these new dimensions. - -### Can users on a committed spend agreement with ClickHouse continue to launch services on the old plan? {#can-users-on-a-committed-spend-agreement-with-clickhouse-continue-to-launch-services-on-the-old-plan} - -Yes, users will be able to launch Development and Production services until the end date of their contract, and renewals will reflect the new pricing plan. - -If you need to modify your contract or have questions about how these changes might affect you in the future, please contact our support team or your sales representative. 
- -### What happens if users exhaust their credits before the end of the contract and go to PAYG? {#what-happens-if-users-exhaust-their-credits-before-the-end-of-the-contract-and-go-to-payg} - -If committed spend contracts exhaust credits before their renewal date, we bill them at the current rates until renewal (as per current policy). - -### What happens to users on the monthly PAYG? {#what-happens-to-users-on-the-monthly-payg} - -Users on a monthly PAYG plan will continue to be billed using the old pricing plan for the Development and Production services. They have until July 23, 2025, to migrate to the new plan self-serve, or they will all be migrated to the Scale configuration on this day and billed based on the new plan. - -### Where can I reference legacy plans? {#where-can-i-reference-legacy-plans} - -Legacy plans are available for reference [here](https://clickhouse.com/pricing?legacy=true). - -## Marketplaces {#marketplaces} - -### Are there changes to how users are charged via the CSP marketplaces? {#are-there-changes-to-how-users-are-charged-via-the-csp-marketplaces} - -Users who sign up to ClickHouse Cloud via a CSP Marketplace incur usage in terms of CHCs (ClickHouse Cloud Credits). This behavior has not changed. However, the underlying composition of the credit usage will align with the pricing and packaging changes outlined here and include charges for any data transfer usage and ClickPipes once those are live. diff --git a/docs/cloud/manage/jan2025_faq/dimensions.md b/docs/cloud/manage/jan2025_faq/dimensions.md deleted file mode 100644 index c4dd9268593..00000000000 --- a/docs/cloud/manage/jan2025_faq/dimensions.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: 'New Pricing Dimensions' -slug: /cloud/manage/jan-2025-faq/pricing-dimensions -keywords: ['new pricing', 'dimensions'] -description: 'Pricing dimensions for data transfer and ClickPipes' ---- - -import Image from '@theme/IdealImage'; -import clickpipesPricingFaq1 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_1.png'; -import clickpipesPricingFaq2 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_2.png'; -import clickpipesPricingFaq3 from '@site/static/images/cloud/manage/jan2025_faq/external_clickpipes_pricing_faq_3.png'; -import NetworkPricing from '@site/docs/cloud/manage/_snippets/_network_transfer_rates.md'; -import ClickPipesFAQ from './_snippets/_clickpipes_faq.md' - -The following dimensions have been added to the new ClickHouse Cloud pricing. - -:::note -Data transfer and ClickPipes pricing doesn't apply to legacy plans, i.e. Development, Production, and Dedicated, until 24 March 2025. -::: - -## Data transfer pricing {#data-transfer-pricing} - -### How are users charged for data transfer, and will this vary across organization tiers and regions? {#how-are-users-charged-for-data-transfer-and-will-this-vary-across-organization-tiers-and-regions} - -- Users pay for data transfer along two dimensions — public internet egress and inter-region egress. There are no charges for intra-region data transfer or Private Link/Private Service Connect use and data transfer. However, we reserve the right to implement additional data transfer pricing dimensions if we see usage patterns that impact our ability to charge users appropriately. -- Data transfer pricing varies by Cloud Service Provider (CSP) and region. -- Data transfer pricing does **not** vary between organizational tiers. -- Public egress pricing is based only on the origin region. 
Inter-region (or cross-region) pricing depends on both the origin and destination regions. - - - -### Will data transfer pricing be tiered as usage increases? {#will-data-transfer-pricing-be-tiered-as-usage-increases} - -Data transfer prices will **not** be tiered as usage increases. Pricing varies by region and cloud service provider. - -## ClickPipes pricing FAQ {#clickpipes-pricing-faq} - - diff --git a/docs/cloud/manage/jan2025_faq/index.md b/docs/cloud/manage/jan2025_faq/index.md deleted file mode 100644 index 840e07c06e7..00000000000 --- a/docs/cloud/manage/jan2025_faq/index.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: 'Jan 2025 Changes FAQ' -slug: /cloud/manage/jan-2025-faq -description: 'Index page for new pricing FAQ' -keywords: ['new pricing', 'faq'] ---- - - - - - diff --git a/docs/cloud/manage/jan2025_faq/new_tiers.md b/docs/cloud/manage/jan2025_faq/new_tiers.md deleted file mode 100644 index b90874aedb8..00000000000 --- a/docs/cloud/manage/jan2025_faq/new_tiers.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: 'Description of New Tiers' -slug: /cloud/manage/jan-2025-faq/new-tiers -keywords: ['new tiers', 'features', 'pricing', 'description'] -description: 'Description of new tiers and features' ---- - -## Summary of key changes {#summary-of-key-changes} - -### What key changes to expect with regard to features to tier mapping? {#what-key-changes-to-expect-with-regard-to-features-to-tier-mapping} - -- **Private Link/Private Service Connect:** Private connections are now supported across all service types on Scale and Enterprise tiers (including single-replica services). This means you can now have Private Link for both your production (large scale) and development (small scale) environments. -- **Backups:** All services now come with one backup by default and additional backups are charged separately. Users can leverage the configurable backup controls to manage additional backups. This means that services with lesser backup requirements do not need to pay a higher bundled price. Please see more details in the Backup FAQ. -- **Enhanced Encryption:** This feature is available in Enterprise tier services, including for single replica services, in AWS and GCP. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). -- **Single Sign On (SSO):** This feature is offered in Enterprise tier and requires a support ticket to be enabled for an Organization. Users who have multiple Organizations should ensure all of their organizations are on the Enterprise tier to use SSO for each organization. - -## Basic tier {#basic-tier} - -### What are the considerations for the Basic tier? {#what-are-the-considerations-for-the-basic-tier} - -The basic tier is meant for small workloads - users want to deploy a small analytics application that does not require high availability or work on a prototype. This tier is not suitable for workloads that need scale, reliability (DR/HA), and data durability. The tier supports single replica services of fixed size 1x8GiB or 1x12GiB. Please refer to the docs and [support policy](https://clickhouse.com/support/program) for more information. - -### Can users on the Basic tier access Private Link and Private Service Connect? {#can-users-on-the-basic-tier-access-private-link-and-private-service-connect} - -No, Users will need to upgrade to Scale or Enterprise to access this feature. - -### Can users on the Basic and Scale tiers set up SSO for the organization? 
{#can-users-on-the-basic-and-scale-tiers-set-up-sso-for-the-organization} - -No, users will need to upgrade to the Enterprise tier to access this feature. - -### Can users launch single replica services in all tiers? {#can-users-launch-single-replica-services-in-all-tiers} - -Yes, single replica services are supported on all three tiers. Users can scale out, but are not permitted to scale into a single replica. - -### Can users scale up/down and add more replicas on the Basic tier? {#can-users-scale-updown-and-add-more-replicas-on-the-basic-tier} - -No, services on this tier are meant to support workloads that are small and fixed size (single replica `1x8GiB` or `1x12GiB`). The size of the single replica is fixed upon service creation and cannot be adjusted or scaled after service creation. If users need to scale up/down or add replicas, they will be prompted to upgrade to Scale or Enterprise tiers. - -## Scale tier {#scale-tier} - -### Which tiers on the new plans (Basic/Scale/Enterprise) support compute-compute separation? {#which-tiers-on-the-new-plans-basicscaleenterprise-support-compute-compute-separation} - -Only Scale and Enterprise tiers support compute-compute separation. Please also note that this capability requires running at least a 2+ replica parent service. - -### Can users on the legacy plans (Production/Development) access compute-compute separation? {#can-users-on-the-legacy-plans-productiondevelopment-access-compute-compute-separation} - -Compute-compute separation is not supported on existing Development and Production services, except for users who already participated in the Private Preview and Beta. If you have additional questions, please contact [support](https://clickhouse.com/support/program). - -## Enterprise tier {#enterprise-tier} - -### What different hardware profiles are supported for the Enterprise tier? {#what-different-hardware-profiles-are-supported-for-the-enterprise-tier} - -The enterprise tier will support standard profiles (1:4 vCPU:memory ratio), as well as `highMem (1:8 ratio)` and `highCPU (1:2 ratio)` **custom profiles,** offering users more flexibility to select the configuration that best suits their needs. The Enterprise Tier will use shared compute resources deployed alongside the Basic and Scale tiers. - -### What are the features exclusively offered on the Enterprise tier? {#what-are-the-features-exclusively-offered-on-the-enterprise-tier} - -- **Custom profiles:** Options for instance type selection standard profiles (1:4 vCPU: memory ratio) and `highMem (1:8 ratio)` and `highCPU (1:2 ratio)` custom profiles. -- **Enterprise-grade security:** - - **Single Sign On (SSO**) - - **Enhanced Encryption:** For AWS and GCP services. Services are encrypted by our key by default and can be rotated to their key to enable Customer Managed Encryption Keys (CMEK). -- **Scheduled upgrades:** Users can select the day of the week/time window for upgrades, both database and cloud releases. -- **HIPAA Compliance:** The customer must sign a Business Associate Agreement (BAA) through Legal before we enable HIPAA-compliant regions for them. -- **Private Regions:** It is not self-serve enabled and will need users to route requests through Sales sales@clickhouse.com. -- **Export Backups** to the customer's cloud account. 
diff --git a/docs/cloud/manage/jan2025_faq/plan_migrations.md b/docs/cloud/manage/jan2025_faq/plan_migrations.md deleted file mode 100644 index fffdebfe45b..00000000000 --- a/docs/cloud/manage/jan2025_faq/plan_migrations.md +++ /dev/null @@ -1,108 +0,0 @@ ---- -title: 'Migrating to New Plans' -slug: /cloud/manage/jan-2025-faq/plan-migrations -keywords: ['migration', 'new tiers', 'pricing', 'cost', 'estimation'] -description: 'Migrating to new plans, tiers, pricing, how to decide and estimate costs' ---- - -## Choosing new plans {#choosing-new-plans} - -### Can new organizations launch services on the old (legacy) plan? {#can-new-organizations-launch-services-on-the-old-legacy-plan} - -No, newly created organizations will not have access to the old plan after the announcement. - -### Can users migrate to the new pricing plan self-serve? {#can-users-migrate-to-the-new-pricing-plan-self-serve} - -Yes, see below for guidance on self-serve migrations: - -| Current Plan | New Plan | Self-Serve Migration | |--------------|--------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| | Development | Basic | Supported if all services in the organization are Development services | | Development | Scale (2 replicas+) | :white_check_mark: | | Development | Enterprise (2 replicas+) | :white_check_mark: | | Production | Scale (3 replicas+) | :white_check_mark: | | Production | Enterprise (3 replicas+) | :white_check_mark: | | Dedicated | Contact [support](https://clickhouse.com/support/program) | - -### What will the experience be for users in trial running Development and Production services? {#what-will-the-experience-be-for-users-in-trial-running-development-and-production-services} - -Users can upgrade during the trial and continue to use the trial credits to evaluate the new service tiers and the features they support. However, if they choose to continue using the same Development and Production services, they can do so and upgrade to PAYG. They will still have to migrate before July 23, 2025. - -### Can users upgrade their tiers {#can-users-upgrade-their-tiers-ie-basic--scale-scale--enterprise-etc} - -Can users upgrade their tiers, for example, Basic → Scale, Scale → Enterprise, etc.? -Yes, users can upgrade self-serve, and the pricing will reflect the tier selection after upgrade. - -### Can users move from a higher to a lower-cost tier {#can-users-move-from-a-higher-to-a-lower-cost-tier-eg-enterprise--scale-scale--basic-enterprise--basic-self-serve} - -For example, Enterprise → Scale, Scale → Basic, Enterprise → Basic self-serve? -Yes, but users will need to remove all premium features and may be guided to scale their multi-replica services into a single replica. - -### Can users with only development services in the organization migrate to the Basic tier? {#can-users-with-only-development-services-in-the-organization-migrate-to-the-basic-tier} - -Yes, this would be permitted. Users will be given a recommendation based on their past use and can select Basic `1x8GiB` or `1x12GiB`. - -### Can users with a development and production service in the same organization move to the basic tier? {#can-users-with-a-development-and-production-service-in-the-same-organization-move-to-the-basic-tier} - -No, if a user has both Development and Production services in the same organization, they can self-serve and migrate only to the Scale or Enterprise tier.
If they want to migrate to Basic, they should delete all existing Production services. - -### Are there any changes related to the Scaling behavior with the new tiers? {#are-there-any-changes-related-to-the-scaling-behavior-with-the-new-tiers} - -We are introducing a new vertical scaling mechanism for compute replicas, which we call "Make Before Break" (MBB). This approach adds one or more replicas of the new size before removing the old replicas, preventing any loss of capacity during scaling operations. By eliminating the gap between removing existing replicas and adding new ones, MBB creates a more seamless and less disruptive scaling process. It is especially beneficial in scale-up scenarios, where high resource utilization triggers the need for additional capacity, since removing replicas prematurely would only exacerbate the resource constraints. - -Please note that as part of this change, historical system table data will be retained for up to a maximum of 30 days as part of scaling events. In addition, any system table data older than December 19, 2024, for services on AWS or GCP and older than January 14, 2025, for services on Azure will not be retained as part of the migration to the new organization tiers. - -## Estimating costs {#estimating-costs} - -### How will users be guided during migration, understanding what tier best fits their needs? {#how-will-users-be-guided-during-migration-understanding-what-tier-best-fits-their-needs} - -The console will prompt you with recommended options for each service based on historical use if you have a service. New users can review the capabilities and features listed in detail and decide on the tier that best suits their needs. - -### How do users size and estimate the cost of "warehouses" in the new pricing? {#how-do-users-size-and-estimate-the-cost-of-warehouses-in-the-new-pricing} - -Please refer to the pricing calculator on the [Pricing](https://clickhouse.com/pricing) page, which will help estimate the cost based on your workload size and tier selection. - -## Undertaking the migration {#undertaking-the-migration} - -### What are service version pre-requisites to undertaking the migration? {#what-are-service-version-pre-requisites-to-undertaking-the-migration} - -Your service has to be on version 24.8 or later and already migrated to SharedMergeTree. - -### What is the migration experience for users of the current Development and Production services? Do users need to plan for a maintenance window where the service is unavailable? {#what-is-the-migration-experience-for-users-of-the-current-development-and-production-services-do-users-need-to-plan-for-a-maintenance-window-where-the-service-is-unavailable} - -Migrations of Development and Production services to the new pricing tiers may trigger a rolling restart. To migrate a Dedicated service, please contact [support](https://clickhouse.com/support/program). - -### What other actions should a user take after the migration? {#what-other-actions-should-a-user-take-after-the-migration} - -API access patterns will be different. - -Users that use our OpenAPI to create new services will be required to remove the `tier` field in the service creation `POST` request. - -The `tier` field has been removed from the service object as we no longer have service tiers. -This will affect the objects returned by the `POST`, `GET`, and `PATCH` service requests. Therefore, any code that consumes these APIs may need to be adjusted to handle these changes. 
- -The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. -For the Scale and the Enterprise tiers, it is possible to adjust it by passing a `numReplicas` field in the service creation request. -The value of the `numReplicas` field must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. - -### What changes should the users make if using the existing Terraform provider for automation? {#what-changes-should-the-users-make-if-using-the-existing-terraform-provider-for-automation} - -Once an organization has been migrated to one of the new plans, users will be required to use our Terraform provider version 2.0.0 or above. - -The new Terraform provider is required to handle changes in the `tier` attribute of the service. - -After the migration, the `tier` field is no longer accepted, and references to it should be removed. - -Users will also be able to specify the `num_replicas` field as a property of the service resource. - -The number of replicas each service will be created with defaults to 3 for the Scale and Enterprise tiers, while it defaults to 1 for the Basic tier. -For the Scale and the Enterprise tiers, it is possible to adjust it by setting the `num_replicas` attribute on the service resource. -The value of the `num_replicas` field must be between 2 and 20 for the first service in a warehouse. Services that are created in an existing warehouse can have a number of replicas as low as 1. - -### Will users have to make any changes to the database access? {#will-users-have-to-make-any-changes-to-the-database-access} - -No, the database username/password will work the same as before. - -### Will users have to reconfigure private networking features? {#will-users-have-to-reconfigure-private-networking-features} - -No, users can use their existing private networking (Private Link, PSC, etc.) configuration after moving their Production service to Scale or Enterprise. diff --git a/docs/cloud/manage/jan2025_faq/scaling.md b/docs/cloud/manage/jan2025_faq/scaling.md deleted file mode 100644 index e65aff7345e..00000000000 --- a/docs/cloud/manage/jan2025_faq/scaling.md +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: 'Scaling' -slug: /cloud/manage/jan-2025-faq/scaling -keywords: ['new pricing', 'faq', 'scaling'] -description: 'Scaling behavior in new pricing tiers' ---- - -ClickHouse Cloud allows scaling in both directions - vertical (increasing replica size) and horizontal (adding more replicas). - -## What scaling options will be available for each tier? {#what-scaling-options-will-be-available-for-each-tier} - -The scaling behavior per tier is as follows: - -* **Basic**: Basic tier supports only single replica services. These services are meant to be fixed in size and do not allow vertical or horizontal scaling. Users can upgrade to the Scale or Enterprise tier to scale their services. -* **Scale**: Scale tier supports single and multi-replica services. Scaling will be permitted for Multi-replica services. - * Services can vertically scale to the maximum replica size supported for a CSP/region AFTER they have scaled to a multi-replica setup; only 2+ replicas can be vertically scaled. - * Manual horizontal scaling will be available.
-* **Enterprise**: Enterprise tier supports single and multi-replica services, and scaling will be permitted for Multi-replica services - * Services can vertically scale to maximum replica sizes supported for a CSP/region. - * Standard profiles (1:4 CPU to memory ratio) will support vertical auto-scaling - * Custom profiles (`highMemory` and `highCPU`) can be scaled vertically through a support ticket. - * Manual horizontal scaling will be available. - -:::note -Services can scale horizontally to a maximum of 20 replicas. If you need additional replicas, please contact our support team. -::: - -## Can users scale in their service? {#can-users-scale-in-their-service} - -Scaling in will be restricted to 2+ replicas. Once scaled out, users will not be permitted to scale down to a single replica, as this may result in instability and potential data loss. - -## Are there any changes related to the Scaling behavior with the new tiers? {#are-there-any-changes-related-to-the-scaling-behavior-with-the-new-tiers} - -We are introducing a new vertical scaling mechanism for compute replicas, which we call "Make Before Break" (MBB). This approach adds one or more replicas of the new size before removing the old replicas, preventing any loss of capacity during scaling operations. By eliminating the gap between removing existing replicas and adding new ones, MBB creates a more seamless and less disruptive scaling process. It is especially beneficial in scale-up scenarios, where high resource utilization triggers the need for additional capacity, since removing replicas prematurely would only exacerbate the resource constraints. - -Please note that as part of this change, historical system table data will be retained for up to a maximum of 30 days as part of scaling events. In addition, any system table data older than December 19, 2024, for services on AWS or GCP and older than January 14, 2025, for services on Azure will not be retained as part of the migration to the new organization tiers. diff --git a/docs/cloud/manage/jan2025_faq/summary.md b/docs/cloud/manage/jan2025_faq/summary.md deleted file mode 100644 index dfeafe642d3..00000000000 --- a/docs/cloud/manage/jan2025_faq/summary.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: 'Summary' -slug: /cloud/manage/jan-2025-faq/summary -keywords: ['new tiers', 'packaging', 'pricing faq', 'summary'] -description: 'Summary of New ClickHouse Cloud Tiers' ---- - -The following FAQ summarizes common questions with respect to new tiers introduced in ClickHouse Cloud starting in January 2025. - -## What has changed with ClickHouse Cloud tiers? {#what-has-changed-with-clickhouse-cloud-tiers} - -At ClickHouse, we are dedicated to adapting our products to meet the ever-changing requirements of our customers. Since its introduction in GA over the past two years, ClickHouse Cloud has evolved substantially, and we've gained invaluable insights into how our customers leverage our cloud offerings. - -We are introducing new features to optimize the sizing and cost-efficiency of ClickHouse Cloud services for your workloads. These include compute-compute separation, high-performance machine types, and single-replica services. We are also evolving automatic scaling and managed upgrades to execute in a more seamless and reactive fashion. 
- -We are adding a new Enterprise tier to serve the needs of the most demanding customers and workloads, with focus on industry-specific security and compliance features, even more controls over underlying hardware and upgrades, and advanced disaster recovery features. - -You can read about these and other functional changes in this [blog](https://clickhouse.com/blog/evolution-of-clickhouse-cloud-new-features-superior-performance-tailored-offerings). - -## What action is required? {#what-action-is-required} - -To support these changes, we are restructuring our current tiers to more closely match how our evolving customer base is using our offerings, and you need to take action to select a new plan. - -Details and timelines for making these selections are described below. - -## How are tiers changing? {#how-are-tiers-changing} - -We are transitioning from a model that organizes paid tiers purely by "service types" which are delineated by both capacity and features (namely, these are Development, Production, and Dedicated tiers) to one that organizes paid tiers by feature availability. These new tiers are called Basic, Scale, and Enterprise and are described in more detail below. - -This change brings several key benefits: - -* **Consistent Feature Access**: Features present in a tier will be available in that tier for all sizes of services, as well as in all tiers above it. For example, private networking, previously available only for Production service types, will now be accessible for all services starting with the Scale tier, so you can deploy it for services sized both for development and production workloads as you see fit. - -* **Organizational-Level Features**: We can now provide features built at an organizational level with the appropriate plan, ensuring that customers receive the tools they need at the right level of service. For example, access to SSO (single-sign-on) and CMEK (customer-managed encryption keys) will be available at the Enterprise tier. - -* **Optimized Support Plans**: The new packaging structure also allows us to align support response times with paid tiers, which more effectively meet the needs of our diverse customer base. For example, we are now making named support engineers available to our Enterprise tier customers. - -Below we provide an overview of the new tiers, describe how they relate to use cases, and outline key features. - -**Basic: A taste of ClickHouse** - -* Basic tier is designed to offer a budget-friendly option for organizations with smaller data volumes and less demanding workloads. It allows you to run single-replica deployments with up to 12GB of memory and less than 1TB of storage and is ideal for small-scale use cases that do not require reliability guarantees. - -**Scale: Enhanced SLAs and scalability** - -* Scale tier is suitable for workloads that require enhanced SLAs, greater scalability, and advanced security measures. -* It offers unlimited compute and storage with any replication factor, access to compute-compute separation, and automatic vertical and horizontal scaling. -* Key features include: - * Support for private networking, customized backup controls, multi-factor auth, and more - * Compute-compute separation for optimized resource usage - * Flexible scaling options (both vertical and horizontal) to meet changing demands - -**Enterprise: Mission-critical deployments** - -* Enterprise tier is the best place to run large-scale, mission-critical ClickHouse deployments. 
-* It is best suited for organizations with stringent security and compliance needs, requiring the highest levels of performance and reliability. -* Key features include: - * Industry-specific compliance certifications, such as HIPAA - * Self-service access to SSO (Single Sign-On) and CMEK (Customer Managed Encryption Keys) - * Scheduled upgrades to ensure minimal disruption - * Support for custom configurations, including high-memory, high-CPU options, and private regions - -New tiers are described in more detail on our [website](https://clickhouse.com/pricing). - -## How is pricing changing? {#how-is-pricing-changing} - -In addition to evolving our paid tiers, we are making the following adjustments to our overall pricing structure and price points: - -* **Storage**: Storage price per TB will be reduced and will no longer bundle backups in the storage cost. -* **Backups**: Backups will be charged separately, with only one backup being mandatory. -* **Compute**: Compute costs will increase, varying by tier and region. This increase may be balanced by the introduction of compute-compute separation and single-replica services, which allow you to optimize compute usage by deploying and right-sizing services tailored to different workload types. -* **Data Transfer**: We are introducing charges for data egress, specifically for data transfer over the internet and cross region. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. -* **ClickPipes**: Our managed ingest service, which was offered for free during the introductory period, will now incur charges based on compute and ingested data. Based on our analysis, most customers will not see a substantial increase in their monthly bill based on this new dimension. - -## When will these changes take effect? {#when-will-these-changes-take-effect} - -While changes are effective immediately for new customers, existing customers will have from 6 months to a year to transition to new plans. - -Detailed breakdown of effective dates is below: - -* **New Customers**: The new plans will take effect on **January 27, 2025** for new customers of ClickHouse Cloud. -* **Existing PAYG Customers**: Pay-as-you-go (PAYG) customers will have 6 months until **July 23, 2025** to migrate to new plans. -* **Existing Committed Spend Customers**: Customers with committed spend agreements can renegotiate their terms at the end of their current contract. -* **New usage dimensions** for Data Transfer and ClickPipes are effective for both PAYG and Committed Spend customers 8 weeks following this announcement on **March 24, 2025**. - -## What actions should you take? {#what-actions-should-you-take} - -If you are a **pay-as-you-go (PAYG) customer**, you can migrate to a new plan through the self-service options available in your ClickHouse Cloud console. - -If you are a **committed spend customer**, please reach out to your account representative to discuss your custom migration plan and timeline. - -**Need assistance?** -We're here to support you through this transition. If you have any questions or need personalized help, please reach out to your account representative or contact our support team. 
diff --git a/docs/cloud/manage/network-data-transfer.mdx b/docs/cloud/manage/network-data-transfer.mdx deleted file mode 100644 index 92725e6015c..00000000000 --- a/docs/cloud/manage/network-data-transfer.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -sidebar_label: 'Data Transfer' -slug: /cloud/manage/network-data-transfer -title: 'Data Transfer' -description: 'Learn more about how ClickHouse Cloud meters data transferred ingress and egress' ---- - -import NetworkPricing from '@site/docs/cloud/manage/_snippets/_network_transfer_rates.md'; - -ClickHouse Cloud meters data transferred ingress and egress. -This includes any data in and out of ClickHouse Cloud as well as any intra-region and cross-region data transfer. -This usage is tracked at the service level. Based on this usage, customers incur data transfer charges that are then added to their monthly bill. - -ClickHouse Cloud charges for: -- Data egress from ClickHouse Cloud to the public Internet, including to other regions of other cloud providers. -- Data egress to another region in the same cloud provider. - -There are no charges for intra-region data transfer or Private Link/Private Service Connect use and data transfer. -However, we reserve the right to implement additional data transfer pricing dimensions if we see usage patterns that impact our ability to charge users appropriately. - -Data transfer charges vary by Cloud Service Provider (CSP) and region, and prices will not be tiered as usage increases. Public internet egress pricing is based only on the origin region. -Inter-region (or cross-region) pricing depends on both the origin and destination regions. Data transfer pricing does **not** vary between organizational tiers. - -**Best Practices to minimize Data Transfer Costs** - -There are some patterns to keep in mind when ingressing and egressing data to minimize data transfer costs. -1. When ingressing or egressing data from ClickHouse Cloud, use compression where possible, to minimize the amount of data transferred and the associated cost. -2. Be aware that when doing an INSERT over the native protocol with non-inlined values (e.g. INSERT INTO [TABLE] FROM INFILE [FILE] FORMAT NATIVE), ClickHouse clients pull metadata from servers to pack the data. If the metadata is larger than the INSERT payload, you might counterintuitively see more egress than there is ingress from the server perspective. If this is unacceptable, consider inlining data with VALUES syntax or using the HTTP protocol. - -The tables below show how data transfer charges for egress vary across public internet or cross-region by cloud provider and region. - -:::note -ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through Tier 4, depending on the origin and destination regions. The table below shows the tier for each combination of inter-region data transfer. In the Billing usage screen on ClickHouse Cloud you will see data transfer usage broken out by tiers.
-::: - - diff --git a/docs/cloud/migrate/upload-a-csv-file.md b/docs/cloud/migrate/upload-a-csv-file.md deleted file mode 100644 index 71347b0f55e..00000000000 --- a/docs/cloud/migrate/upload-a-csv-file.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -title: 'Uploading files' -slug: /cloud/migrate/upload-a-csv-file -description: 'Learn how to upload files to Cloud' ---- - -import Image from '@theme/IdealImage'; -import csv_01 from '@site/static/images/cloud/migrate/csv_01.png'; -import csv_02 from '@site/static/images/cloud/migrate/csv_02.png'; -import csv_03 from '@site/static/images/cloud/migrate/csv_03.png'; -import csv_04 from '@site/static/images/cloud/migrate/csv_04.png'; -import csv_05 from '@site/static/images/cloud/migrate/csv_05.png'; -import csv_06 from '@site/static/images/cloud/migrate/csv_06.png'; -import csv_07 from '@site/static/images/cloud/migrate/csv_07.png'; -import csv_08 from '@site/static/images/cloud/migrate/csv_08.png'; -import csv_09 from '@site/static/images/cloud/migrate/csv_09.png'; -import csv_10 from '@site/static/images/cloud/migrate/csv_10.png'; - -# Upload files to Cloud - -ClickHouse Cloud provides an easy way to import your files and supports the -following formats: - -| Format | |---------------------------------| | `CSV` | | `CSVWithNamesAndTypes` | | `CSVWithNames` | | `JSONEachRow` | | `TabSeparated` | | `TabSeparatedWithNames` | | `TabSeparatedWithNamesAndTypes` | - - - -## Upload a file {#upload-file} - -From the Cloud homepage, select your service as shown below: - -upload_file_02 - -If your service is idle, you will need to wake it. - -Select `Data sources` in the left-hand tab as shown below: - -upload_file_03 - -Next select `Upload a file` on the right side of the data sources page: - -upload_file_04 - -A file dialog will pop up, allowing you to select the file that you wish to -use to insert data into a table on your Cloud service. - -upload_file_05 - -## Configure table {#configure-table} - -Once the file has uploaded, you will be able to configure the table into which you want -to insert the data. A preview of the table with the first three rows is shown. - -upload_file_08 - -You can now select a destination table. The options are: - -- a new table -- an existing table -
-You can specify which database you want to upload the data to, and in the case of -a new table, the name of the table that will be created. You will also be able to select the sorting key: - -upload_file_05 - -Columns read from the file are shown as `Source field`s and for each field, you -can change: -- the inferred type -- the default value -- whether to make the column [Nullable](/sql-reference/data-types/nullable) or not - -upload_file_06 - -:::note Excluding fields -You can also remove a field if you don't want to include it in the import -::: - -You can specify the type of table engine that you want to use: - -- `MergeTree` -- `ReplacingMergeTree` -- `SummingMergeTree` -- `Null` -
-You can specify a partitioning key expression and primary -key expression. - -upload_file_07 - -Click `Import to ClickHouse` (shown above) to import the data. The data import will be queued as -indicated by the `queued` status badge in the `Status` column as shown below. You can also click -`Open as query` (shown above) to open the insert query in the SQL console. The query will insert -the file which was uploaded to an S3 bucket using the `URL` table function. - -upload_file_09 - -If the job fails you will see a `failed` status badge under the `Status` column of -the `Data upload history` tab. You can click `View Details` for more information -on why the upload failed. You may need to modify the table configuration or clean -the data based on the error message for the failed insert. - -upload_file_11 - -
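The query behind `Open as query` is ordinary SQL. A minimal, hypothetical sketch of what it might look like is shown below; the database, table, staging bucket URL, and file name are placeholders rather than the values the console actually generates:

```sql
-- Hypothetical sketch of the generated insert; all names below are placeholders
INSERT INTO db1.uploaded_data
SELECT *
FROM url('https://<staging-bucket>.s3.amazonaws.com/<uploaded-file>.csv', 'CSVWithNames');
```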
\ No newline at end of file diff --git a/docs/cloud/onboard/01_discover/01_what_is.md b/docs/cloud/onboard/01_discover/01_what_is.md new file mode 100644 index 00000000000..37a515a551f --- /dev/null +++ b/docs/cloud/onboard/01_discover/01_what_is.md @@ -0,0 +1,47 @@ +--- +slug: /cloud/overview +title: 'Introduction' +description: 'Learn what ClickHouse Cloud is, its benefits over open-source, and key features of the fully managed analytics platform' +keywords: ['clickhouse cloud', 'what is clickhouse cloud', 'clickhouse cloud overview', 'clickhouse cloud features'] +hide_title: true +--- + +## What is ClickHouse Cloud? {#what-is-clickhouse-cloud} + +ClickHouse Cloud is a fully managed cloud service created by the original creators +of ClickHouse, the fastest and most popular open-source columnar online analytical +processing database. + +With Cloud, infrastructure, maintenance, scaling, and operations are taken care of +for you, so that you can focus on what matters most to you, which is building value +for your organization and your customers faster. + +## Benefits of ClickHouse Cloud {#benefits-of-clickhouse-cloud} + +ClickHouse Cloud offers several major benefits over the open-source version: + +- **Fast time to value**: Start building instantly without having to size and scale your cluster. +- **Seamless scaling**: Automatic scaling adjusts to variable workloads so you don't have to over-provision for peak usage. +- **Serverless operations**: Sit back while we take care of sizing, scaling, security, reliability, and upgrades. +- **Transparent pricing**: Pay only for what you use, with resource reservations and scaling controls. +- **Total cost of ownership**: Best price / performance ratio and low administrative overhead. +- **Broad ecosystem**: Bring your favorite data connectors, visualization tools, SQL and language clients with you. + +## OSS vs ClickHouse Cloud comparison {#oss-vs-clickhouse-cloud} + +| Feature | Benefits | OSS ClickHouse | ClickHouse Cloud | +|--------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------|-------------------| +| **Deployment modes** | ClickHouse provides the flexibility to self-manage with open-source or deploy in the cloud. Use ClickHouse local for local files without a server or chDB to embed ClickHouse directly into your application. | ✅ | ✅ | +| **Storage** | As an open-source and cloud-hosted product, ClickHouse can be deployed in both shared-disk and shared-nothing architectures. | ✅ | ✅ | +| **Monitoring and alerting** | Monitoring and alerting about the status of your services is critical to ensuring optimal performance and a proactive approach to detect and triage potential issues. | ✅ | ✅ | +| **ClickPipes** | ClickPipes is ClickHouse's managed ingestion pipeline that allows you to seamlessly connect your external data sources like databases, APIs, and streaming services into ClickHouse Cloud, eliminating the need for managing pipelines, custom jobs, or ETL processes. It supports workloads of all sizes. | ❌ | ✅ | +| **Pre-built integrations** | ClickHouse provides pre-built integrations that connect ClickHouse to popular tools and services such as data lakes, SQL and language clients, visualization libraries, and more. 
| ❌ | ✅ | | **SQL console** | The SQL console offers a fast, intuitive way to connect, explore, and query ClickHouse databases, featuring a slick query interface, data import tools, visualizations, collaboration features, and GenAI-powered SQL assistance. | ❌ | ✅ | | **Compliance** | ClickHouse Cloud compliance includes CCPA, EU-US DPF, GDPR, HIPAA, ISO 27001, ISO 27001 SoA, PCI DSS, SOC2. ClickHouse Cloud's security, availability, processing integrity, and confidentiality processes are all independently audited. Details: trust.clickhouse.com. | ❌ | ✅ | | **Enterprise-grade security** | Support for advanced security features such as SSO, multi-factor authentication, role-based access control (RBAC), private and secure connections with support for Private Link and Private Service Connect, IP filtering, customer-managed encryption keys (CMEK), and more. | ❌ | ✅ | | **Scaling and optimization** | Seamlessly scales up or down based on workload, supporting both horizontal and vertical scaling. With automated backups, replication, and high availability, it provides users with optimal resource allocation. | ❌ | ✅ | | **Support services** | Our best-in-class support services and open-source community resources provide coverage for whichever deployment model you choose. | ❌ | ✅ | | **Database upgrades** | Regular database upgrades are essential to establish a strong security posture and access the latest features and performance improvements. | ❌ | ✅ | | **Backups** | Backup and restore functionality ensures data durability and supports graceful recovery in the event of outages or other disruptions. | ❌ | ✅ | | **Compute-compute separation** | Users can scale compute resources independently of storage, so teams and workloads can share the same storage and maintain dedicated compute resources. This ensures that the performance of one workload doesn't interfere with another, enhancing flexibility, performance, and cost-efficiency. | ❌ | ✅ | | **Managed services** | With a cloud-managed service, teams can focus on business outcomes and accelerate time-to-market without having to worry about the operational overhead of sizing, setup, and maintenance of ClickHouse. | ❌ | ✅ | diff --git a/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md b/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md new file mode 100644 index 00000000000..7ff402f77ea --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/00_overview.md @@ -0,0 +1,21 @@ +--- +slug: /cloud/get-started/cloud/use-cases/overview +title: 'Building on ClickHouse Cloud' +description: 'Explore ClickHouse Cloud use cases including real-time analytics, observability, data lake & warehouse, and machine learning applications' +keywords: ['use cases', 'Cloud'] +sidebar_label: 'Overview' +--- + +ClickHouse Cloud is suitable for use as both a **primary data store** and as an **analytics +layer**. + +ClickHouse's columnar architecture, vectorized processing, and cloud-native design +make it uniquely suited for analytical workloads that require both speed and scale.
+Broadly, the most common use cases for ClickHouse Cloud are: + +| Use case | Description | +|----------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [Real-Time analytics](/cloud/get-started/cloud/use-cases/real-time-analytics) | ClickHouse Cloud excels at real-time analytics by delivering sub-second query responses on billions of rows through its columnar storage architecture and vectorized execution engine. The platform handles high-throughput data ingestion of millions of events per second while enabling direct queries on raw data without requiring pre-aggregation. Materialized Views provide real-time aggregations and pre-computed results, while approximate functions for quantiles and counts deliver instant insights perfect for interactive dashboards and real-time decision making.| +| [Observability](/cloud/get-started/cloud/use-cases/observability) | ClickHouse Cloud is purpose-built for observability workloads, featuring specialized engines and functions optimized for time-series data that can ingest and query terabytes of logs, metrics, and traces with ease. Through ClickStack, ClickHouse's comprehensive observability solution, organizations can break down the traditional three silos of logs, metrics, and traces by unifying all observability data in a single platform, enabling correlated analysis and eliminating the complexity of managing separate systems. This unified approach makes it ideal for application performance monitoring, infrastructure monitoring, and security event analysis at enterprise scale, with ClickStack providing the tools and integrations needed for complete observability workflows without data silos.| +| [Machine Learning and GenAI](/cloud/get-started/cloud/use-cases/AI_ML) | ClickHouse Cloud powers modern AI applications through four key capabilities: native vector similarity search for RAG applications and embedding storage, comprehensive feature store functionality for real-time ML feature engineering and serving, specialized LLM observability for tracking model performance and usage patterns, and integrated MCP (Model Context Protocol) server support that enables AI agents and LLMs to directly query and analyze data. This unified platform eliminates the complexity of managing separate systems for vector databases, feature stores, and observability tools, providing a single solution for the entire AI/ML data pipeline with ClickHouse's signature performance and scalability.| +| [Data Lake and Warehouse](/cloud/get-started/cloud/use-cases/data_lake_and_warehouse) | As a modern data warehouse solution, ClickHouse Cloud combines native cloud storage integration with S3, GCS, and Azure Blob for cost-effective storage with schema-on-read flexibility that supports semi-structured data like JSON and nested types. The platform achieves massive compression ratios of 10:1 or better, significantly reducing storage costs, while its compute-storage separation architecture allows independent scaling and cost optimization. 
Users benefit from a standard SQL interface enhanced with advanced analytics functions, making it easy to query and analyze data at any scale.| diff --git a/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md b/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md new file mode 100644 index 00000000000..67aa054a4b0 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/01_real-time-analytics.md @@ -0,0 +1,160 @@ +--- +slug: /cloud/get-started/cloud/use-cases/real-time-analytics +title: 'Real-time analytics' +description: 'Learn how to build real-time analytics applications with ClickHouse Cloud for instant insights and data-driven decision making' +keywords: ['use cases', 'real-time analytics'] +sidebar_label: 'Real-time analytics' +--- + +import Image from '@theme/IdealImage'; +import rta_0 from '@site/static/images/cloud/onboard/discover/use_cases/0_rta.png'; +import rta_1 from '@site/static/images/cloud/onboard/discover/use_cases/1_rta.png'; +import rta_2 from '@site/static/images/cloud/onboard/discover/use_cases/2_rta.png'; +import rta_3 from '@site/static/images/cloud/onboard/discover/use_cases/3_rta.png'; + + + +## What is real-time analytics? {#what-is-real-time-analytics} + +Real-time analytics refers to data processing that delivers insights to end users +and customers as soon as the data is generated. It differs from traditional or +batch analytics, where data is collected in batches and processed, often a long +time after it was generated. + +Real-time analytics systems are built on top of event streams, which consist of +a series of events ordered in time. An event is something that’s already happened. +It could be the addition of an item to the shopping cart on an e-commerce website, +the emission of a reading from an Internet of Things (IoT) sensor, or a shot on +goal in a football (soccer) match. + +An event (from an imaginary IoT sensor) is shown below, as an example: + +```json +{ + "deviceId": "sensor-001", + "timestamp": "2023-10-05T14:30:00Z", + "eventType": "temperatureAlert", + "data": { + "temperature": 28.5, + "unit": "Celsius", + "thresholdExceeded": true + } +} +``` + +Organizations can discover insights about their customers by aggregating and +analyzing events like this. This has traditionally been done using batch analytics, +and in the next section, we’ll compare batch and real-time analytics. + +## Real-Time analytics vs batch analytics {#real-time-analytics-vs-batch-analytics} + +The diagram below shows what a typical batch analytics system would look like +from the perspective of an individual event: + +batch analytics diagram + +You can see that there’s quite a big gap from when the event happens until we +process and gain some insight from it. Traditionally, this was the only means of +data analysis, and we’d need to create artificial time boundaries to process +the data in batches. For example, we might process all the data collected at the +end of a day. This worked for many use cases, but for others, it’s sub-optimal +because we’re working with stale data, and it doesn’t allow us to react to the +data quickly enough. + +By contrast, in real-time analytics systems, we react to an event as soon as it +happens, as shown in the following diagram: + +Real-time analytics diagram + +We can now derive insights from events almost as soon as they’re generated. But +why is this useful? 
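Before looking at why, it helps to make "aggregating and analyzing events like this" concrete. A minimal sketch, assuming the sensor events shown above are stored in a hypothetical `sensor_events` table (table and column names are illustrative):

```sql
-- Hypothetical table for the IoT events shown earlier
CREATE TABLE sensor_events
(
    device_id   String,
    timestamp   DateTime,
    event_type  LowCardinality(String),
    temperature Float64
)
ENGINE = MergeTree
ORDER BY (device_id, timestamp);

-- Per-minute maximum temperature per device over the last hour,
-- the kind of aggregation a real-time dashboard would poll
SELECT
    device_id,
    toStartOfMinute(timestamp) AS minute,
    max(temperature) AS max_temp
FROM sensor_events
WHERE timestamp > now() - INTERVAL 1 HOUR
GROUP BY device_id, minute
ORDER BY device_id, minute;
```

Because new rows are queryable almost as soon as they are inserted, a dashboard polling a query like this reflects fresh readings within seconds.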
+
+## Benefits of real-time analytics {#benefits-of-real-time-analytics}
+
+In today's fast-paced world, organizations rely on real-time analytics to stay
+agile and responsive to ever-changing conditions. A real-time analytics system
+can benefit a business in many ways.
+
+### Better decision-making {#better-decision-making}
+
+Decision-making can be improved by having access to actionable insights via
+real-time analytics. When business operators can see events as they're happening,
+it makes it much easier to make timely interventions.
+
+For example, if we make changes to an application and want to know whether it's
+having a detrimental effect on the user experience, we want to know this as
+quickly as possible so that we can revert the changes if necessary. With a less
+real-time approach, we might have to wait until the next day to do this
+analysis, by which time we'll have a lot of unhappy users.
+
+### New products and revenue streams {#new-products-and-revenue-streams}
+
+Real-time analytics can help businesses generate new revenue streams. Organizations
+can develop new data-centered products and services that give users access to
+analytical querying capabilities. These products are often compelling enough for
+users to pay for access.
+
+In addition, existing applications can be made stickier, increasing user
+engagement and retention. This will result in more application use, creating more
+revenue for the organization.
+
+### Improved customer experience {#improved-customer-experience}
+
+With real-time analytics, businesses can gain instant insights into customer
+behavior, preferences, and needs. This lets businesses offer timely assistance,
+personalize interactions, and create more engaging experiences that keep
+customers returning.
+
+## Real-time analytics use cases {#real-time-analytics-use-cases}
+
+The actual value of real-time analytics becomes evident when we consider its
+practical applications. Let's examine some of them.
+
+### Fraud detection {#fraud-detection}
+
+Fraud detection is about identifying fraudulent patterns, ranging from fake accounts
+to payment fraud. We want to detect this fraud as quickly as possible, flagging
+suspicious activities, blocking transactions, and disabling accounts when necessary.
+
+This use case stretches across industries: healthcare, digital banking, financial
+services, retail, and more.
+
+[Instacart](https://www.instacart.com/) is North America's leading online grocery
+company, with millions of active customers and shoppers. It uses ClickHouse as
+part of Yoda, its fraud detection platform. In addition to the general types of
+fraud described above, it also tries to detect collusion between customers and
+shoppers.
+
+Real-time analytics for fraud detection
+
+They identified the following characteristics of ClickHouse that enable real-time
+fraud detection:
+
+> ClickHouse supports LSM-tree based MergeTree family engines.
+> These are optimized for writing, which is suitable for ingesting large amounts
+> of data in real-time.
+
+> ClickHouse is designed and optimized explicitly for analytical queries. This
+> fits perfectly with the needs of applications where data is continuously
+> analyzed for patterns that might indicate fraud.
+
+### Time-sensitive decision making {#time-sensitive-decision-making}
+
+Time-sensitive decision-making refers to situations where users or organizations
+need to make informed choices quickly based on the most current information
+available. 
Real-time analytics empowers users to make informed choices in
+dynamic environments, whether they're traders reacting to market fluctuations,
+consumers making purchasing decisions, or professionals adapting to real-time
+operational changes.
+
+Coinhall provides its users with real-time insights into price movements over
+time via a candlestick chart, which shows the open, high, low, and close prices
+for each trading period. They needed to be able to run these types of queries
+quickly and with a large number of concurrent users.
+
+Real-time analytics for time-sensitive decision making
+
+> In terms of performance, ClickHouse was the clear winner, executing candlestick queries in 20 milliseconds, compared
+> to 400 milliseconds or more for the other databases. It ran latest-price queries in 8 milliseconds, outpacing the
+> next-best performance (SingleStore) which came in at 45 milliseconds. Finally, it handled ASOF JOIN queries in
+> 50 milliseconds, while Snowflake took 20 minutes and Rockset timed out.
diff --git a/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md b/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md
new file mode 100644
index 00000000000..97bfb5b263f
--- /dev/null
+++ b/docs/cloud/onboard/01_discover/02_use_cases/02_observability.md
@@ -0,0 +1,230 @@
+---
+slug: /cloud/get-started/cloud/use-cases/observability
+title: 'Observability'
+description: 'Use ClickHouse Cloud for observability, monitoring, logging, and system performance analysis in distributed applications'
+keywords: ['use cases', 'observability']
+sidebar_label: 'Observability'
+---
+
+
+
+Modern software systems are complex. Microservices, cloud infrastructure, and
+distributed systems have made it increasingly difficult to understand what's
+happening inside our applications. When something goes wrong, teams need to know
+where and why as quickly as possible.
+
+This is where observability comes in. It has evolved from simple system monitoring
+into a comprehensive approach to understanding system behavior. However,
+implementing effective observability isn't straightforward - it requires
+understanding technical concepts and organizational challenges.
+
+## What is Observability? {#what-is-observability}
+
+Observability is the practice of understanding a system's internal state by examining its outputs.
+In software systems, this means understanding what's happening inside your
+applications and infrastructure through the data they generate.
+
+This field has evolved significantly and can be understood through two distinct
+generations of observability approaches.
+
+The first generation, often called Observability 1.0, was built around the
+traditional "three pillars" approach of metrics, logs, and traces. This approach
+required multiple tools and data stores for different types of telemetry. It
+often forced engineers to pre-define what they wanted to measure, making it
+costly and complex to maintain multiple systems.
+
+Modern observability, or Observability 2.0, takes a fundamentally different
+approach. It's based on collecting wide, structured events for each unit of work
+(e.g., an HTTP request and response) in our system. This approach captures
+high-cardinality data, such as user IDs, request IDs, Git commit hashes,
+instance IDs, Kubernetes pod names, specific route parameters, and vendor
+transaction IDs. A rule of thumb is to add a piece of metadata if it could help
+us understand how the system behaves. 
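+
+As a purely illustrative sketch (the table and column names below are assumptions,
+not part of ClickStack or any particular schema), a wide event of this kind can be
+modeled as a single row, with high-cardinality metadata kept as ordinary columns
+that can be sliced later:
+
+```sql
+CREATE TABLE http_events
+(
+    timestamp   DateTime64(3, 'UTC'),
+    service     LowCardinality(String),
+    route       String,
+    status_code UInt16,
+    duration_ms Float64,
+    user_id     String,                -- high-cardinality metadata
+    request_id  String,
+    git_commit  String,
+    k8s_pod     String,
+    attributes  Map(String, String)    -- anything else that might help later
+)
+ENGINE = MergeTree
+ORDER BY (service, timestamp);
+
+-- Derive a metric that was never pre-defined:
+-- p95 latency per route for a single deployment over the last hour
+SELECT
+    route,
+    quantile(0.95)(duration_ms) AS p95_ms
+FROM http_events
+WHERE git_commit = 'abc123'
+  AND timestamp > now() - INTERVAL 1 HOUR
+GROUP BY route
+ORDER BY p95_ms DESC;
+```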
+ +This rich data collection enables dynamic slicing and dicing of data without +pre-defining metrics. Teams can derive metrics, traces, and other visualizations +from this base data, allowing them to answer complex questions about system +behavior that weren't anticipated when the instrumentation was first added. + +However, implementing modern observability capabilities presents its challenges. +Organizations need reliable ways to collect, process, and export this rich +telemetry data across diverse systems and technologies. While modern approaches +have evolved beyond traditional boundaries, understanding the fundamental +building blocks of observability remains crucial. + +## The three pillars of observability {#three-pillars-of-observability} + +To better understand how observability has evolved and works in practice, let's +examine the three pillars of observability - logs, metrics, and traces. + +While modern observability has moved beyond treating these as separate concerns, +they remain fundamental concepts for understanding different aspects of system +behavior. + +1. **Logs** - Text-based records of discrete events that occur within a system. +These provide detailed context about specific occurrences, errors, and state changes. +2. **Metrics** - Numerical measurements collected over time. These include counters, +gauges, and histograms that help track system performance, resource usage, and business KPIs. +3. **Traces** - Records that track the journey of requests as they flow through distributed systems. +These help understand the relationships between services and identify performance bottlenecks. + +These pillars enable teams to monitor, troubleshoot, and optimize their systems. +However, the real power comes from understanding how to effectively collect, +analyze, and correlate data across all three pillars to gain meaningful insights +into system behavior. + +## The benefits of observability {#the-benefits-of-observability} + +While the technical aspects of observability - logs, metrics, and traces - are +well understood, the business benefits are equally important to consider. + +In their book ["Observability Engineering"](https://clickhouse.com/engineering-resources/observability#:~:text=Observability%20Engineering) +(O'Reilly, 2022), Charity Majors, Liz Fong-Jones, and George Miranda draw from +industry research and anecdotal feedback to identify four key business benefits +that organizations can expect from implementing proper observability practices. +Let's examine these benefits: + +### Higher incremental revenue {#higher-incremental-revenue} + +The authors note that observability tools that help teams improve uptime and +performance can lead to increased incremental revenue through improved code quality. +This manifests in several ways: + +1. Improved customer experience: Fast problem resolution and prevention of service +degradation leads to higher customer satisfaction and retention +2. Increased system reliability: Better uptime means more successful transactions +and fewer lost business opportunities +3. Enhanced performance: The ability to identify and optimize performance bottlenecks +helps maintain responsive services that keep customers engaged +4. 
Competitive advantage: Organizations that can maintain high service quality
+through comprehensive monitoring and quick issue resolution often gain an edge
+over competitors
+
+### Cost savings from faster incident response {#cost-savings-from-faster-incident-response}
+
+One of the most immediate benefits of observability is reduced labor costs
+through faster detection and resolution of issues. This comes from:
+
+* Reduced Mean Time to Detect (MTTD) and Mean Time to Resolve (MTTR)
+* Improved query response times, enabling faster investigation
+* Quicker identification of performance bottlenecks
+* Reduced time spent on-call
+* Fewer resources wasted on unnecessary rollbacks
+
+We see this in practice - trip.com built their observability system with ClickHouse
+and achieved query speeds 4-30x faster than their previous solution, with 90% of
+queries completing in under 300ms, enabling rapid issue investigation.
+
+### Cost savings from incidents avoided {#cost-savings-from-incidents-avoided}
+
+Observability doesn't just help resolve issues faster - it helps prevent them entirely.
+The authors emphasize how teams can prevent critical issues by:
+
+* Identifying potential problems before they become critical
+* Analyzing patterns to prevent recurring issues
+* Understanding system behavior under different conditions
+* Proactively addressing performance bottlenecks
+* Making data-driven decisions about system improvements
+
+ClickHouse's [own observability platform, LogHouse](https://clickhouse.com/blog/building-a-logging-platform-with-clickhouse-and-saving-millions-over-datadog),
+demonstrates this. It enables our core engineers to search historical patterns across all clusters, helping prevent
+recurring issues.
+
+### Cost savings from decreased employee churn {#cost-savings-from-decreased-employee-churn}
+
+One of the most overlooked benefits is the impact on team satisfaction and retention.
+The authors highlight how observability leads to:
+
+* Improved job satisfaction through better tooling
+* Decreased developer burnout from fewer unresolved issues
+* Reduced alert fatigue through better signal-to-noise ratio
+* Lower on-call stress due to better incident management
+* Increased team confidence in system reliability
+
+We see this in practice - when [Fastly migrated to ClickHouse](https://clickhouse.com/videos/scaling-graphite-with-clickhouse),
+their engineers were amazed by the improvement in query performance, noting:
+
+> "I couldn't believe it. I actually had to go back a couple of times just to
+> make sure that I was querying it properly... this is coming back too fast.
+> This doesn't make sense."
+
+As the authors emphasize, while the specific measures of these benefits may vary
+depending on the tools and implementation, these fundamental improvements can be
+expected across organizations that adopt robust observability practices. The key
+is choosing and implementing the right tools effectively to maximize these benefits.
+
+Achieving these benefits requires overcoming several significant hurdles. Even
+organizations that understand the value of observability often find that
+implementation presents unexpected complexities and challenges that must be
+carefully navigated. 
+ +## Challenges in implementing observability {#challenges-in-implementing-observability} + +Implementing observability within an organization is a transformative step toward +gaining deeper insights into system performance and reliability. However, this +journey is not without its challenges. As organizations strive to harness the +full potential of observability, they encounter various obstacles that can impede +progress. Let’s go through some of them. + +### Data volume and scalability {#data-volume-and-scalability} + +One of the primary hurdles in implementing observability is managing the sheer +volume and scalability of telemetry data generated by modern systems. As +organizations grow, so does the data they need to monitor, necessitating +solutions that efficiently handle large-scale data ingestion and +real-time analytics. + +### Integration with existing systems {#integration-with-existing-systems} + +Integration with existing systems poses another significant challenge. Many +organizations operate in heterogeneous environments with diverse technologies, +making it essential for observability tools to seamlessly integrate with current +infrastructure. Open standards are crucial in facilitating this integration, +ensuring interoperability and reducing the complexity of deploying observability +solutions across varied tech stacks. + +### Skill gaps {#skill-gaps} + +Skill gaps can also impede the successful implementation of observability. The +transition to advanced observability solutions often requires specialized +knowledge of data analytics and specific tools. Teams may need to invest in +training or hiring to bridge these gaps and fully leverage the capabilities of +their observability platforms. + +### Cost management {#cost-management} + +Cost management is critical, as observability solutions can become expensive, +particularly at scale. Organizations must balance the costs of these tools with +the value they provide, seeking cost-effective solutions that offer significant +savings compared to traditional approaches. + +### Data retention and storage {#data-retention-and-storage} + +Data retention and storage management present additional challenges. Deciding +how long to retain observability data without compromising performance or +insights requires careful planning and efficient storage solutions that reduce +storage requirements while maintaining data accessibility. + +### Standardization and vendor lock-in {#standardization-and-vendor-lock-in} + +Ensuring standardization and avoiding vendor lock-in are vital for maintaining +flexibility and adaptability in observability solutions. By adhering to open +standards, organizations can prevent being tied to specific vendors and ensure +their observability stack can evolve with their needs. + +### Security and compliance {#security-and-compliance} + +Security and compliance considerations remain crucial, especially when handling +sensitive data within observability systems. Organizations must ensure that their +observability solutions adhere to relevant regulations and effectively protect +sensitive information. + +These challenges underscore the importance of strategic planning and informed +decision-making in implementing observability solutions that effectively meet +organizational needs. + +To address these challenges, organizations need a well-structured approach to +implementing observability. 
The standard observability pipeline has evolved to +provide a framework for effectively collecting, processing, and analyzing +telemetry data. One of the earliest and most influential examples of this +evolution comes from Twitter's experience in 2013. diff --git a/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md b/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md new file mode 100644 index 00000000000..6deabf4a0a3 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/03_data_lake_and_warehouse.md @@ -0,0 +1,119 @@ +--- +slug: /cloud/get-started/cloud/use-cases/data_lake_and_warehouse +title: 'Data Lakehouse' +description: 'Build modern data lakehouse architectures with ClickHouse Cloud combining the flexibility of data lakes with database performance' +keywords: ['use cases', 'data lake and warehouse'] +sidebar_label: 'Data Lakehouse' +--- + +import Image from '@theme/IdealImage'; +import datalakehouse_01 from '@site/static/images/cloud/onboard/discover/use_cases/datalakehouse_01.png'; + + + +The data lakehouse is a convergent architecture that applies database principles +to data lake infrastructure while maintaining the flexibility and scale of cloud storage systems. + +The lakehouse is not just taking a database apart but building database-like +capabilities onto a fundamentally different foundation (cloud object storage) +that focuses on supporting traditional analytics and modern AI/ML workloads in +a unified platform. + +## What are the components of the data lakehouse? {#components-of-the-data-lakehouse} + +The modern data lakehouse architecture represents a convergence of data warehouse +and data lake technologies, combining the best aspects of both approaches. This +architecture comprises several distinct but interconnected layers providing a +flexible, robust data storage, management, and analysis platform. + +Understanding these components is essential for organizations looking to +implement or optimize their data lakehouse strategy. The layered approach allows +for component substitution and independent evolution of each layer, providing +architectural flexibility and future-proofing. + +Let's explore the core building blocks of a typical data lakehouse architecture +and how they interact to create a cohesive data management platform. + +Components of the data lakehouse + +| Component | Description | +|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| **Data sources** | Lakehouse data sources include operational databases, streaming platforms, IoT devices, application logs, and external providers. | +| **Query engine** | Processes analytical queries against the data stored in the object storage, leveraging the metadata and optimizations provided by the table format layer. Supports SQL and potentially other query languages to analyze large volumes of data efficiently. | +| **Metadata catalog** | The [data catalog](https://clickhouse.com/engineering-resources/data-catalog) acts as a central repository for metadata, storing and managing table definitions and schemas, partitioning information, and access control policies. Enables data discovery, lineage tracking, and governance across the lakehouse. 
| +| **Table format layer** | The [table format layer](https://clickhouse.com/engineering-resources/open-table-formats) manages the logical organization of data files into tables, providing database-like features such as ACID transactions, schema enforcement and evolution, time travel capabilities, and performance optimizations like data skipping and clustering. | +| **Object storage** | This layer provides scalable, durable, cost-effective storage for all data files and metadata. It handles the physical persistence of data in an open format, enabling direct access from multiple tools and systems. | +| **Client applications** | Various tools and applications that connect to the lakehouse to query data, visualize insights, or build data products. These can include BI tools, data science notebooks, custom applications, and ETL/ELT tools. | + +## What are the benefits of the data lakehouse? {#benefits-of-the-data-lakehouse} + +The data lakehouse architecture offers several significant advantages when compared +directly to both traditional data warehouses and data lakes: + +### Compared to traditional data warehouses {#compared-to-traditional-data-warehouses} + +| # | Benefit | Description | +|---|--------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | **Cost efficiency** | Lakehouses leverage inexpensive object storage rather than proprietary storage formats, significantly reducing storage costs compared to data warehouses that charge premium prices for their integrated storage. | +| 2 | **Component flexibility and interchangeability** | The lakehouse architecture allows organizations to substitute different components. Traditional systems require wholesale replacement when requirements change or technology advances, while lakehouses enable incremental evolution by swapping out individual components like query engines or table formats. This flexibility reduces vendor lock-in and allows organizations to adapt to changing needs without disruptive migrations. | +| 3 | **Open format support** | Lakehouses store data in open file formats like Parquet, allowing direct access from various tools without vendor lock-in, unlike proprietary data warehouse formats that restrict access to their ecosystem. | +| 4 | **AI/ML integration** | Lakehouses provide direct access to data for machine learning frameworks and Python/R libraries, whereas data warehouses typically require extracting data before using it for advanced analytics. | +| 5 | **Independent scaling** | Lakehouses separate storage from compute, allowing each to scale independently based on actual needs, unlike many data warehouses, where they scale together. 
| + +### Compared to data lakes {#compared-to-data-lakes} + +| # | Benefit | Description | +|---|-----------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| 1 | **Query performance** | Lakehouses implement indexing, statistics, and data layout optimizations that enable SQL queries to run at speeds comparable to data warehouses, overcoming the poor performance of raw data lakes. | +| 2 | **Data consistency** | Through ACID transaction support, lakehouses ensure consistency during concurrent operations, solving a major limitation of traditional data lakes, where file conflicts can corrupt data. | +| 3 | **Schema management** | Lakehouses enforce schema validation and track schema evolution, preventing the "data swamp" problem common in data lakes where data becomes unusable due to schema inconsistencies. | +| 4 | **Governance capabilities** | Lakehouses provide fine-grained access control and auditing features at row/column levels, addressing the limited security controls in basic data lakes. | +| 5 | **BI Tool support** | Lakehouses offer SQL interfaces and optimizations that make them compatible with standard BI tools, unlike raw data lakes that require additional processing layers before visualization. | + +## Where does ClickHouse fit in the data lakehouse architecture? {#where-does-clickhouse-fit-in-the-data-lakehouse-architecture} + +ClickHouse is a powerful analytical query engine within the modern data lakehouse +ecosystem. It offers organizations a high-performance option for analyzing data +at scale. ClickHouse is a compelling choice due to its exceptional query speed and +efficiency. + +Within the lakehouse architecture, ClickHouse functions as a specialized +processing layer that can flexibly interact with the underlying data. It can +directly query Parquet files stored in cloud object storage systems like S3, +Azure Blob Storage, or Google Cloud Storage, leveraging its optimized columnar +processing capabilities to deliver rapid results even on massive datasets. +This direct query capability allows organizations to analyze their lake data +without complex data movement or transformation processes. + +ClickHouse integrates with open table formats such as Apache Iceberg, Delta Lake, +or Apache Hudi for more sophisticated data management needs. This integration +enables ClickHouse to take advantage of these formats' advanced features, while +still delivering the exceptional query performance it's known for. Organizations +can integrate these table formats directly or connect through metadata catalogs +like AWS Glue, Unity, or other catalog services. + +By incorporating ClickHouse as a query engine in their lakehouse architecture, +organizations can run lightning-fast analytical queries against their data lake +while maintaining the flexibility and openness that define the lakehouse approach. +This combination delivers the performance characteristics of a specialized +analytical database without sacrificing the core benefits of the lakehouse model, +including component interchangeability, open formats, and unified data management. + +## Hybrid architecture: The best of both worlds {#hybrid-architecture-the-best-of-both-worlds} + +While ClickHouse excels at querying lakehouse components, its highly optimized +storage engine offers an additional advantage. 
For use cases demanding ultra-low
+latency queries - such as real-time dashboards, operational analytics, or
+interactive user experiences - organizations can selectively store
+performance-critical data directly in ClickHouse's native format. This hybrid
+approach delivers the best of both worlds: the unmatched query speed of
+ClickHouse's specialized storage for time-sensitive analytics and the flexibility
+to query the broader data lakehouse when needed.
+
+This dual capability allows organizations to implement tiered data strategies
+where hot, frequently accessed data resides in ClickHouse's optimized storage
+for sub-second query responses, while maintaining seamless access to the complete
+data history in the lakehouse. Teams can make architectural decisions based on
+performance requirements rather than technical limitations, using ClickHouse as
+a lightning-fast analytical database for critical workloads and a flexible query
+engine for the broader data ecosystem.
diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md
new file mode 100644
index 00000000000..8a10c3c65c5
--- /dev/null
+++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/01_overview.md
@@ -0,0 +1,101 @@
+---
+slug: /cloud/get-started/cloud/use-cases/AI_ML
+title: 'Machine learning and generative AI'
+description: 'Learn how ClickHouse Cloud powers machine learning and generative AI applications with high-performance data processing and analytics'
+keywords: ['use cases', 'Machine Learning', 'Generative AI']
+sidebar_label: 'Overview'
+---
+
+
+
+## The rapidly evolving data landscape for Machine Learning and Generative AI {#the-rapidly-evolving-data-landscape-for-machine-learning-and-generative-ai}
+
+Rapid advancements in Machine Learning and Generative AI are completely reshaping
+how business and society operate, driving an ever-increasing demand for data on
+an unparalleled scale.
+At the time of writing, language training dataset size is growing on average 3.7x
+per year, while it is projected that the largest training run will use all
+public human-generated text by 2028. At the same time, users of these applications
+increasingly expect real-time performance, and the success of AI- and ML-driven
+insights, such as personalized recommendations, accurate forecasting, and chatbots,
+hinges on the ability to handle massive datasets in real time. Against the backdrop
+of these changes, traditional data architectures often face significant challenges
+when it comes to meeting the scale and real-time requirements that modern AI/ML
+workloads demand.
+
+## Challenges of traditional data stacks for AI/ML workloads {#challenges-of-traditional-data-stacks}
+
+Traditional database systems are often not designed for the massive analytical
+workloads and complex queries inherent in modern ML and GenAI applications.
+They frequently become bottlenecks as data volume grows and query complexity
+increases, hindering the rapid processing required for AI. In addition to this,
+machine learning architectures can become fragmented and challenging to handle
+due to a proliferation of specialized tools and components, which often leads to
+higher learning curves, increased points of failure, and escalating expenses. 
+
+Real-time processing for ML faces significant challenges, including dealing with
+the sheer volume and velocity of incoming data, minimizing latency and response
+times, and continuously addressing issues like model drift and ensuring data
+quality. These systems, designed for structured data at much smaller scales, can
+take days or weeks to process terabytes or petabytes of data. They become not only
+a performance bottleneck but also a cost bottleneck, often relying on expensive,
+tightly coupled storage that does not scale cost-effectively.
+
+## ClickHouse as a foundation for real-time AI/ML {#clickhouse-for-real-time-ai-ml}
+
+ClickHouse was designed and built from the ground up to tackle data at scale in
+real time. As such, it is ideally positioned for handling the requirements of
+today's AI and ML applications. Several core features enable it to ingest,
+process, and query datasets on the petabyte scale with real-time performance:
+
+| Feature | Description |
+|----------------------------------------|-------------|
+| **Columnar Storage** | ClickHouse utilizes a columnar storage model. This means that data from each column of an inserted row is stored together on disk, which enables significantly more efficient compression and boosts query speed by allowing the system to read only the relevant columns required for a query, which drastically reduces disk I/O. This is particularly advantageous for analytical queries common in ML/GenAI that often involve aggregations or filtering on a subset of columns. |
+| **High Performance** | ClickHouse is known for its lightning-fast query processing, capable of querying billions of rows in milliseconds. It achieves this through a fully parallelized query pipeline and vectorized query execution engine, which processes multiple rows simultaneously at the CPU level, maximizing efficiency. |
+| **Scalability** | Designed for horizontal scalability, ClickHouse allows users to add more servers (nodes) to a cluster to handle increasing data volumes and query loads, distributing data and queries across them. Performance scales linearly with the addition of each new server, enabling it to easily handle petabytes of data. |
+| **Real-time data ingestion** | It is built for continuous data ingestion, supporting high rates of inserts and merges (billions of rows per second, gigabytes per second) without disrupting ongoing queries or analytics. This capability is crucial for environments where data arrives in a constant stream, such as from IoT devices or application logs, ensuring that ML models are fueled with the most up-to-date information. |
+| **Specialized data types & functions** | In addition to standard SQL data types, syntax, and functions, ClickHouse offers a host of additional specialized data types and functions suited for ML use cases. 
Some of these include Array functions which natively support vector operations, distance calculations, array manipulations; Native JSON support for efficient processing of semi-structured data common to ML feature stores; Approximate algorithms like HyperLogLog, quantiles, and sampling functions for large-scale statistical analysis or numeric indexed vectors for vector aggregation and pointwise operations. | +| **Extensive integration ecosystem** | ClickHouse's extensive integration ecosystem makes it exceptionally valuable for AI/ML applications by seamlessly connecting with every critical component of the ML toolchain—from Python/pandas and Jupyter for data science workflows, to Spark and Kafka for large-scale data processing, to Airflow for pipeline orchestration, and Grafana for model monitoring—eliminating the typical friction and data movement bottlenecks that plague multi-tool ML environments. | + +## How ClickHouse helps simplify the AI/ML Data Stack {#simplify-the-ai-ml-data-stack} + +ClickHouse streamlines the traditionally fragmented AI/ML data infrastructure +by serving as a unified platform that handles multiple data management +functions within a single high-performance system. Rather than maintaining +separate specialized data stores for different ML tasks, ClickHouse provides +a consolidated foundation for analytics, machine learning workloads, and +data preparation and exploration. + +ClickHouse natively integrates with object storage like S3, GCP and Azure. It +integrates with data lakes, enabling direct querying of data in popular formats +like Iceberg, Delta Lake, and Hudi, positioning it as a comprehensive access and +computation layer for ML operations. This unified approach tackles challenges +faced in MLOps by reducing the complexity that typically stems from managing +multiple systems. + +Data fragmentation across separate stores creates many operational pain +points such as escalating costs, increased failure risks, and the need for +duplicate transformation logic between training and inference pipelines. +ClickHouse addresses these issues by consolidating all of this functionality +into a single system, particularly for feature engineering where consistency +between offline training and online serving is critical. + +Through its integration with data catalogs including Unity, AWS Glue, Polaris, +and Hive Metastore, ClickHouse minimizes data movement and duplication. This +architectural approach ensures that feature definitions remain consistent +across models and experiments, reducing the risk of discrepancies that can +undermine model performance. For MLOps teams, this +translates to less time managing infrastructure complexity and more focus on +core activities like model development and deployment, ultimately accelerating +the ML lifecycle while improving the economic viability of AI initiatives at +scale. + +## ClickHouse across the AI/ML Lifecycle {#clickhouse-across-the-ai-ml-lifecycle} + +ClickHouse's capabilities span the entire AI/ML lifecycle, providing a robust and +efficient platform from the very first stages of data preparation all the way to +model deployment and monitoring. 
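+
+As a concrete, hypothetical illustration of the direct data access described above,
+training data kept as Parquet files in object storage can be explored in place,
+without first copying it into ClickHouse (the bucket, path, and column names below
+are placeholders, and credentials are omitted):
+
+```sql
+-- Profile a label distribution directly over Parquet files in S3
+SELECT
+    label,
+    count() AS examples,
+    avg(feature_1) AS avg_feature_1
+FROM s3('https://my-bucket.s3.amazonaws.com/training-data/*.parquet', 'Parquet')
+GROUP BY label
+ORDER BY examples DESC;
+```
+
+The pages below look at specific stages of this lifecycle in more detail.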
+ +| Area | Description | +|----------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------| +| [Data preparation and feature engineering](/cloud/get-started/cloud/use-cases/AI_ML/feature_engineering) | Learn how ClickHouse is used in the data preparation and feature engineering stages of the AI/ML pipeline | +| [Agent-facing analytics](/cloud/get-started/cloud/use-cases/AI_ML/agent_facing_analytics) | Learn how ClickHouse enables agentic facing analytics | diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md new file mode 100644 index 00000000000..5e9caf86ea3 --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/02_data_prep_feature_engineering.md @@ -0,0 +1,241 @@ +--- +slug: /cloud/get-started/cloud/use-cases/AI_ML/feature_engineering +title: 'Data preparation and feature engineering' +description: 'Use ClickHouse Cloud for efficient data preparation and feature engineering in machine learning and AI workflows' +keywords: ['use cases', 'Machine Learning', 'Generative AI'] +sidebar_label: 'Data preparation and feature engineering' +--- + +import Image from '@theme/IdealImage'; +import ml_ai_01 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_01.png'; +import ml_ai_02 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_02.png'; +import ml_ai_03 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_03.png'; +import ml_ai_04 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_04.png'; + +## Data preparation and feature engineering {#data-preparation-and-feature-engineering} + +Data preparation bridges raw data and effective machine learning or AI +models, typically consuming the majority of time in AI/ML projects and +directly determining model success. It sits between initial data collection +and model development in the lifecycle, transforming messy, inconsistent +real-world data into clean, structured formats that algorithms can +effectively learn from. `clickhouse-local`, `chDB` (an in-process version +of ClickHouse for Python), open-source ClickHouse server or ClickHouse Cloud +allow developers and data scientists to work with ever-growing amounts of +data interactively and efficiently for ad-hoc querying, data cleaning, and +feature engineering. + +### What is a feature store? {#what-is-a-feature-store} + +In its simplest form, a feature store is a centralized repository for storing +and managing feature data and acting as the source of truth. By providing +APIs that allow the storage, versioning, and retrieval of features, feature +stores aim to provide a consistent view of features for training and +inference from development to production environments. Whether a custom-built +in-house solution or off-the-shelf product, actual product-level features +provided by a feature store will vary, with some providing a complete data +platform capable of aggregating data into features and even providing a +compute engine for the training of models. + +Irrespective of how many capabilities are inherent to the feature store, all +provide abstractions to the underlying data with which data scientists and +engineers will be familiar. 
As well as delivering data as versioned +entities, features, and classes, most expose concepts of feature groups, +training sets, batching, streaming, and point-in-time queries (such as the +ability to identify the values for a feature at either a specific point, +e.g. the latest value). + +Feature store + +### Why might you use one? {#why-use-one} + +In theory, a feature store ties disparate systems and capabilities together to +form a complete ML data layer, capable of both acting as the source of truth for +training data and also being used to provide context when predictions are being +made. + +While the exact capabilities they provide vary, the objectives remain the same: + +- **improve collaboration and reusability** between data scientists and data +engineers by centralizing features and their transformation logic +- **reduce model iteration time** during both experimentation and deployment by +allowing feature re-use at both training and inference time +- **governance and compliance** through rules and versioning which can restrict +model access to sensitive data (and features) +- **improve model performance and reliability** by abstracting the complexity of +data engineering from data scientists and ensuring they work with only quality +consistent features delivered through an API. + +While these represent a very high-level overview of some of the problems a +feature store solves, the predominant benefit here is the ability to share +features across teams and utilize the same data for training and inference. + +Feature stores also address a number of other challenges present in MLOps, +such as how to backfill feature data, handle incremental updates to the +source data (to update features), or monitor new data for drift. More +recently, they have also integrated vector databases to act as the +orchestration layer for RAG pipelines or to help find similar features +using embeddings - a useful capability during some model training. + +### Components of a feature store {#components-of-a-feature-store} + +Before we explore how ClickHouse might fit into a feature store, understanding +the common components is helpful for context. Typically, a feature store will +consist of up to 4 main components: + +Components of a feature store + +- **Data source** - While this can be as simple as a CSV file, it is often a +database or data lake with files in a format like Iceberg and accessible +through a query engine. + +- **Transformation engine (optional)** - Raw data needs to be transformed into +features. In a simple case, a feature can be correlated with a column's +values. More likely, it is the result of a transformation process involving +joins, aggregations, and expressions changing the structure and/or type of +column values. Some feature stores (see Types of Feature Store) might +provide built-in capabilities to achieve this; others may offload the work +to local Python functions or, for larger datasets, the database (maybe even +using dbt under the hood) via materializations, or a processing engine such +as Spark. With ClickHouse, this is achievable through Materialized Views. +Features that are continuously subject to update often require some form of +streaming pipeline, typically implemented with tooling such as Flink or +Spark Streaming. Normally, some form of directed acyclic graph (DAG) is +required, if these transformations are chained, and dependencies need to be +tracked. 
+
+- **Offline (Training) Store** - The offline store holds the features
+resulting from the previous transformation pipeline. These features are
+typically grouped as entities and associated with a label (the target
+prediction). Usually, models need to consume these features selectively,
+either iteratively or through aggregations, potentially multiple times and
+in random order. Models often require more than one feature, requiring
+features to be grouped together in a "feature group" - usually by an entity
+ID and time dimension. This requires the offline store to be able to deliver
+the correct version of a feature and label for a specific point in time.
+This "point-in-time correctness" is often fundamental to models, which need
+to be trained incrementally.
+
+- **Online (Inference) Store** - Once a model has been trained, it can be
+deployed and used for making predictions. This inference process requires
+information that is only available at the moment of prediction, e.g. the
+user's ID for a transaction. However, it can also require features for the
+prediction, which may be precomputed, e.g. features representing historical
+purchases. These are often too expensive to compute at inference time, even
+for ClickHouse. These features need to be served in latency-sensitive
+situations, based on the most recent version of the data, especially in
+scenarios where predictions need to be made in real time, such as fraud
+detection. Features may be materialized from the offline store to the online
+store for serving.
+
+### Feature stores and ClickHouse {#feature-stores-and-clickhouse}
+
+As a real-time data warehouse, ClickHouse can fulfill the role of a number
+of the components - potentially significantly simplifying the feature store
+architecture.
+
+Feature stores and ClickHouse
+
+Specifically, ClickHouse can act as a:
+
+- **Data source** - With the ability to query or ingest data in over 70
+different file formats, including data lake formats such as Iceberg and
+Delta Lake, ClickHouse makes an ideal long-term store for holding or querying
+data. By separating storage and compute using object storage, ClickHouse
+Cloud additionally allows data to be held indefinitely - with compute scaled
+down or made completely idle to minimize costs. Flexible codecs, coupled
+with column-oriented storage and ordering of data on disk, maximize
+compression rates, thus minimizing the required storage. Users can easily
+combine ClickHouse with data lakes, with built-in functions to query data in
+place on object storage.
+
+- **Transformation engine** - SQL provides a natural means of declaring data
+  transformations. When extended with ClickHouse's analytical and statistical
+  functions, these transformations become succinct and optimized. As well as
+  applying to ClickHouse tables when ClickHouse is used as a data store,
+  table functions allow SQL queries to be written against data
+  stored in formats such as Parquet, on-disk or object storage, or even other
+  data stores such as Postgres and MySQL. A completely parallelized query
+  execution engine, combined with a column-oriented storage format, allows
+  ClickHouse to perform aggregations over PBs of data in seconds - unlike
+  transformations on in-memory data frames, users are not memory-bound.
+  Furthermore, materialized views allow data to be transformed at insert time,
+  thus shifting compute from query time to data load time. 
These views can
+  exploit the same range of analytical and statistical functions ideal for
+  data analysis and summarization. Should any of ClickHouse's existing
+  analytical functions be insufficient or custom libraries need to be
+  integrated, users can also utilize User Defined Functions (UDFs).
+
+  While users can transform data directly in ClickHouse or prior to insertion
+  using SQL queries, ClickHouse can also be used in programming environments
+  such as Python via chDB. This allows embedded ClickHouse to be exposed as a
+  Python module and used to transform and manipulate large data frames within
+  notebooks. This allows transformation work to be performed client-side by
+  data engineers, with results potentially materialized as feature tables in
+  a centralized ClickHouse instance.
+
+- **Offline store** - With the above capabilities to read data from multiple
+  sources and apply transformations via SQL, the results of these queries can
+  also be persisted in ClickHouse via `INSERT INTO SELECT` statements. With
+  transformations often grouped by an entity ID and returning a number of
+  columns as results, ClickHouse's schema inference can automatically detect
+  the required types from these results and produce an appropriate table
+  schema to store them. Functions for generating random numbers and
+  statistical sampling allow data to be efficiently iterated and scaled at
+  millions of rows per second for feeding to model training pipelines.
+
+  Often, features are represented in tables with a timestamp indicating the
+  value for an entity and feature at a specific point in time. As described
+  earlier, training pipelines often need the state of features at specific
+  points in time and in groups. ClickHouse's sparse indices allow fast
+  filtering of data to satisfy point-in-time queries and feature selection
+  filters. While other technologies such as Spark, Redshift, and BigQuery
+  rely on slow stateful windowed approaches to identify the state of features
+  at a specific point in time, ClickHouse supports the `ASOF` (as-of-this-time)
+  `LEFT JOIN` query and `argMax` function. As well as simplifying syntax, this
+  approach is highly performant on large datasets through the use of a sort
+  and merge algorithm. This allows feature groups to be built quickly,
+  reducing data preparation time prior to training.
+
+  ClickHouse as an offline store
+
+- **Online store** - As a real-time analytics database, ClickHouse can serve highly
+  concurrent query workloads at low latency. While this typically requires data to be
+  denormalized, this aligns with the storage of feature groups used at both training
+  and inference time. Importantly, ClickHouse is able to deliver this query
+  performance while being subject to high write workloads thanks to its log-structured
+  merge tree. These properties are required in an online store to keep features
+  up-to-date. Since the features are already available within the offline store,
+  they can easily be materialized to new tables within either the same ClickHouse
+  cluster or a different instance via existing capabilities, e.g., [`remoteSecure`](/sql-reference/table-functions/remote#parameters).
+
+  :::note
+  For use cases requiring very high request concurrency, i.e., thousands of requests per second,
+  and very low latency, we recommend users still consider a dedicated data store,
+  e.g., Redis, designed for these workloads.
+  :::
+
+- **Vector database** - ClickHouse has built-in support for vector embeddings
+  through floating point arrays. 
These can be searched and compared through + [distance functions](https://clickhouse.com/docs/en/sql-reference/functions/distance-functions#cosinedistance), + allowing ClickHouse to be used as a vector database. This linear comparison can + be easily scaled and parallelized for larger datasets. Additionally, ClickHouse + has maturing support for [Approximate Nearest Neighbour (ANN)](https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/annindexes) + indices, as well as [hyperplane indexes using pure-SQL](https://clickhouse.com/blog/approximate-nearest-neighbour-ann-with-sql-powered-local-sensitive-hashing-lsh-random-projections), + as required for larger vector datasets. + +By satisfying each of the above roles, ClickHouse can dramatically simplify +the feature store architecture. Aside from the simplification of operations, +this architecture allows features to be built and deployed faster. A single +instance of ClickHouse can be scaled vertically to handle PBs of data, with +additional instances simply added for high availability. This minimizes the +movement of data between data stores, minimizing the typical network +bottlenecks. ClickHouse Cloud expands on this further by storing only a +single copy of the data in object storage and allowing nodes to be scaled +vertically or horizontally dynamically in response to load as required. + +The above architecture still requires several key components not satisfied +by ClickHouse: a streaming engine such as Kafka + Flink and a framework to +provide compute for model training. A means of hosting models is also +required. For simplicity, we assume the use of a cloud-hosted solution to +these, such as Confluent and Amazon SageMaker. diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md new file mode 100644 index 00000000000..518c78c700e --- /dev/null +++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/03_agent_facing_analytics.md @@ -0,0 +1,169 @@ +--- +slug: /cloud/get-started/cloud/use-cases/AI_ML/agent_facing_analytics +title: 'Agent facing analytics' +description: 'Build agent-facing analytics systems with ClickHouse Cloud for AI agents and autonomous systems requiring real-time data access' +keywords: ['use cases', 'Machine Learning', 'Generative AI', 'agent facing analytics', 'agents'] +sidebar_label: 'Agent facing analytics' +--- + +import Image from '@theme/IdealImage'; +import ml_ai_05 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_05.png'; +import ml_ai_06 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_06.png'; +import ml_ai_07 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_07.png'; +import ml_ai_08 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_08.png'; +import ml_ai_09 from '@site/static/images/cloud/onboard/discover/use_cases/ml_ai_09.png'; + +## Agent-facing analytics concepts {#agent-facing-analytics} + +### What are "agents"? {#agents} + +One can think of AI agents as digital assistants that have evolved beyond +simple task execution (or function calling): they can understand context, +make decisions, and take meaningful actions toward specific goals. They +operate in a "sense-think-act" loop (see ReAct agents), processing various +inputs (text, media, data), analyzing situations, and then doing something +useful with that information. 
Most importantly, depending on the application
+domain, they can theoretically operate at various levels of autonomy,
+with or without human supervision.
+
+The game changer here has been the advent of Large Language Models (LLMs).
+While the notion of AI agents has been around for quite a while, LLMs like the GPT
+series have given them a massive upgrade in their ability to "understand"
+and communicate. It's as if they've suddenly become more fluent in "human",
+that is, able to grasp requests and respond with relevant contextual information
+drawn from the model's training.
+
+### AI agents' superpowers: “Tools” {#tools}
+
+These agents really shine through their access to “tools”. Tools enhance AI agents
+by giving them abilities to perform tasks. Rather than just being conversational
+interfaces, they can now get things done, whether it's crunching numbers, searching
+for information, or managing customer communications. Think of it as the difference
+between having someone who can describe how to solve a problem and someone who
+can actually solve it.
+
+For example, ChatGPT is now shipped by default with a search tool. This
+integration with search providers allows the model to pull current information
+from the web during conversations. This means it can fact-check responses, access
+recent events and data, and provide up-to-date information rather than relying
+solely on its training data.
+
+Agents equipped with tools
+
+Tools can also be used to simplify the implementation of Retrieval-Augmented
+Generation (RAG) pipelines. Instead of relying only on what an AI model
+learned during training, RAG lets the model pull in relevant information
+before formulating a response. Here's an example: using an AI assistant to
+help with customer support (e.g. Salesforce AgentForce, ServiceNow AI
+Agents). Without RAG, it would only use its general training to answer
+questions. But with RAG, when a customer asks about the latest product
+feature, the system retrieves the most recent documentation, release notes,
+and historical support tickets before crafting its response. This means that
+answers are now grounded in the latest information available to the AI
+model.
+
+### Reasoning models {#reasoning-models}
+
+Another development in the AI space, and perhaps one of the most
+interesting, is the emergence of reasoning models. Systems like OpenAI o1,
+Anthropic Claude, or DeepSeek-R1 take a more methodical approach by
+introducing a "thinking" step before responding to a prompt. Instead of
+generating the answer straightaway, reasoning models use prompting
+techniques like Chain-of-Thought (CoT) to analyze problems from multiple
+angles, break them down into steps, and use the tools available to them to
+gather contextual information when needed.
+
+This represents a shift toward more capable systems that can handle more
+complex tasks through a combination of reasoning and practical tools. One of
+the latest examples in this area is the introduction of OpenAI's deep
+research, an agent that can autonomously conduct complex multi-step research
+tasks online. It processes and synthesizes information from various sources,
+including text, images, and PDFs, to generate comprehensive reports within five
+to thirty minutes, a task that would traditionally take a human several hours.
+
+Reasoning models
+
+## Real-time analytics for AI agents {#real-time-analytics-for-ai-agents}
+
+Let's take the case of an agentic AI assistant with access to a
+real-time analytics database containing the company's CRM data. 
When a user asks +about the latest (up-to-the-minute) sales trends, the AI assistant queries the +connected data source. It iteratively analyzes the data to identify meaningful +patterns and trends, such as month-over-month growth, seasonal variations, or +emerging product categories. Finally, it generates a natural language response +explaining key findings, often with supporting visualizations. When the main +interface is chat-based like in this case, performance matters since these +iterative explorations trigger a series of queries that can scan large amounts of +data to extract relevant insights. + +Some properties make real-time databases especially suitable for such +workloads. For example, real-time analytics databases are designed to work +with near real-time data, allowing them to process and deliver insights +almost immediately as new data arrives. This is crucial for AI agents, as +they can require up-to-date information to make (or help make) timely and +relevant decisions. + +The core analytical capabilities are also important. Real-time analytics +databases shine in performing complex aggregations and pattern detection +across large datasets. Unlike operational databases focusing primarily on +raw data storage or retrieval, these systems are optimized for analyzing +vast amounts of information. This makes them particularly well-suited for AI +agents that need to uncover trends, detect anomalies, and derive actionable +insights. + +Real-time analytics databases are also expected to deliver fast +performance for interactive querying, essential for chat-based interaction +and high-frequency explorative workloads. They ensure consistent performance +even with large data volumes and high query concurrency, enabling responsive +dialogues and a smoother user experience. + +Finally, real-time analytics databases often serve as the ultimate "data +sinks" effectively consolidating valuable domain-specific data in a single +location. By co-locating essential data across different sources and formats +under the same tent, these databases ensure that AI agents have access to a +unified view of the domain information, decoupled from operational systems. + +Classic real-time analytics + +Agent real-time analytics + +These properties already empower real-time databases to play a vital role +in serving AI data retrieval use cases at scale (e.g. OpenAI's acquisition +of Rockset). They can also enable AI agents to provide fast data-driven +responses while offloading the heavy computational work. + +It positions the real-time analytics database as a preferred "context +provider" for AI agents when it comes to insights. + +## AI agents as an emerging user persona {#ai-agents-as-an-emerging-user-persona} + +A useful way to think about AI agents leveraging real-time analytics databases +is to perceive them as a new category of users, or in product manager speak: +a user persona. + +Agents as an emerging user persona + +From the database perspective, we can expect a potentially unlimited number of +AI agents, concurrently running a large number of queries on behalf of users, +or in autonomy, to perform investigations, refine iterative research and insights, +and execute tasks. + +Over the years, real-time databases have had the time to adapt to human +interactive users, directly connected to the system or via a middleware +application layer. 
Classic persona examples include database administrators,
+business analysts, data scientists, or software developers building applications
+on top of the database. The industry has progressively learned their usage
+patterns and requirements and, organically, provided the interfaces, the operators,
+the UIs, the formats, the clients, and the performance to satisfy their various
+use cases.
+
+The question now becomes: are we ready to accommodate AI agent workloads?
+What specific features do we need to re-think or create from scratch for these
+usage patterns?
+
+ClickHouse is rapidly providing answers to some of these questions through a host
+of features aimed at delivering a feature-complete AI experience.
+
+## ClickHouse.ai {#clickhouse-ai}
+
+For more information about features coming soon to ClickHouse Cloud, see [ClickHouse.ai](https://clickhouse.com/clickhouse-ai/).
diff --git a/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json
new file mode 100644
index 00000000000..7b4415fff32
--- /dev/null
+++ b/docs/cloud/onboard/01_discover/02_use_cases/04_machine_learning_and_genAI/_category_.json
@@ -0,0 +1,6 @@
+{
+  "position": 2.5,
+  "label": "Machine Learning and GenAI",
+  "collapsible": true,
+  "collapsed": true
+}
\ No newline at end of file
diff --git a/docs/cloud/onboard/01_discover/02_use_cases/_category_.json b/docs/cloud/onboard/01_discover/02_use_cases/_category_.json
new file mode 100644
index 00000000000..70c6591bd01
--- /dev/null
+++ b/docs/cloud/onboard/01_discover/02_use_cases/_category_.json
@@ -0,0 +1,5 @@
+{
+  "label": "Use cases",
+  "collapsible": true,
+  "collapsed": true
+}
\ No newline at end of file
diff --git a/docs/integrations/migration/overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/01_overview.md
similarity index 60%
rename from docs/integrations/migration/overview.md
rename to docs/cloud/onboard/02_migrate/01_migration_guides/01_overview.md
index 46457d3c294..3a881aa4454 100644
--- a/docs/integrations/migration/overview.md
+++ b/docs/cloud/onboard/02_migrate/01_migration_guides/01_overview.md
@@ -25,9 +25,9 @@ description: 'Page describing the options available for migrating data into Clic
 
 There are several options for migrating data into ClickHouse Cloud, depending on where your data resides now:
 
-- [Self-managed to Cloud](./clickhouse-to-cloud.md): use the `remoteSecure` function to transfer data
-- [Another DBMS](./clickhouse-local-etl.md): use the [clickhouse-local] ETL tool along with the appropriate ClickHouse table function for your current DBMS
-- [Anywhere!](./etl-tool-to-clickhouse.md): use one of the many popular ETL/ELT tools that connect to all kinds of different data sources
-- [Object Storage](./object-storage-to-clickhouse.md): easily insert data from S3 into ClickHouse
+- [Self-managed to Cloud](/cloud/migration/clickhouse-to-cloud): use the `remoteSecure` function to transfer data
+- [Another DBMS](/cloud/migration/clickhouse-local): use the [clickhouse-local] ETL tool along with the appropriate ClickHouse table function for your current DBMS
+- [Anywhere!](/cloud/migration/etl-tool-to-clickhouse): use one of the many popular ETL/ELT tools that connect to all kinds of different data sources
+- [Object Storage](/integrations/migration/object-storage-to-clickhouse): easily insert data from S3 into ClickHouse
 
 In the example [Migrate from
Redshift](/integrations/data-ingestion/redshift/index.md), we present three different ways to migrate data to ClickHouse. diff --git a/docs/migrations/postgres/overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md similarity index 97% rename from docs/migrations/postgres/overview.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md index ca1d195b914..b8be25dcc58 100644 --- a/docs/migrations/postgres/overview.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/01_overview.md @@ -1,10 +1,13 @@ --- slug: /migrations/postgresql/overview -title: 'Migrating from PostgreSQL to ClickHouse' +title: 'Comparing PostgreSQL and ClickHouse' description: 'A guide to migrating from PostgreSQL to ClickHouse' keywords: ['postgres', 'postgresql', 'migrate', 'migration'] +sidebar_label: 'Overview' --- +# Comparing ClickHouse and PostgreSQL + ## Why use ClickHouse over Postgres? {#why-use-clickhouse-over-postgres} TLDR: Because ClickHouse is designed for fast analytics, specifically `GROUP BY` queries, as an OLAP database whereas Postgres is an OLTP database designed for transactional workloads. diff --git a/docs/migrations/postgres/appendix.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/appendix.md similarity index 100% rename from docs/migrations/postgres/appendix.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/appendix.md diff --git a/docs/migrations/postgres/index.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/index.md similarity index 90% rename from docs/migrations/postgres/index.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/index.md index e4052fd1ab9..35837f4d34c 100644 --- a/docs/migrations/postgres/index.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/index.md @@ -8,7 +8,7 @@ description: 'Landing page for the PostgreSQL migrations section' | Page | Description | |----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Overview](./overview.md) | Introduction page for this section | +| [Overview](/migrations/postgresql/overview) | Introduction page for this section | | [Connecting to PostgreSQL](/integrations/postgresql/connecting-to-postgresql) | This page covers the following options for integrating PostgreSQL with ClickHouse: ClickPipes, PeerDB, PostgreSQL table engine, MaterializedPostgreSQL database engine. | | [Migrating data](/migrations/postgresql/dataset) | Part 1 of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to efficiently carry out the migration with a real-time replication (CDC) approach. Many of the concepts covered are also applicable to manual bulk data transfers from PostgreSQL to ClickHouse. | |[Rewriting PostgreSQL Queries](/migrations/postgresql/rewriting-queries)|Part 2 of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to efficiently carry out the migration with a real-time replication (CDC) approach. 
Many of the concepts covered are also applicable to manual bulk data transfers from PostgreSQL to ClickHouse.| diff --git a/docs/migrations/postgres/dataset.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md similarity index 99% rename from docs/migrations/postgres/dataset.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md index 2574252e1da..a2260255e9b 100644 --- a/docs/migrations/postgres/dataset.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/01_migration_guide_part1.md @@ -4,6 +4,7 @@ title: 'Migrating data' description: 'Dataset example to migrate from PostgreSQL to ClickHouse' keywords: ['Postgres'] show_related_blogs: true +sidebar_label: 'Part 1' --- import postgres_stackoverflow_schema from '@site/static/images/migrations/postgres-stackoverflow-schema.png'; @@ -177,4 +178,4 @@ INSERT INTO stackoverflow.posts SELECT * FROM postgresql('', 'postgres', ' > A possible method to detect UPDATE operations when using query replication is using the [`XMIN` system column](https://www.postgresql.org/docs/9.1/ddl-system-columns.html) (transaction IDs) as a watermark - a change in this column is indicative of a change and therefore can be applied to the destination table. Users employing this approach should be aware that `XMIN` values can wrap around and comparisons require a full table scan, making tracking changes more complex. -[Click here for Part 2](./rewriting-queries.md) +[Click here for Part 2](/migrations/postgresql/rewriting-queries) diff --git a/docs/migrations/postgres/rewriting-queries.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md similarity index 99% rename from docs/migrations/postgres/rewriting-queries.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md index 451d1b37d9a..8866fb91cd9 100644 --- a/docs/migrations/postgres/rewriting-queries.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/02_migration_guide_part2.md @@ -3,6 +3,7 @@ slug: /migrations/postgresql/rewriting-queries title: 'Rewriting PostgreSQL Queries' keywords: ['postgres', 'postgresql', 'rewriting queries'] description: 'Part 2 of a guide on migrating from PostgreSQL to ClickHouse' +sidebar_label: 'Part 2' --- > This is **Part 2** of a guide on migrating from PostgreSQL to ClickHouse. Using a practical example, it demonstrates how to efficiently carry out the migration with a real-time replication (CDC) approach. Many of the concepts covered are also applicable to manual bulk data transfers from PostgreSQL to ClickHouse. 
@@ -269,4 +270,4 @@ LIMIT 5; Time: 116750.131 ms (01:56.750) ``` -[Click here for Part 3](./data-modeling-techniques.md) +[Click here for Part 3](/migrations/postgresql/data-modeling-techniques) diff --git a/docs/migrations/postgres/data-modeling-techniques.md b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md similarity index 99% rename from docs/migrations/postgres/data-modeling-techniques.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md index f864bd8fb3e..db4468289d8 100644 --- a/docs/migrations/postgres/data-modeling-techniques.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/03_migration_guide_part3.md @@ -1,9 +1,10 @@ --- slug: /migrations/postgresql/data-modeling-techniques title: 'Data modeling techniques' -description: 'Data modeling for migrating from PostgreSQL to ClickHouse' +description: 'Part 3 of a guide on migrating from PostgreSQL to ClickHouse' keywords: ['postgres', 'postgresql'] show_related_blogs: true +sidebar_label: 'Part 3' --- import postgres_b_tree from '@site/static/images/migrations/postgres-b-tree.png'; diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json new file mode 100644 index 00000000000..ad514aeb890 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/02_postgres/migration_guide/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Migration guide", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/migrations/bigquery/equivalent-concepts.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md similarity index 98% rename from docs/migrations/bigquery/equivalent-concepts.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md index ee330a0610c..729112ee81e 100644 --- a/docs/migrations/bigquery/equivalent-concepts.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/01_overview.md @@ -4,12 +4,13 @@ slug: /migrations/bigquery/biquery-vs-clickhouse-cloud description: 'How BigQuery differs from ClickHouse Cloud' keywords: ['BigQuery'] show_related_blogs: true +sidebar_label: 'Overview' --- import bigquery_1 from '@site/static/images/migrations/bigquery-1.png'; import Image from '@theme/IdealImage'; -# BigQuery vs ClickHouse Cloud: equivalent and different concepts +# Comparing ClickHouse Cloud and BigQuery ## Resource organization {#resource-organization} @@ -21,7 +22,7 @@ The way resources are organized in ClickHouse Cloud is similar to [BigQuery's re Similar to BigQuery, organizations are the root nodes in the ClickHouse cloud resource hierarchy. The first user you set up in your ClickHouse Cloud account is automatically assigned to an organization owned by the user. The user may invite additional users to the organization. -### BigQuery projects vs ClickHouse Cloud services {#bigquery-projects-vs-clickhouse-cloud-services} +### BigQuery Projects vs ClickHouse Cloud Services {#bigquery-projects-vs-clickhouse-cloud-services} Within organizations, you can create services loosely equivalent to BigQuery projects because stored data in ClickHouse Cloud is associated with a service. There are [several service types available](/cloud/manage/cloud-tiers) in ClickHouse Cloud. 
Each ClickHouse Cloud service is deployed in a specific region and includes: @@ -29,15 +30,15 @@ Within organizations, you can create services loosely equivalent to BigQuery pro 2. An object storage folder where the service stores all the data. 3. An endpoint (or multiple endpoints created via ClickHouse Cloud UI console) - a service URL that you use to connect to the service (for example, `https://dv2fzne24g.us-east-1.aws.clickhouse.cloud:8443`) -### BigQuery datasets vs ClickHouse Cloud databases {#bigquery-datasets-vs-clickhouse-cloud-databases} +### BigQuery Datasets vs ClickHouse Cloud Databases {#bigquery-datasets-vs-clickhouse-cloud-databases} ClickHouse logically groups tables into databases. Like BigQuery datasets, ClickHouse databases are logical containers that organize and control access to table data. -### BigQuery folders {#bigquery-folders} +### BigQuery Folders {#bigquery-folders} ClickHouse Cloud currently has no concept equivalent to BigQuery folders. -### BigQuery slot reservations and quotas {#bigquery-slot-reservations-and-quotas} +### BigQuery Slot reservations and Quotas {#bigquery-slot-reservations-and-quotas} Like BigQuery slot reservations, you can [configure vertical and horizontal autoscaling](/manage/scaling#configuring-vertical-auto-scaling) in ClickHouse Cloud. For vertical autoscaling, you can set the minimum and maximum size for the memory and CPU cores of the compute nodes for a service. The service will then scale as needed within those bounds. These settings are also available during the initial service creation flow. Each compute node in the service has the same size. You can change the number of compute nodes within a service with [horizontal scaling](/manage/scaling#manual-horizontal-scaling). @@ -78,7 +79,7 @@ When presented with multiple options for ClickHouse types, consider the actual r ## Query acceleration techniques {#query-acceleration-techniques} -### Primary and foreign keys and primary index {#primary-and-foreign-keys-and-primary-index} +### Primary and Foreign keys and Primary index {#primary-and-foreign-keys-and-primary-index} In BigQuery, a table can have [primary key and foreign key constraints](https://cloud.google.com/bigquery/docs/information-schema-table-constraints). Typically, primary and foreign keys are used in relational databases to ensure data integrity. A primary key value is normally unique for each row and is not `NULL`. Each foreign key value in a row must be present in the primary key column of the primary key table or be `NULL`. In BigQuery, these constraints are not enforced, but the query optimizer may use this information to optimize queries better. 
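+
+In ClickHouse, by contrast, the primary key is not a uniqueness constraint: it is
+defined via the `ORDER BY` (or `PRIMARY KEY`) clause and builds a sparse index used
+to skip data at query time. As a minimal sketch (the table and column names below
+are purely illustrative):
+
+```sql
+CREATE TABLE orders
+(
+    `order_id` UInt64,
+    `customer_id` UInt64,
+    `created_at` DateTime,
+    `amount` Decimal(18, 2)
+)
+ENGINE = MergeTree
+-- Sorts data on disk and builds the sparse primary index;
+-- uniqueness is not enforced.
+ORDER BY (customer_id, created_at);
+
+-- Filtering on a prefix of the primary key lets ClickHouse skip
+-- most granules instead of scanning the full table.
+SELECT count(), sum(amount)
+FROM orders
+WHERE customer_id = 42;
+```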
diff --git a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md similarity index 99% rename from docs/migrations/bigquery/migrating-to-clickhouse-cloud.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md index 44f8c8c7d20..0118a912fec 100644 --- a/docs/migrations/bigquery/migrating-to-clickhouse-cloud.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/02_migrating-to-clickhouse-cloud.md @@ -4,6 +4,7 @@ slug: /migrations/bigquery/migrating-to-clickhouse-cloud description: 'How to migrate your data from BigQuery to ClickHouse Cloud' keywords: ['BigQuery'] show_related_blogs: true +sidebar_label: 'Migration guide' --- import bigquery_2 from '@site/static/images/migrations/bigquery-2.png'; diff --git a/docs/migrations/bigquery/loading-data.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md similarity index 96% rename from docs/migrations/bigquery/loading-data.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md index 8e2558fe073..0bfdff8b2eb 100644 --- a/docs/migrations/bigquery/loading-data.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/03_loading-data.md @@ -24,7 +24,9 @@ Exporting data from BigQuery to ClickHouse is dependent on the size of your data | [contracts](https://github.com/ClickHouse/examples/blob/main/ethereum/schemas/contracts.md) | 57,225,837 | 350 | 45.35GB | 16 sec | 1 hr 51 min | 39.4 secs | | Total | 8.26 billion | 23,577 | 3.982TB | 8 min 3 sec | \> 6 days 5 hrs | 53 mins 45 secs | -## 1. Export table data to GCS {#1-export-table-data-to-gcs} + + +## Export table data to GCS {#1-export-table-data-to-gcs} In this step, we utilize the [BigQuery SQL workspace](https://cloud.google.com/bigquery/docs/bigquery-web-ui) to execute our SQL commands. Below, we export a BigQuery table named `mytable` to a GCS bucket using the [`EXPORT DATA`](https://cloud.google.com/bigquery/docs/reference/standard-sql/other-statements) statement. @@ -60,7 +62,7 @@ This approach has a number of advantages: - Exports produce multiple files automatically, limiting each to a maximum of 1GB of table data. This is beneficial to ClickHouse since it allows imports to be parallelized. - Parquet, as a column-oriented format, represents a better interchange format since it is inherently compressed and faster for BigQuery to export and ClickHouse to query -## 2. Importing data into ClickHouse from GCS {#2-importing-data-into-clickhouse-from-gcs} +## Importing data into ClickHouse from GCS {#2-importing-data-into-clickhouse-from-gcs} Once the export is complete, we can import this data into a ClickHouse table. You can use the [ClickHouse SQL console](/integrations/sql-clients/sql-console) or [`clickhouse-client`](/interfaces/cli) to execute the commands below. @@ -111,7 +113,7 @@ In the above query, we use the [`ifNull` function](/sql-reference/functions/func Alternatively, you can `SET input_format_null_as_default=1` and any missing or NULL values will be replaced by default values for their respective columns, if those defaults are specified. ::: -## 3. 
Testing successful data export {#3-testing-successful-data-export} +## Testing successful data export {#3-testing-successful-data-export} To test whether your data was properly inserted, simply run a `SELECT` query on your new table: @@ -121,6 +123,8 @@ SELECT * FROM mytable LIMIT 10; To export more BigQuery tables, simply redo the steps above for each additional table. + + ## Further reading and support {#further-reading-and-support} In addition to this guide, we also recommend reading our blog post that shows [how to use ClickHouse to speed up BigQuery and how to handle incremental imports](https://clickhouse.com/blog/clickhouse-bigquery-migrating-data-for-realtime-queries). diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/_04_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/_04_sql_translation_reference.md new file mode 100644 index 00000000000..e69de29bb2d diff --git a/docs/migrations/bigquery/index.md b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/index.md similarity index 52% rename from docs/migrations/bigquery/index.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/index.md index fdb90ce1ab8..9b793545e38 100644 --- a/docs/migrations/bigquery/index.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/03_bigquery/index.md @@ -11,6 +11,6 @@ In this section of the docs, learn more about the similarities and differences b | Page | Description | |-----------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------| -| [BigQuery vs ClickHouse Cloud](./equivalent-concepts.md) | The way resources are organized in ClickHouse Cloud is similar to BigQuery's resource hierarchy. We describe the specific differences in this article. | -| [Migrating from BigQuery to ClickHouse Cloud](./migrating-to-clickhouse-cloud.md) | Learn about why you might want to migrate from BigQuery to ClickHouse Cloud. | -| [Loading Data](./loading-data.md) | A guide showing you how to migrate data from BigQuery to ClickHouse. | +| [BigQuery vs ClickHouse Cloud](/migrations/bigquery/biquery-vs-clickhouse-cloud) | The way resources are organized in ClickHouse Cloud is similar to BigQuery's resource hierarchy. We describe the specific differences in this article. | +| [Migrating from BigQuery to ClickHouse Cloud](/migrations/bigquery/migrating-to-clickhouse-cloud) | Learn about why you might want to migrate from BigQuery to ClickHouse Cloud. | +| [Loading Data](/migrations/bigquery/loading-data) | A guide showing you how to migrate data from BigQuery to ClickHouse. 
| diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md new file mode 100644 index 00000000000..6139ff66887 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/01_overview.md @@ -0,0 +1,184 @@ +--- +sidebar_label: 'Overview' +slug: /migrations/snowflake-overview +description: 'Migrating from Snowflake to ClickHouse' +keywords: ['Snowflake'] +title: 'Migrate from Snowflake to ClickHouse' +show_related_blogs: true +--- + +import snowflake_architecture from '@site/static/images/cloud/onboard/discover/use_cases/snowflake_architecture.png'; +import cloud_architecture from '@site/static/images/cloud/onboard/discover/use_cases/cloud_architecture.png'; +import Image from '@theme/IdealImage'; + +# Snowflake to ClickHouse migration + +> This document provides an introduction to migrating data from Snowflake to ClickHouse. + +Snowflake is a cloud data warehouse primarily focused on migrating legacy on-premise +data warehousing workloads to the cloud. It is well-optimized for executing +long-running reports at scale. As datasets migrate to the cloud, data owners start +thinking about how else they can extract value from this data, including using +these datasets to power real-time applications for internal and external use cases. +When this happens, they often realize they need a database optimized for +powering real-time analytics, like ClickHouse. + +## Comparison {#comparison} + +In this section, we'll compare the key features of ClickHouse and Snowflake. + +### Similarities {#similarities} + +Snowflake is a cloud-based data warehousing platform that provides a scalable +and efficient solution for storing, processing, and analyzing large amounts of +data. +Like ClickHouse, Snowflake is not built on existing technologies but relies +on its own SQL query engine and custom architecture. + +Snowflake’s architecture is described as a hybrid between a shared-storage (shared-disk) +architecture and a shared-nothing architecture. A shared-storage architecture is +one where data is both accessible from all compute nodes using object +stores such as S3. A shared-nothing architecture is one where each compute node +stores a portion of the entire data set locally to respond to queries. This, in +theory, delivers the best of both models: the simplicity of a shared-disk +architecture and the scalability of a shared-nothing architecture. + +This design fundamentally relies on object storage as the primary storage medium, +which scales almost infinitely under concurrent access while providing high +resilience and scalable throughput guarantees. + +The image below from [docs.snowflake.com](https://docs.snowflake.com/en/user-guide/intro-key-concepts) +shows this architecture: + +Snowflake architecture + +Conversely, as an open-source and cloud-hosted product, ClickHouse can be deployed +in both shared-disk and shared-nothing architectures. The latter is typical for +self-managed deployments. While allowing for CPU and memory to be easily scaled, +shared-nothing configurations introduce classic data management challenges and +overhead of data replication, especially during membership changes. + +For this reason, ClickHouse Cloud utilizes a shared-storage architecture that is +conceptually similar to Snowflake. Data is stored once in an object store +(single copy), such as S3 or GCS, providing virtually infinite storage with +strong redundancy guarantees. 
Each node has access to this single copy of the
+data as well as its own local SSDs for cache purposes. Nodes can, in turn, be
+scaled to provide additional CPU and memory resources as required. Like Snowflake,
+S3's scalability properties address the classic limitation of shared-disk
+architectures (disk I/O and network bottlenecks) by ensuring the I/O throughput
+available to current nodes in a cluster is not impacted as additional nodes are
+added.
+
+ClickHouse Cloud architecture
+
+### Differences {#differences}
+
+Aside from the underlying storage formats and query engines, these architectures
+differ in a few subtle ways:
+
+* Compute resources in Snowflake are provided through a concept of [warehouses](https://docs.snowflake.com/en/user-guide/warehouses).
+  These consist of a number of nodes, each of a set size. While Snowflake
+  doesn't publish the specific architecture of its warehouses, it is
+  [generally understood](https://select.dev/posts/snowflake-warehouse-sizing)
+  that each node consists of 8 vCPUs, 16GiB of RAM, and 200GB of local storage (for cache).
+  The number of nodes depends on a t-shirt size, e.g. an x-small has one node,
+  a small 2, medium 4, large 8, etc. These warehouses are independent of the data
+  and can be used to query any database residing on object storage. When idle
+  and not subjected to query load, warehouses are paused - resuming when a query
+  is received. While storage costs are always reflected in billing, warehouses
+  are only charged when active.
+
+* ClickHouse Cloud utilizes a similar principle of nodes with local cache
+  storage. Rather than t-shirt sizes, users deploy a service with a total
+  amount of compute and available RAM. This, in turn, transparently
+  auto-scales (within defined limits) based on the query load - either
+  vertically by increasing (or decreasing) the resources for each node or
+  horizontally by raising/lowering the total number of nodes. ClickHouse
+  Cloud nodes currently have a 1:4 CPU-to-memory ratio, unlike Snowflake's 1:2.
+  While a looser coupling is possible, services are currently coupled to the
+  data, unlike Snowflake warehouses. Nodes will also pause if idle and
+  resume if subjected to queries. Users can also manually resize services if
+  needed.
+
+* ClickHouse Cloud's query cache is currently node specific, unlike
+  Snowflake's, which is delivered at a service layer independent of the
+  warehouse. Based on benchmarks, ClickHouse Cloud's node cache outperforms
+  Snowflake's.
+
+* Snowflake and ClickHouse Cloud take different approaches to scaling to
+  increase query concurrency. Snowflake addresses this through a feature
+  known as [multi-cluster warehouses](https://docs.snowflake.com/en/user-guide/warehouses-multicluster#benefits-of-multi-cluster-warehouses).
+  This feature allows users to add clusters to a warehouse. While this offers no
+  improvement to query latency, it does provide additional parallelization and
+  allows higher query concurrency. ClickHouse achieves this by adding more memory
+  and CPU to a service through vertical or horizontal scaling. We do not explore the
+  capabilities of these services to scale to higher concurrency in this guide,
+  focusing instead on latency, but acknowledge that this work should be done
+  for a complete comparison. 
However, we would expect ClickHouse to perform + well in any concurrency test, with Snowflake explicitly limiting the number + of concurrent queries allowed for a [warehouse to 8 by default](https://docs.snowflake.com/en/sql-reference/parameters#max-concurrency-level). + In comparison, ClickHouse Cloud allows up to 1000 queries to be executed per + node. + +* Snowflake's ability to switch compute size on a dataset, coupled with fast + resume times for warehouses, makes it an excellent experience for ad hoc + querying. For data warehouse and data lake use cases, this provides an + advantage over other systems. + +### Real-time analytics {#real-time-analytics} + +Based on public [benchmark](https://benchmark.clickhouse.com/#system=+%E2%98%81w|%EF%B8%8Fr|C%20c|nfe&type=-&machine=-ca2|gl|6ax|6ale|3al&cluster_size=-&opensource=-&tuned=+n&metric=hot&queries=-) data, +ClickHouse outperforms Snowflake for real-time analytics applications in the following areas: + +* **Query latency**: Snowflake queries have a higher query latency even + when clustering is applied to tables to optimize performance. In our + testing, Snowflake requires over twice the compute to achieve equivalent + ClickHouse performance on queries where a filter is applied that is part + of the Snowflake clustering key or ClickHouse primary key. While + Snowflake's [persistent query cache](https://docs.snowflake.com/en/user-guide/querying-persisted-results) + offsets some of these latency challenges, this is ineffective in cases + where the filter criteria are more diverse. This query cache effectiveness + can be further impacted by changes to the underlying data, with cache + entries invalidated when the table changes. While this is not the case in + the benchmark for our application, a real deployment would require the new, + more recent data to be inserted. Note that ClickHouse's query cache is + node specific and not [transactionally consistent](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design), + making it [better suited ](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design) + to real-time analytics. Users also have granular control over its use + with the ability to control its use on a [per-query basis](/operations/settings/settings#use_query_cache), + its [precise size](/operations/settings/settings#query_cache_max_size_in_bytes), + whether a [query is cached](/operations/settings/settings#enable_writes_to_query_cache) + (limits on duration or required number of executions), and whether it is + only [passively used](https://clickhouse.com/blog/introduction-to-the-clickhouse-query-cache-and-design#using-logs-and-settings). + +* **Lower cost**: Snowflake warehouses can be configured to suspend after + a period of query inactivity. Once suspended, charges are not incurred. + Practically, this inactivity check can [only be lowered to 60s](https://docs.snowflake.com/en/sql-reference/sql/alter-warehouse). + Warehouses will automatically resume, within several seconds, once a query + is received. With Snowflake only charging for resources when a warehouse + is under use, this behavior caters to workloads that often sit idle, like + ad-hoc querying. + + However, many real-time analytics workloads require ongoing real-time data + ingestion and frequent querying that doesn't benefit from idling (like + customer-facing dashboards). This means warehouses must often be fully + active and incurring charges. 
This negates the cost-benefit of idling as + well as any performance advantage that may be associated with Snowflake's + ability to resume a responsive state faster than alternatives. This active + state requirement, when combined with ClickHouse Cloud's lower per-second + cost for an active state, results in ClickHouse Cloud offering a + significantly lower total cost for these kinds of workloads. + +* **Predictable pricing of features:** Features such as materialized views + and clustering (equivalent to ClickHouse's ORDER BY) are required to reach + the highest levels of performance in real-time analytics use cases. These + features incur additional charges in Snowflake, requiring not only a + higher tier, which increases costs per credit by 1.5x, but also + unpredictable background costs. For instance, materialized views incur a + background maintenance cost, as does clustering, which is hard to predict + prior to use. In contrast, these features incur no additional cost in + ClickHouse Cloud, except additional CPU and memory usage at insert time, + typically negligible outside of high insert workload use cases. We have + observed in our benchmark that these differences, along with lower query + latencies and higher compression, result in significantly lower costs with + ClickHouse. diff --git a/docs/migrations/snowflake.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md similarity index 77% rename from docs/migrations/snowflake.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md index 38d3b8dfac1..468a8b6193b 100644 --- a/docs/migrations/snowflake.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/02_migration_guide.md @@ -1,23 +1,27 @@ --- -sidebar_label: 'Snowflake' -sidebar_position: 20 +sidebar_label: 'Migration guide' slug: /migrations/snowflake description: 'Migrating from Snowflake to ClickHouse' keywords: ['Snowflake'] title: 'Migrating from Snowflake to ClickHouse' -show_related_blogs: true +show_related_blogs: false --- import migrate_snowflake_clickhouse from '@site/static/images/migrations/migrate_snowflake_clickhouse.png'; import Image from '@theme/IdealImage'; -# Migrating from Snowflake to ClickHouse +# Migrate from Snowflake to ClickHouse -This guide shows how to migrate data from Snowflake to ClickHouse. +> This guide shows you how to migrate data from Snowflake to ClickHouse. -Migrating data between Snowflake and ClickHouse requires the use of an object store, such as S3, as an intermediate storage for transfer. The migration process also relies on using the commands `COPY INTO` from Snowflake and `INSERT INTO SELECT` of ClickHouse. +Migrating data between Snowflake and ClickHouse requires the use of an object store, +such as S3, as an intermediate storage for transfer. The migration process also +relies on using the commands `COPY INTO` from Snowflake and `INSERT INTO SELECT` +of ClickHouse. -## 1. Exporting data from Snowflake {#1-exporting-data-from-snowflake} + + +## Export data from Snowflake {#1-exporting-data-from-snowflake} Migrating from Snowflake to ClickHouse @@ -54,7 +58,7 @@ COPY INTO @external_stage/mydataset from mydataset max_file_size=157286400 heade For a dataset around 5TB of data with a maximum file size of 150MB, and using a 2X-Large Snowflake warehouse located in the same AWS `us-east-1` region, copying data to the S3 bucket will take around 30 minutes. -## 2. 
Importing to ClickHouse {#2-importing-to-clickhouse} +## Import to ClickHouse {#2-importing-to-clickhouse} Once the data is staged in intermediary object storage, ClickHouse functions such as the [s3 table function](/sql-reference/table-functions/s3) can be used to insert the data into a table, as shown below. @@ -65,10 +69,10 @@ Assuming the following table target schema: ```sql CREATE TABLE default.mydataset ( - `timestamp` DateTime64(6), - `some_text` String, - `some_file` Tuple(filename String, version String), - `complex_data` Tuple(name String, description String), + `timestamp` DateTime64(6), + `some_text` String, + `some_file` Tuple(filename String, version String), + `complex_data` Tuple(name String, description String), ) ENGINE = MergeTree ORDER BY (timestamp) @@ -79,16 +83,16 @@ We can then use the `INSERT INTO SELECT` command to insert the data from S3 into ```sql INSERT INTO mydataset SELECT - timestamp, - some_text, - JSONExtract( - ifNull(some_file, '{}'), - 'Tuple(filename String, version String)' - ) AS some_file, - JSONExtract( - ifNull(complex_data, '{}'), - 'Tuple(filename String, description String)' - ) AS complex_data, + timestamp, + some_text, + JSONExtract( + ifNull(some_file, '{}'), + 'Tuple(filename String, version String)' + ) AS some_file, + JSONExtract( + ifNull(complex_data, '{}'), + 'Tuple(filename String, description String)' + ) AS complex_data, FROM s3('https://mybucket.s3.amazonaws.com/mydataset/mydataset*.parquet') SETTINGS input_format_null_as_default = 1, -- Ensure columns are inserted as default if values are null input_format_parquet_case_insensitive_column_matching = 1 -- Column matching between source data and target table should be case insensitive @@ -100,10 +104,12 @@ The `VARIANT` and `OBJECT` columns in the original Snowflake table schema will b Nested structures such as `some_file` are converted to JSON strings on copy by Snowflake. Importing this data requires us to transform these structures to Tuples at insert time in ClickHouse, using the [JSONExtract function](/sql-reference/functions/json-functions#jsonextract) as shown above. ::: -## 3. Testing successful data export {#3-testing-successful-data-export} +## Test successful data export {#3-testing-successful-data-export} To test whether your data was properly inserted, simply run a `SELECT` query on your new table: ```sql SELECT * FROM mydataset LIMIT 10; ``` + + \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md new file mode 100644 index 00000000000..450a58fe32f --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/03_sql_translation_reference.md @@ -0,0 +1,114 @@ +--- +sidebar_label: 'SQL translation reference' +slug: /migrations/snowflake-translation-reference +description: 'SQL translation reference' +keywords: ['Snowflake'] +title: 'Migrating from Snowflake to ClickHouse' +show_related_blogs: true +--- + +# Snowflake SQL translation guide + +## Data types {#data-types} + +### Numerics {#numerics} + +Users moving data between ClickHouse and Snowflake will immediately notice that +ClickHouse offers more granular precision concerning declaring numerics. For example, +Snowflake offers the type Number for numerics. This requires the user to specify a +precision (total number of digits) and scale (digits to the right of the decimal place) +up to a total of 38. 
Integer declarations are synonymous with Number, and simply +define a fixed precision and scale where the range is the same. This convenience +is possible as modifying the precision (scale is 0 for integers) does not impact the +size of data on disk in Snowflake - the minimal required bytes are used for a +numeric range at write time at a micro partition level. The scale does, however, +impact storage space and is offset with compression. A `Float64` type offers a +wider range of values with a loss of precision. + +Contrast this with ClickHouse, which offers multiple signed and unsigned +precision for floats and integers. With these, ClickHouse users can be explicit about +the precision required for integers to optimize storage and memory overhead. A +Decimal type, equivalent to Snowflake’s Number type, also offers twice the +precision and scale at 76 digits. In addition to a similar `Float64` value, +ClickHouse also provides a `Float32` for when precision is less critical and +compression paramount. + +### Strings {#strings} + +ClickHouse and Snowflake take contrasting approaches to the storage of string +data. The `VARCHAR` in Snowflake holds Unicode characters in UTF-8, allowing the +user to specify a maximum length. This length has no impact on storage or +performance, with the minimum number of bytes always used to store a string, and +rather provides only constraints useful for downstream tooling. Other types, such +as `Text` and `NChar`, are simply aliases for this type. ClickHouse conversely +stores all [string data as raw bytes](/sql-reference/data-types/string) with a `String` +type (no length specification required), deferring encoding to the user, with +[query time functions](/sql-reference/functions/string-functions#lengthutf8) +available for different encodings. We refer the reader to ["Opaque data argument"](https://utf8everywhere.org/#cookie) +for the motivation as to why. The ClickHouse `String` is thus more comparable +to the Snowflake Binary type in its implementation. Both [Snowflake](https://docs.snowflake.com/en/sql-reference/collation) +and [ClickHouse](/sql-reference/statements/select/order-by#collation-support) +support “collation”, allowing users to override how strings are sorted and compared. + +### Semi-structured types {#semi-structured-data} + +Snowflake supports the `VARIANT`, `OBJECT` and `ARRAY` types for semi-structured +data. + +ClickHouse offers the equivalent [`Variant`](/sql-reference/data-types/variant), +[`Object`](/sql-reference/data-types/object-data-type) (deprecated) and [`Array`](/sql-reference/data-types/array) +types. Additionally, ClickHouse has the [`JSON`](/sql-reference/data-types/newjson) +type which replaces the now deprecated `Object('json')` type and is particularly +performant and storage efficient in [comparison to other native JSON types](https://jsonbench.com/). + +ClickHouse also supports named [`Tuple`s](/sql-reference/data-types/tuple) and arrays of Tuples +via the [`Nested`](/sql-reference/data-types/nested-data-structures/nested) type, +allowing users to explicitly map nested structures. This allows codecs and type +optimizations to be applied throughout the hierarchy, unlike Snowflake, which +requires the user to use the `OBJECT`, `VARIANT`, and `ARRAY` types for the outer +object and does not allow [explicit internal typing](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#characteristics-of-an-object). 
+This internal typing also simplifies queries on nested numerics in ClickHouse, +which do not need to be cast and can be used in index definitions. + +In ClickHouse, codecs and optimized types can also be applied to substructures. +This provides an added benefit that compression with nested structures remains +excellent, and comparable, to flattened data. In contrast, as a result of the +inability to apply specific types to substructures, Snowflake recommends [flattening +data to achieve optimal compression](https://docs.snowflake.com/en/user-guide/semistructured-considerations#storing-semi-structured-data-in-a-variant-column-vs-flattening-the-nested-structure). +Snowflake also [imposes size restrictions](https://docs.snowflake.com/en/user-guide/semistructured-considerations#data-size-limitations) +for these data types. + +### Type reference {#type-reference} + +| Snowflake | ClickHouse | Note | +|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`NUMBER`](https://docs.snowflake.com/en/sql-reference/data-types-numeric) | [`Decimal`](/sql-reference/data-types/decimal) | ClickHouse supports twice the precision and scale than Snowflake - 76 digits vs. 38. | +| [`FLOAT`, `FLOAT4`, `FLOAT8`](https://docs.snowflake.com/en/sql-reference/data-types-numeric#data-types-for-floating-point-numbers) | [`Float32`, `Float64`](/sql-reference/data-types/float) | All floats in Snowflake are 64 bit. | +| [`VARCHAR`](https://docs.snowflake.com/en/sql-reference/data-types-text#varchar) | [`String`](/sql-reference/data-types/string) | | +| [`BINARY`](https://docs.snowflake.com/en/sql-reference/data-types-text#binary) | [`String`](/sql-reference/data-types/string) | | +| [`BOOLEAN`](https://docs.snowflake.com/en/sql-reference/data-types-logical) | [`Bool`](/sql-reference/data-types/boolean) | | +| [`DATE`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#date) | [`Date`](/sql-reference/data-types/date), [`Date32`](/sql-reference/data-types/date32) | `DATE` in Snowflake offers a wider date range than ClickHouse e.g. min for `Date32` is `1900-01-01` and `Date` `1970-01-01`. `Date` in ClickHouse provides more cost efficient (two byte) storage. | +| [`TIME(N)`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#time) | No direct equivalent but can be represented by [`DateTime`](/sql-reference/data-types/datetime) and [`DateTime64(N)`](/sql-reference/data-types/datetime64). | `DateTime64` uses the same concepts of precision. 
| +| [`TIMESTAMP`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp) - [`TIMESTAMP_LTZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz), [`TIMESTAMP_NTZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz), [`TIMESTAMP_TZ`](https://docs.snowflake.com/en/sql-reference/data-types-datetime#timestamp-ltz-timestamp-ntz-timestamp-tz) | [`DateTime`](/sql-reference/data-types/datetime) and [`DateTime64`](/sql-reference/data-types/datetime64) | `DateTime` and `DateTime64` can optionally have a TZ parameter defined for the column. If not present, the server's timezone is used. Additionally a `--use_client_time_zone` parameter is available for the client. | +| [`VARIANT`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#variant) | [`JSON`, `Tuple`, `Nested`](/interfaces/formats) | `JSON` type is experimental in ClickHouse. This type infers the column types at insert time. `Tuple`, `Nested` and `Array` can also be used to build explicitly type structures as an alternative. | +| [`OBJECT`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#object) | [`Tuple`, `Map`, `JSON`](/interfaces/formats) | Both `OBJECT` and `Map` are analogous to `JSON` type in ClickHouse where the keys are a `String`. ClickHouse requires the value to be consistent and strongly typed whereas Snowflake uses `VARIANT`. This means the values of different keys can be a different type. If this is required in ClickHouse, explicitly define the hierarchy using `Tuple` or rely on `JSON` type. | +| [`ARRAY`](https://docs.snowflake.com/en/sql-reference/data-types-semistructured#array) | [`Array`](/sql-reference/data-types/array), [`Nested`](/sql-reference/data-types/nested-data-structures/nested) | `ARRAY` in Snowflake uses `VARIANT` for the elements - a super type. Conversely these are strongly typed in ClickHouse. | +| [`GEOGRAPHY`](https://docs.snowflake.com/en/sql-reference/data-types-geospatial#geography-data-type) | [`Point`, `Ring`, `Polygon`, `MultiPolygon`](/sql-reference/data-types/geo) | Snowflake imposes a coordinate system (WGS 84) while ClickHouse applies at query time. | +| [`GEOMETRY`](https://docs.snowflake.com/en/sql-reference/data-types-geospatial#geometry-data-type) | [`Point`, `Ring`, `Polygon`, `MultiPolygon`](/sql-reference/data-types/geo) | | | + +| ClickHouse Type | Description | +|-------------------|-----------------------------------------------------------------------------------------------------| +| `IPv4` and `IPv6` | IP-specific types, potentially allowing more efficient storage than Snowflake. | +| `FixedString` | Allows a fixed length of bytes to be used, which is useful for hashes. | +| `LowCardinality` | Allows any type to be dictionary encoded. Useful for when the cardinality is expected to be < 100k. | +| `Enum` | Allows efficient encoding of named values in either 8 or 16-bit ranges. | +| `UUID` | For efficient storage of UUIDs. | +| `Array(Float32)` | Vectors can be represented as an Array of Float32 with supported distance functions. | + +Finally, ClickHouse offers the unique ability to store the intermediate +[state of aggregate functions](/sql-reference/data-types/aggregatefunction). This +state is implementation-specific, but allows the result of an aggregation to be +stored and later queried (with corresponding merge functions). 
Typically, this
+feature is used via a materialized view and offers the ability to improve the
+performance of specific queries with minimal storage cost by storing the
+incremental result of queries over inserted data.
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json
new file mode 100644
index 00000000000..50b05cb45a0
--- /dev/null
+++ b/docs/cloud/onboard/02_migrate/01_migration_guides/04_snowflake/_category_.json
@@ -0,0 +1,5 @@
+{
+  "label": "Snowflake",
+  "collapsible": true,
+  "collapsed": true
+}
\ No newline at end of file
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md
new file mode 100644
index 00000000000..2104a173b02
--- /dev/null
+++ b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/01_overview.md
@@ -0,0 +1,12 @@
+---
+sidebar_label: 'Overview'
+slug: /migrations/elastic-overview
+description: 'Migrating from Elasticsearch to ClickHouse'
+keywords: ['Elasticsearch']
+title: 'Migrate from Elasticsearch to ClickHouse'
+show_related_blogs: true
+---
+
+# Elasticsearch to ClickHouse migration
+
+For observability use cases, see the [Elasticsearch to ClickStack](/use-cases/observability/clickstack/migration/elastic) migration docs.
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json
new file mode 100644
index 00000000000..4f49621cf3d
--- /dev/null
+++ b/docs/cloud/onboard/02_migrate/01_migration_guides/05_elastic/_category_.json
@@ -0,0 +1,5 @@
+{
+  "label": "Elasticsearch",
+  "collapsible": true,
+  "collapsed": true
+}
\ No newline at end of file
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md
new file mode 100644
index 00000000000..785eba5d98a
--- /dev/null
+++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/01_overview.md
@@ -0,0 +1,57 @@
+---
+sidebar_label: 'Overview'
+slug: /migrations/redshift-overview
+description: 'Migrating from Amazon Redshift to ClickHouse'
+keywords: ['Redshift']
+title: 'Comparing ClickHouse Cloud and Amazon Redshift'
+---
+
+# Amazon Redshift to ClickHouse migration
+
+> This document provides an introduction to migrating data from Amazon
+Redshift to ClickHouse.
+
+## Introduction {#introduction}
+
+Amazon Redshift is a cloud data warehouse that provides reporting and
+analytics capabilities for structured and semi-structured data. It was
+designed to handle analytical workloads on big data sets using
+column-oriented database principles similar to ClickHouse. As part of the
+AWS offering, it is often the default solution AWS users turn to for their
+analytical data needs.
+
+While Redshift is attractive to existing AWS users due to its tight integration
+with the Amazon ecosystem, users who adopt it to power real-time analytics
+applications find themselves in need of a more optimized solution for this
+purpose. As a result, they increasingly turn to ClickHouse to benefit from
+superior query performance and data compression, either as a replacement or
+a "speed layer" deployed alongside existing Redshift workloads. 
+
+## ClickHouse vs Redshift {#clickhouse-vs-redshift}
+
+For users heavily invested in the AWS ecosystem, Redshift represents a
+natural choice when faced with data warehousing needs. Redshift differs from
+ClickHouse in one important respect: it optimizes its engine for data
+warehousing workloads requiring complex reporting and analytical queries.
+Across all deployment modes, the following two limitations make it difficult
+to use Redshift for real-time analytical workloads:
+* Redshift [compiles code for each query execution plan](https://docs.aws.amazon.com/redshift/latest/dg/c-query-performance.html),
+which adds significant overhead to first-time query execution. This overhead can
+be justified when query patterns are predictable and compiled execution plans
+can be stored in a query cache. However, this introduces challenges for interactive
+applications with variable queries. Even when Redshift is able to exploit this
+code compilation cache, ClickHouse is faster on most queries. See ["ClickBench"](https://benchmark.clickhouse.com/#system=+%E2%98%81w|%EF%B8%8Fr|C%20c|Rf&type=-&machine=-ca2|gl|6ax|6ale|3al&cluster_size=-&opensource=-&tuned=+n&metric=hot&queries=-).
+* Redshift [limits concurrency to 50 across all queues](https://docs.aws.amazon.com/redshift/latest/dg/c_workload_mngmt_classification.html),
+which (while adequate for BI) makes it inappropriate for highly concurrent
+analytical applications.
+
+Conversely, while ClickHouse can also be utilized for complex analytical queries,
+it is optimized for real-time analytical workloads, either powering applications
+or acting as a warehouse acceleration layer. As a result, Redshift users typically
+replace or augment Redshift with ClickHouse for the following reasons:
+
+| Advantage | Description |
+|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| **Lower query latencies** | ClickHouse achieves lower query latencies, including for varied query patterns, under high concurrency and while subjected to streaming inserts. Even when your query misses a cache, which is inevitable in interactive user-facing analytics, ClickHouse can still process it fast. |
+| **Higher concurrent query limits** | ClickHouse places much higher limits on concurrent queries, which is vital for real-time application experiences. In ClickHouse, self-managed as well as cloud, you can scale up your compute allocation to achieve the concurrency your application needs for each service. The level of permitted query concurrency is configurable in ClickHouse, with ClickHouse Cloud defaulting to a value of 1000. |
+| **Superior data compression** | ClickHouse offers superior data compression, which allows users to reduce their total storage (and thus cost) or persist more data at the same cost and derive more real-time insights from their data. See "ClickHouse vs Redshift Storage Efficiency" below. 
| diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md new file mode 100644 index 00000000000..506c9957e58 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/02_migration_guide.md @@ -0,0 +1,13 @@ +--- +sidebar_label: 'Migration guide' +slug: /migrations/redshift/migration-guide +description: 'Migrating from Amazon Redshift to ClickHouse' +keywords: ['Redshift'] +title: 'Amazon Redshift to ClickHouse migration guide' +--- + +import MigrationGuide from '@site/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md' + +# Amazon Redshift to ClickHouse migration guide + + \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md new file mode 100644 index 00000000000..67585e4ea72 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/03_sql_translation_reference.md @@ -0,0 +1,95 @@ +--- +sidebar_label: 'SQL translation reference' +slug: /migrations/redshift/sql-translation-reference +description: 'SQL translation reference for Amazon Redshift to ClickHouse' +keywords: ['Redshift'] +title: 'Amazon Redshift SQL translation guide' +--- + +# Amazon Redshift SQL translation guide + +## Data types {#data-types} + +Users moving data between ClickHouse and Redshift will immediately notice +that ClickHouse offers a more extensive range of types, which are also less +restrictive. While Redshift requires users to specify possible string +lengths, even if variable, ClickHouse removes this restriction and burden +from the user by storing strings without encoding as bytes. The ClickHouse +String type thus has no limits or length specification requirements. + +Furthermore, users can exploit Arrays, Tuples, and Enums - absent from +Redshift as first-class citizens (although Arrays/Structs can be imitated +with `SUPER`) and a common frustration of users. ClickHouse additionally +allows the persistence, either at query time or even in a table, of +aggregation states. This will enable data to be pre-aggregated, typically +using a materialized view, and can dramatically improve query performance +for common queries. 
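+
+For example, here is a minimal sketch of this pattern; the `events` table, its
+columns, and the target table are hypothetical and not part of this guide. A
+materialized view maintains partial aggregation states in an
+`AggregatingMergeTree` table as rows are inserted, and queries merge those states:
+
+```sql
+-- Hypothetical source table
+CREATE TABLE events
+(
+    user_id UInt64,
+    url     String,
+    ts      DateTime
+)
+ENGINE = MergeTree
+ORDER BY (url, ts);
+
+-- Table storing partial aggregation states per URL and day
+CREATE TABLE daily_uniques
+(
+    day     Date,
+    url     String,
+    uniques AggregateFunction(uniq, UInt64)
+)
+ENGINE = AggregatingMergeTree
+ORDER BY (day, url);
+
+-- Incrementally maintain the states on every insert into `events`
+CREATE MATERIALIZED VIEW daily_uniques_mv TO daily_uniques AS
+SELECT
+    toDate(ts)         AS day,
+    url,
+    uniqState(user_id) AS uniques
+FROM events
+GROUP BY day, url;
+
+-- Merge the pre-aggregated states at query time
+SELECT day, url, uniqMerge(uniques) AS unique_users
+FROM daily_uniques
+GROUP BY day, url;
+```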
+ +Below we map the equivalent ClickHouse type for each Redshift type: + +| Redshift | ClickHouse | +|------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| [`SMALLINT`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int8`](/sql-reference/data-types/int-uint) * | +| [`INTEGER`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int32`](/sql-reference/data-types/int-uint) * | +| [`BIGINT`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-integer-types) | [`Int64`](/sql-reference/data-types/int-uint) * | +| [`DECIMAL`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-decimal-or-numeric-type) | [`UInt128`, `UInt256`, `Int128`, `Int256`](/sql-reference/data-types/int-uint), [`Decimal(P, S)`, `Decimal32(S)`, `Decimal64(S)`, `Decimal128(S)`, `Decimal256(S)`](/sql-reference/data-types/decimal) - (high precision and ranges possible) | +| [`REAL`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-floating-point-types) | [`Float32`](/sql-reference/data-types/float) | +| [`DOUBLE PRECISION`](https://docs.aws.amazon.com/redshift/latest/dg/r_Numeric_types201.html#r_Numeric_types201-floating-point-types) | [`Float64`](/sql-reference/data-types/float) | +| [`BOOLEAN`](https://docs.aws.amazon.com/redshift/latest/dg/r_Boolean_type.html) | [`Bool`](/sql-reference/data-types/boolean) | +| [`CHAR`](https://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html#r_Character_types-char-or-character) | [`String`](/sql-reference/data-types/string), [`FixedString`](/sql-reference/data-types/fixedstring) | +| [`VARCHAR`](https://docs.aws.amazon.com/redshift/latest/dg/r_Character_types.html#r_Character_types-varchar-or-character-varying) ** | [`String`](/sql-reference/data-types/string) | +| [`DATE`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-date) | [`Date32`](/sql-reference/data-types/date32) | +| [`TIMESTAMP`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timestamp) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`TIMESTAMPTZ`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timestamptz) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) | +| [`GEOMETRY`](https://docs.aws.amazon.com/redshift/latest/dg/geospatial-overview.html) | [Geo Data Types](/sql-reference/data-types/geo) | +| [`GEOGRAPHY`](https://docs.aws.amazon.com/redshift/latest/dg/geospatial-overview.html) | [Geo Data Types](/sql-reference/data-types/geo) (less developed e.g. 
no coordinate systems - can be emulated [with functions](/sql-reference/functions/geo/)) |
+| [`HLLSKETCH`](https://docs.aws.amazon.com/redshift/latest/dg/r_HLLSKTECH_type.html) | [`AggregateFunction(uniqHLL12, X)`](/sql-reference/data-types/aggregatefunction) |
+| [`SUPER`](https://docs.aws.amazon.com/redshift/latest/dg/r_SUPER_type.html) | [`Tuple`](/sql-reference/data-types/tuple), [`Nested`](/sql-reference/data-types/nested-data-structures/nested), [`Array`](/sql-reference/data-types/array), [`JSON`](/sql-reference/data-types/newjson), [`Map`](/sql-reference/data-types/map) |
+| [`TIME`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-time) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) |
+| [`TIMETZ`](https://docs.aws.amazon.com/redshift/latest/dg/r_Datetime_types.html#r_Datetime_types-timetz) | [`DateTime`](/sql-reference/data-types/datetime), [`DateTime64`](/sql-reference/data-types/datetime64) |
+| [`VARBYTE`](https://docs.aws.amazon.com/redshift/latest/dg/r_VARBYTE_type.html) ** | [`String`](/sql-reference/data-types/string) combined with [`Bit`](/sql-reference/functions/bit-functions) and [Encoding](/sql-reference/functions/encoding-functions/#hex) functions |
+
+* ClickHouse additionally supports unsigned integers with extended ranges i.e. `UInt8`, `UInt16`, `UInt32` and `UInt64`.
+**ClickHouse’s String type is unlimited by default but can be constrained to specific lengths using Constraints.
+
+## DDL syntax {#compression}
+
+### Sorting keys {#sorting-keys}
+
+Both ClickHouse and Redshift have the concept of a “sorting key”, which defines
+how data is sorted when being stored. Redshift defines the sorting key using the
+`SORTKEY` clause:
+
+```sql
+CREATE TABLE some_table(...) SORTKEY (column1, column2)
+```
+
+Comparatively, ClickHouse uses an `ORDER BY` clause to specify the sort order:
+
+```sql
+CREATE TABLE some_table(...) ENGINE = MergeTree ORDER BY (column1, column2)
+```
+
+In most cases, you can use the same sorting key columns and order in ClickHouse
+as Redshift, assuming you are using the default `COMPOUND` type. When data is
+added to Redshift, you should run the `VACUUM` and `ANALYZE` commands to re-sort
+newly added data and update the statistics for the query planner - otherwise, the
+unsorted space grows. No such process is required for ClickHouse.
+
+Redshift supports a couple of convenience features for sorting keys. The first is
+automatic sorting keys (using `SORTKEY AUTO`). While this may be appropriate for
+getting started, explicit sorting keys ensure the best performance and storage
+efficiency when the sorting key is optimal. The second is the `INTERLEAVED` sort key,
+which gives equal weight to a subset of columns in the sort key to improve
+performance when a query uses one or more secondary sort columns. ClickHouse
+supports explicit [projections](/data-modeling/projections), which achieve the
+same end result with a slightly different setup (see the sketch at the end of this page).
+
+Users should be aware that the “primary key” concept represents different things
+in ClickHouse and Redshift. In Redshift, the primary key resembles the traditional
+RDBMS concept intended to enforce constraints. However, it is not strictly
+enforced in Redshift and instead acts as a hint for the query planner and data
+distribution among nodes. In ClickHouse, the primary key denotes the columns used
+to construct the sparse primary index, which ensures the data is ordered on
+disk, maximizing compression while avoiding polluting the primary index and
+wasting memory.
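+
+As a minimal sketch of the projections mentioned above (reusing the hypothetical
+`some_table` and `column2` from the sorting-key examples), a projection can store
+the data in an alternative order to serve queries that filter on a secondary column:
+
+```sql
+-- Add a projection that stores the data ordered by a secondary column
+ALTER TABLE some_table
+    ADD PROJECTION by_column2
+    (
+        SELECT *
+        ORDER BY column2
+    );
+
+-- Build the projection for parts that already exist
+ALTER TABLE some_table MATERIALIZE PROJECTION by_column2;
+```
+
+Queries that filter mainly on `column2` can then be served from the projection automatically.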
diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json new file mode 100644 index 00000000000..95419dcb41c --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/06_redshift/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Redshift", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/integrations/migration/clickhouse-to-cloud.md b/docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/01_clickhouse-to-cloud.md similarity index 95% rename from docs/integrations/migration/clickhouse-to-cloud.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/01_clickhouse-to-cloud.md index 551314651e2..fed90c525c3 100644 --- a/docs/integrations/migration/clickhouse-to-cloud.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/01_clickhouse-to-cloud.md @@ -1,6 +1,5 @@ --- -sidebar_position: 10 -sidebar_label: 'ClickHouse to ClickHouse Cloud' +sidebar_label: 'ClickHouse OSS' slug: /cloud/migration/clickhouse-to-cloud title: 'Migrating between self-managed ClickHouse and ClickHouse Cloud' description: 'Page describing how to migrate between self-managed ClickHouse and ClickHouse Cloud' @@ -19,7 +18,7 @@ import self_managed_06 from '@site/static/images/integrations/migration/self-man Migrating Self-managed ClickHouse -This guide will show how to migrate from a self-managed ClickHouse server to ClickHouse Cloud, and also how to migrate between ClickHouse Cloud services. The [`remoteSecure`](../../sql-reference/table-functions/remote.md) function is used in `SELECT` and `INSERT` queries to allow access to remote ClickHouse servers, which makes migrating tables as simple as writing an `INSERT INTO` query with an embedded `SELECT`. +This guide will show how to migrate from a self-managed ClickHouse server to ClickHouse Cloud, and also how to migrate between ClickHouse Cloud services. The [`remoteSecure`](/sql-reference/table-functions/remote) function is used in `SELECT` and `INSERT` queries to allow access to remote ClickHouse servers, which makes migrating tables as simple as writing an `INSERT INTO` query with an embedded `SELECT`. 
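+
+As a minimal sketch of this pattern (the host name, credentials, and table names
+below are placeholders rather than values from this guide), a table can be copied
+by running a single statement on the destination service:
+
+```sql
+-- Run on the destination ClickHouse Cloud service
+INSERT INTO db.table
+SELECT * FROM remoteSecure('source-hostname.example.com:9440', 'db.table', 'default', 'PASSWORD');
+```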
## Migrating from Self-managed ClickHouse to ClickHouse Cloud {#migrating-from-self-managed-clickhouse-to-clickhouse-cloud} diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/_category_.json new file mode 100644 index 00000000000..9720f826193 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/07_OSS_to_Cloud/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "OSS to Cloud", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/integrations/migration/clickhouse-local-etl.md b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/01_clickhouse-local-etl.md similarity index 99% rename from docs/integrations/migration/clickhouse-local-etl.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/01_clickhouse-local-etl.md index 2faf0a935d7..5e3eabc70c9 100644 --- a/docs/integrations/migration/clickhouse-local-etl.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/01_clickhouse-local-etl.md @@ -1,6 +1,5 @@ --- sidebar_label: 'Using clickhouse-local' -sidebar_position: 20 keywords: ['clickhouse', 'migrate', 'migration', 'migrating', 'data', 'etl', 'elt', 'clickhouse-local', 'clickhouse-client'] slug: /cloud/migration/clickhouse-local title: 'Migrating to ClickHouse using clickhouse-local' diff --git a/docs/integrations/migration/etl-tool-to-clickhouse.md b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/02_etl-tool-to-clickhouse.md similarity index 98% rename from docs/integrations/migration/etl-tool-to-clickhouse.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/02_etl-tool-to-clickhouse.md index f66e6ff2c47..32a0c168c5a 100644 --- a/docs/integrations/migration/etl-tool-to-clickhouse.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/02_etl-tool-to-clickhouse.md @@ -1,6 +1,5 @@ --- sidebar_label: 'Using a 3rd-party ETL Tool' -sidebar_position: 20 keywords: ['clickhouse', 'migrate', 'migration', 'migrating', 'data', 'etl', 'elt', 'clickhouse-local', 'clickhouse-client'] slug: /cloud/migration/etl-tool-to-clickhouse title: 'Using a 3rd-party ETL Tool' diff --git a/docs/integrations/migration/object-storage-to-clickhouse.md b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/03_object-storage-to-clickhouse.md similarity index 92% rename from docs/integrations/migration/object-storage-to-clickhouse.md rename to docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/03_object-storage-to-clickhouse.md index 2f323db04ef..a0788a80aa0 100644 --- a/docs/integrations/migration/object-storage-to-clickhouse.md +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/03_object-storage-to-clickhouse.md @@ -1,5 +1,5 @@ --- -title: 'Object Storage to ClickHouse Cloud' +title: 'Using object storage' description: 'Moving data from object storage to ClickHouse Cloud' keywords: ['object storage', 's3', 'azure blob', 'gcs', 'migration'] slug: /integrations/migration/object-storage-to-clickhouse @@ -20,7 +20,7 @@ table functions for migrating data stored in Cloud Object Storage into a ClickHo - [gcs](/sql-reference/table-functions/gcs) - [azureBlobStorage](/sql-reference/table-functions/azureBlobStorage) -If your current database system is not able to directly offload data into a Cloud Object Storage, you could use a [third-party ETL/ELT tool](./etl-tool-to-clickhouse.md) or 
[clickhouse-local](./clickhouse-local-etl.md) for moving data +If your current database system is not able to directly offload data into a Cloud Object Storage, you could use a [third-party ETL/ELT tool](/cloud/migration/etl-tool-to-clickhouse) or [clickhouse-local](/cloud/migration/clickhouse-local) for moving data from you current database system to Cloud Object Storage, in order to migrate that data in a second step into a ClickHouse Cloud table. Although this is a two steps process (offload data into a Cloud Object Storage, then load into ClickHouse), the advantage is that this diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/_category_.json new file mode 100644 index 00000000000..61c592ce8a0 --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/08_other_methods/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Other...", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json b/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json new file mode 100644 index 00000000000..aca0c529bce --- /dev/null +++ b/docs/cloud/onboard/02_migrate/01_migration_guides/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Migration guides", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md b/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md new file mode 100644 index 00000000000..e5d813d8226 --- /dev/null +++ b/docs/cloud/onboard/03_tune/_snippets/_monitoring_table_of_contents.md @@ -0,0 +1,3 @@ +| Page | Description | +|------|-------------| +| | | diff --git a/docs/cloud/onboard/03_tune/resource_tour.md b/docs/cloud/onboard/03_tune/resource_tour.md new file mode 100644 index 00000000000..7df3afb3b2f --- /dev/null +++ b/docs/cloud/onboard/03_tune/resource_tour.md @@ -0,0 +1,55 @@ +--- +slug: /cloud/get-started/cloud/resource-tour +title: 'Resource tour' +description: 'Overview of ClickHouse Cloud documentation resources for query optimization, scaling strategies, monitoring, and best practices' +keywords: ['clickhouse cloud'] +hide_title: true +--- + +import TableOfContentsBestPractices from '@site/docs/best-practices/_snippets/_table_of_contents.md'; +import TableOfContentsOptimizationAndPerformance from '@site/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md'; +import TableOfContentsSecurity from '@site/docs/cloud/_snippets/_security_table_of_contents.md'; + +# Resource tour + +This article is intended to provide you with an overview of the resources available +to you in the docs to learn how to get the most out of your ClickHouse Cloud deployment. 
+Explore resource organised by the following topics: + +- [Query optimization techniques and performance tuning](#query-optimization) +- [Scaling strategies and resource management](#scaling) +- [Monitoring](#monitoring) +- [Security best practices and compliance features](#security) +- [Cost optimization and billing](#cost-optimization) +- Troubleshooting common issues (coming soon) +- Production readiness checklist (coming soon) + +Before diving into more specific topics, we recommend you start with our general +ClickHouse best practice guides which cover general best practices to follow when +using ClickHouse: + + + +## Query optimization techniques and performance tuning {#query-optimization} + + + +## Scaling strategies and resource management {#scaling} + +## Monitoring {#monitoring} + +| Page | Description | +|-----------------------------------------------------------------|-------------------------------------------------------------------------------| +| [Advanced dashboard](/cloud/manage/monitor/advanced-dashboard) | Use the built in advanced dashboard to monitor service health and performance | +| [Prometheus integration](/integrations/prometheus) | Use Prometheus to monitor Cloud services | + +## Security {#security} + + + +## Cost optimization and billing {#cost-optimization} + +| Page | Description | +|-----------------------------------------------------|-----------------------------------------------------------------------------------------------------------| +| [Data transfer](/cloud/manage/network-data-transfer)| Understand how ClickHouse Cloud meters data transferred ingress and egress | +| [Notifications](/cloud/notifications) | Set up notifications for your ClickHouse Cloud service. For example, when credit usage passes a threshold | diff --git a/docs/cloud/onboard/index.md b/docs/cloud/onboard/index.md new file mode 100644 index 00000000000..742a9c87254 --- /dev/null +++ b/docs/cloud/onboard/index.md @@ -0,0 +1,45 @@ +--- +slug: /cloud/get-started +title: 'Get started with ClickHouse Cloud' +description: 'Complete guide to getting started with ClickHouse Cloud - from discovering features to deployment and optimization' +hide_title: true +--- + +# Get started with ClickHouse Cloud + +New to ClickHouse Cloud and not sure where to begin? In this section of the docs, +we'll walk you through everything you need to get up and running quickly. We've +arranged this getting started section into three subsections to help guide +you through each step of the process as you explore ClickHouse Cloud. + + + +## Discover ClickHouse Cloud {#discover-clickhouse-cloud} + +- [Learn](/cloud/overview) about what ClickHouse Cloud is, and how it differs from the open-source version +- [Discover](/cloud/get-started/cloud/use-cases/overview) the main use-cases of ClickHouse Cloud + +## Get set up with ClickHouse Cloud {#get-set-up-with-clickhouse-cloud} + +Now that you know what ClickHouse Cloud is, we'll walk you through the process +of getting your data into ClickHouse Cloud, show you the main features available +and point you towards some general best practices you should know. + +Topics include: + +- [Migration guides](/integrations/migration/overview) from various platforms + +## Tune your ClickHouse Cloud deployment {#evaluate-clickhouse-cloud} + +Now that your data is in ClickHouse Cloud, we'll walk you through some more advanced +topics to help you get the most out of your ClickHouse Cloud experience and explore +what the platform has to offer. 
+ +Topics include: + +- [Query performance and optimization](/cloud/get-started/cloud/resource-tour#query-optimization) +- [Monitoring](/cloud/get-started/cloud/resource-tour#monitoring) +- [Security considerations](/cloud/get-started/cloud/resource-tour#security) +- Troubleshooting tips + + \ No newline at end of file diff --git a/docs/cloud/reference/changelog.md b/docs/cloud/reference/01_changelog/01_changelog.md similarity index 99% rename from docs/cloud/reference/changelog.md rename to docs/cloud/reference/01_changelog/01_changelog.md index 7dff1c1c61a..9298f5d6bca 100644 --- a/docs/cloud/reference/changelog.md +++ b/docs/cloud/reference/01_changelog/01_changelog.md @@ -29,7 +29,7 @@ import share_queries from '@site/static/images/cloud/reference/may-30-share-quer import query_endpoints from '@site/static/images/cloud/reference/may-17-query-endpoints.png'; import dashboards from '@site/static/images/cloud/reference/may-30-dashboards.png'; -In addition to this ClickHouse Cloud changelog, please see the [Cloud Compatibility](/cloud/reference/cloud-compatibility.md) page. +In addition to this ClickHouse Cloud changelog, please see the [Cloud Compatibility](/whats-new/cloud-compatibility) page. ## August 13, 2025 {#august-13-2025} @@ -71,10 +71,10 @@ to get up and running. - New services now store database and table metadata in a central **SharedCatalog**, a new model for coordination and object lifecycles which enables: - - **Cloud-scale DDL**, even under high concurrency - - **Resilient deletion and new DDL operations** - - **Fast spin-up and wake-ups** as stateless nodes now launch with no disk dependencies - - **Stateless compute across both native and open formats**, including Iceberg and Delta Lake + - **Cloud-scale DDL**, even under high concurrency + - **Resilient deletion and new DDL operations** + - **Fast spin-up and wake-ups** as stateless nodes now launch with no disk dependencies + - **Stateless compute across both native and open formats**, including Iceberg and Delta Lake Read more about SharedCatalog in our [blog](https://clickhouse.com/blog/clickhouse-cloud-stateless-compute) @@ -169,7 +169,7 @@ to get up and running. ## April 4, 2025 {#april-4-2025} - Slack notifications for ClickHouse Cloud: ClickHouse Cloud now supports Slack notifications for billing, scaling, and ClickPipes events, in addition to in-console and email notifications. These notifications are sent via the ClickHouse Cloud Slack application. Organization admins can configure these notifications via the notification center by specifying slack channels to which notifications should be sent. -- Users running Production and Development services will now see ClickPipes and data transfer usage price on their bills. Please refer to the [announcement](/cloud/manage/jan-2025-faq/pricing-dimensions) from January 2025 for more details. +- Users running Production and Development services will now see ClickPipes and data transfer usage price on their bills. ## March 21, 2025 {#march-21-2025} @@ -252,7 +252,7 @@ We are adding a **new Enterprise tier** to serve the needs of the most demanding To support these changes, we are restructuring our current **Development** and **Production** tiers to more closely match how our evolving customer base is using our offerings. We are introducing the **Basic** tier, oriented toward users that are testing out new ideas and projects, and the **Scale** tier, matching users working with production workloads and data at scale. 
-You can read about these and other functional changes in this [blog](https://clickhouse.com/blog/evolution-of-clickhouse-cloud-new-features-superior-performance-tailored-offerings). Existing customers will need to take action to select a [new plan](https://clickhouse.com/pricing). Customer-facing communication was sent via email to organization administrators, and the following [FAQ](/cloud/manage/jan-2025-faq/summary) covers the key changes and timelines. +You can read about these and other functional changes in this [blog](https://clickhouse.com/blog/evolution-of-clickhouse-cloud-new-features-superior-performance-tailored-offerings). Existing customers will need to take action to select a [new plan](https://clickhouse.com/pricing). Customer-facing communication was sent via email to organization administrators. ### Warehouses: Compute-compute separation (GA) {#warehouses-compute-compute-separation-ga} @@ -280,7 +280,7 @@ Safe managed upgrades deliver significant value to our users by allowing them to ### HIPAA support {#hipaa-support} -We now support HIPAA in compliant regions, including AWS `us-east-1`, `us-west-2` and GCP `us-central1`, `us-east1`. Customers wishing to onboard must sign a Business Associate Agreement (BAA) and deploy to the compliant version of the region. For more information on HIPAA, please refer to the [documentation](/cloud/security/security-and-compliance). +We now support HIPAA in compliant regions, including AWS `us-east-1`, `us-west-2` and GCP `us-central1`, `us-east1`. Customers wishing to onboard must sign a Business Associate Agreement (BAA) and deploy to the compliant version of the region. For more information on HIPAA, please refer to the [documentation](/cloud/security/compliance-overview). ### Scheduled upgrades {#scheduled-upgrades} @@ -775,12 +775,12 @@ This release upgrades the core database version, adds ability to set up private ### Integrations changes {#integrations-changes-4} * Kafka Connect - * Support async_insert for exactly once (disabled by default) + * Support async_insert for exactly once (disabled by default) * Golang client - * Fixed DateTime binding - * Improved batch insert performance + * Fixed DateTime binding + * Improved batch insert performance * Java client - * Fixed request compression problem + * Fixed request compression problem ### Settings changes {#settings-changes} * `use_mysql_types_in_show_columns` is no longer required. It will be automatically enabled when you connect through the MySQL interface. @@ -1428,7 +1428,7 @@ This release enables dictionaries from local ClickHouse table and HTTP sources, ### General changes {#general-changes-5} - Added support for [dictionaries](/sql-reference/dictionaries/index.md) from local ClickHouse table and HTTP sources -- Introduced support for the Mumbai [region](/cloud/reference/supported-regions.md) +- Introduced support for the Mumbai [region](/cloud/reference/supported-regions) ### Console changes {#console-changes-30} @@ -1496,4 +1496,4 @@ This release significantly lowers compute consumption for small workloads, lower ClickHouse Cloud began its public Beta on October 4th, 2022. Learn more in this [blog](https://clickhouse.com/blog/clickhouse-cloud-public-beta). -The ClickHouse Cloud version is based on ClickHouse core v22.10. For a list of compatible features, refer to the [Cloud Compatibility](/cloud/reference/cloud-compatibility.md) guide. +The ClickHouse Cloud version is based on ClickHouse core v22.10. 
For a list of compatible features, refer to the [Cloud Compatibility](/whats-new/cloud-compatibility) guide. diff --git a/docs/cloud/changelogs/24_02.md b/docs/cloud/reference/01_changelog/02_release_notes/24_02.md similarity index 100% rename from docs/cloud/changelogs/24_02.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_02.md diff --git a/docs/cloud/changelogs/24_05.md b/docs/cloud/reference/01_changelog/02_release_notes/24_05.md similarity index 100% rename from docs/cloud/changelogs/24_05.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_05.md diff --git a/docs/cloud/changelogs/24_06.md b/docs/cloud/reference/01_changelog/02_release_notes/24_06.md similarity index 100% rename from docs/cloud/changelogs/24_06.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_06.md diff --git a/docs/cloud/changelogs/24_08.md b/docs/cloud/reference/01_changelog/02_release_notes/24_08.md similarity index 100% rename from docs/cloud/changelogs/24_08.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_08.md diff --git a/docs/cloud/changelogs/24_10.md b/docs/cloud/reference/01_changelog/02_release_notes/24_10.md similarity index 100% rename from docs/cloud/changelogs/24_10.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_10.md diff --git a/docs/cloud/changelogs/24_12.md b/docs/cloud/reference/01_changelog/02_release_notes/24_12.md similarity index 100% rename from docs/cloud/changelogs/24_12.md rename to docs/cloud/reference/01_changelog/02_release_notes/24_12.md diff --git a/docs/cloud/changelogs/25_04.md b/docs/cloud/reference/01_changelog/02_release_notes/25_04.md similarity index 100% rename from docs/cloud/changelogs/25_04.md rename to docs/cloud/reference/01_changelog/02_release_notes/25_04.md diff --git a/docs/cloud/changelogs/25_06.md b/docs/cloud/reference/01_changelog/02_release_notes/25_06.md similarity index 100% rename from docs/cloud/changelogs/25_06.md rename to docs/cloud/reference/01_changelog/02_release_notes/25_06.md diff --git a/docs/cloud/reference/01_changelog/02_release_notes/_category_.json b/docs/cloud/reference/01_changelog/02_release_notes/_category_.json new file mode 100644 index 00000000000..4eeae460788 --- /dev/null +++ b/docs/cloud/reference/01_changelog/02_release_notes/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Release notes", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/changelog/release_notes/index" } +} \ No newline at end of file diff --git a/docs/cloud/reference/01_changelog/02_release_notes/index.md b/docs/cloud/reference/01_changelog/02_release_notes/index.md new file mode 100644 index 00000000000..ab87960c306 --- /dev/null +++ b/docs/cloud/reference/01_changelog/02_release_notes/index.md @@ -0,0 +1,24 @@ +--- +slug: /cloud/reference/changelogs/release-notes +title: 'Cloud Release Notes' +description: 'Landing page for Cloud release notes' +--- + + + + +| Page | Description | +|-----|-----| +| [v25.6 Changelog for Cloud](/changelogs/25.6) | Fast release changelog for v25.6 | +| [v25.4 Changelog for Cloud](/changelogs/25.4) | Fast release changelog for v25.4 | +| [v24.12 Changelog for Cloud](/changelogs/24.12) | Fast release changelog for v24.12 | +| [v24.10 Changelog for Cloud](/changelogs/24.10) | Fast release changelog for v24.10 | +| [v24.8 Changelog for Cloud](/changelogs/24.8) | Fast release changelog for v24.8 | +| [v24.6 Changelog for Cloud](/changelogs/24.6) | Fast release changelog for v24.6 | +| [v24.5 Changelog for 
Cloud](/changelogs/24.5) | Fast release changelog for v24.5 | +| [v24.2 Changelog](/whats-new/changelog/24.2-fast-release) | Fast release changelog for v24.2 | +| [Cloud Changelog](/whats-new/cloud) | ClickHouse Cloud changelog providing descriptions of what is new in each ClickHouse Cloud release | + diff --git a/docs/cloud/reference/01_changelog/_category_.json b/docs/cloud/reference/01_changelog/_category_.json new file mode 100644 index 00000000000..60a9e95ee7e --- /dev/null +++ b/docs/cloud/reference/01_changelog/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Change logs", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/changelog/index" } +} \ No newline at end of file diff --git a/docs/cloud/reference/changelogs-index.md b/docs/cloud/reference/01_changelog/index.md similarity index 91% rename from docs/cloud/reference/changelogs-index.md rename to docs/cloud/reference/01_changelog/index.md index c23e70f4ea2..cfdb11087f8 100644 --- a/docs/cloud/reference/changelogs-index.md +++ b/docs/cloud/reference/01_changelog/index.md @@ -7,4 +7,4 @@ description: 'Landing page for Cloud changelogs' | Page | Description | |---------------------------------------------------------------|-------------------------------------------------| | [Cloud Changelog](/whats-new/cloud) | Changelog for ClickHouse Cloud | -| [Release Notes](/cloud/reference/changelogs/release-notes) | Release notes for all ClickHouse Cloud releases | +| [Release Notes](/cloud/reference/changelogs/release-notes) | Release notes for all ClickHouse Cloud releases | \ No newline at end of file diff --git a/docs/cloud/reference/architecture.md b/docs/cloud/reference/02_architecture.md similarity index 98% rename from docs/cloud/reference/architecture.md rename to docs/cloud/reference/02_architecture.md index 9c3d7cf5f56..6e3294d3a97 100644 --- a/docs/cloud/reference/architecture.md +++ b/docs/cloud/reference/02_architecture.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Architecture' slug: /cloud/reference/architecture -title: 'ClickHouse Cloud Architecture' +title: 'ClickHouse Cloud architecture' description: 'This page describes the architecture of ClickHouse Cloud' --- diff --git a/docs/cloud/manage/billing.md b/docs/cloud/reference/03_billing/01_billing_overview.md similarity index 98% rename from docs/cloud/manage/billing.md rename to docs/cloud/reference/03_billing/01_billing_overview.md index 3745df1d2aa..cdbe6c40355 100644 --- a/docs/cloud/manage/billing.md +++ b/docs/cloud/reference/03_billing/01_billing_overview.md @@ -5,7 +5,7 @@ title: 'Pricing' description: 'Overview page for ClickHouse Cloud pricing' --- -import ClickPipesFAQ from './jan2025_faq/_snippets/_clickpipes_faq.md' +import ClickPipesFAQ from '../../_snippets/_clickpipes_faq.md' For pricing information, see the [ClickHouse Cloud Pricing](https://clickhouse.com/pricing#pricing-calculator) page. ClickHouse Cloud bills based on the usage of compute, storage, [data transfer](/cloud/manage/network-data-transfer) (egress over the internet and cross-region), and [ClickPipes](/integrations/clickpipes). @@ -15,7 +15,7 @@ To understand what can affect your bill, and ways that you can manage your spend :::note - Prices reflect AWS us-east-1 pricing. -- Explore applicable data transfer and ClickPipes charges [here](jan2025_faq/dimensions.md). +- Explore applicable data transfer and ClickPipes charges [here](/cloud/manage/network-data-transfer). 
::: ### Basic: from $66.52 per month {#basic-from-6652-per-month} @@ -191,7 +191,7 @@ Storage costs are the same across tiers and vary by region and cloud service pro Storage and backups are counted towards storage costs and billed separately. All services will default to one backup, retained for a day. -Users who need additional backups can do so by configuring additional [backups](backups/overview.md) under the settings tab of the Cloud console. +Users who need additional backups can do so by configuring additional [backups](/cloud/manage/backups/overview) under the settings tab of the Cloud console. ### How do I estimate compression? {#how-do-i-estimate-compression} @@ -287,7 +287,7 @@ which are metered in the same way and billed accordingly. When creating a service in addition to an existing service, you can choose if this new service should share the same data with the existing one. -If yes, these two services now form a [warehouse](../reference/warehouses.md). +If yes, these two services now form a [warehouse](/cloud/reference/warehouses). A warehouse has the data stored in it with multiple compute services accessing this data. As the data is stored only once, you only pay for one copy of data, though multiple services are accessing it. diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/aws-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/aws-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/aws-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/azure-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/azure-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/azure-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md b/docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-committed.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/gcp-marketplace-committed.md rename to docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-committed.md diff --git a/docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md b/docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-payg.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/gcp-marketplace-payg.md rename to docs/cloud/reference/03_billing/02_marketplace/gcp-marketplace-payg.md diff --git a/docs/cloud/manage/billing/marketplace/index.md b/docs/cloud/reference/03_billing/02_marketplace/index.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/index.md rename to docs/cloud/reference/03_billing/02_marketplace/index.md 
diff --git a/docs/cloud/manage/billing/marketplace/overview.md b/docs/cloud/reference/03_billing/02_marketplace/overview.md similarity index 100% rename from docs/cloud/manage/billing/marketplace/overview.md rename to docs/cloud/reference/03_billing/02_marketplace/overview.md diff --git a/docs/cloud/manage/billing/payment-thresholds.md b/docs/cloud/reference/03_billing/03_payment-thresholds.md similarity index 97% rename from docs/cloud/manage/billing/payment-thresholds.md rename to docs/cloud/reference/03_billing/03_payment-thresholds.md index 0c2b6948d0e..2d9ce5f188a 100644 --- a/docs/cloud/manage/billing/payment-thresholds.md +++ b/docs/cloud/reference/03_billing/03_payment-thresholds.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Payment Thresholds' slug: /cloud/billing/payment-thresholds -title: 'Payment Thresholds' +title: 'Payment thresholds' description: 'Payment thresholds and automatic invoicing for ClickHouse Cloud.' keywords: ['billing', 'payment thresholds', 'automatic invoicing', 'invoice'] --- diff --git a/docs/cloud/reference/03_billing/04_network-data-transfer.mdx b/docs/cloud/reference/03_billing/04_network-data-transfer.mdx new file mode 100644 index 00000000000..4013e1477b7 --- /dev/null +++ b/docs/cloud/reference/03_billing/04_network-data-transfer.mdx @@ -0,0 +1,56 @@ +--- +sidebar_label: 'Data Transfer' +slug: /cloud/manage/network-data-transfer +title: 'Data Transfer' +description: 'Understand how ClickHouse Cloud meters data transferred ingress and egress' +--- + +import NetworkPricing from '@site/docs/cloud/reference/_snippets/_network_transfer_rates.md'; + +ClickHouse Cloud meters data transferred ingress and egress. +This includes any data in and out of ClickHouse Cloud as well as any intra-region +and cross-region data transfer. This usage is tracked at the service level. Based +on this usage, customers incur data transfer charges that are then added to their +monthly bill. + +ClickHouse Cloud charges for: +- Data egress from ClickHouse Cloud to the public Internet, including to other +regions of other cloud providers. +- Data egress to another region in the same cloud provider. + +There are no charges for intra-region data transfer or Private Link/Private +Service Connect use and data transfer.However, we reserve the right to implement +additional data transfer pricing dimensions if we see usage patterns that impact +our ability to charge users appropriately. + +Data transfer charges vary by Cloud Service Provider (CSP) and region. +Public internet egress pricing is based only on the origin region. +Inter-region (or cross-region) pricing depends on both the origin and destination +regions. + +**Best Practices to minimize Data Transfer Costs** + +There are some patterns to keep in mind when ingressing and egressing data to +minimize data transfer costs. + +1. When ingressing or egressing data from Clickhouse Cloud, use compression where +possible, to minimize the amount of data transferred and the associated cost. + +2. Be aware that when doing an INSERT over the native protocol with non-inlined +values (e.g. `INSERT INTO [TABLE] FROM INFILE [FILE] FORMAT NATIVE`), ClickHouse +clients pull metadata from servers to pack the data. If the metadata is larger +than the `INSERT` payload, you might counterintuitively see more egress than +there is ingress from the server perspective. If this is unacceptable, consider +inlining data with `VALUES` syntax or using the HTTP protocol. 
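+
+As a minimal sketch of the second point above (the table and values are
+hypothetical), inlining the rows in the statement itself avoids the metadata
+round trip described above:
+
+```sql
+-- Inlined VALUES: the client does not need to pull table metadata to pack the data
+INSERT INTO db1.events (id, message) VALUES
+    (1, 'first event'),
+    (2, 'second event');
+```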
+ +The tables below shows how data transfer charges for egress vary across public +internet or cross-region by cloud provider and region. + +:::note +ClickHouse Cloud meters inter-region usage in terms of tiers, Tier 1 through +Tier 4, depending on the origin and destination regions. The table below shows +the tier for each combination of inter-region data transfer. In the Billing usage +screen on ClickHouse Cloud you will see data transfer usage broken out by tiers. +::: + + diff --git a/docs/cloud/manage/troubleshooting-billing-issues.md b/docs/cloud/reference/03_billing/05_billing_compliance.md similarity index 100% rename from docs/cloud/manage/troubleshooting-billing-issues.md rename to docs/cloud/reference/03_billing/05_billing_compliance.md diff --git a/docs/cloud/manage/billing/index.md b/docs/cloud/reference/03_billing/index.md similarity index 87% rename from docs/cloud/manage/billing/index.md rename to docs/cloud/reference/03_billing/index.md index f940c75a034..1a47fc98417 100644 --- a/docs/cloud/manage/billing/index.md +++ b/docs/cloud/reference/03_billing/index.md @@ -12,4 +12,4 @@ This section of the documentation covers topics related to billing, and contains | [Overview](/cloud/marketplace/marketplace-billing) | Overview and FAQ pages for marketplace billing. | | [Payment Thresholds](/cloud/billing/payment-thresholds) | Learn more about how payment thresholds work and how to adjust them. | | [Troubleshooting Billing Issues](/manage/clickhouse-cloud-billing-compliance) | Troubleshoot common billing issues. | -| [Marketplace](/cloud/manage/) | Landing page for further marketplace related topics. | +| [Marketplace](/cloud/manage/marketplace/) | Landing page for further marketplace related topics. | diff --git a/docs/cloud/reference/supported-regions.md b/docs/cloud/reference/05_supported-regions.md similarity index 98% rename from docs/cloud/reference/supported-regions.md rename to docs/cloud/reference/05_supported-regions.md index f434b8786e1..4086227f4ab 100644 --- a/docs/cloud/reference/supported-regions.md +++ b/docs/cloud/reference/05_supported-regions.md @@ -1,6 +1,6 @@ --- title: 'Supported Cloud Regions' -sidebar_label: 'Supported Cloud Regions' +sidebar_label: 'Supported Cloud regions' keywords: ['aws', 'gcp', 'google cloud', 'azure', 'cloud', 'regions'] description: 'Supported regions for ClickHouse Cloud' slug: /cloud/reference/supported-regions diff --git a/docs/cloud/manage/service-uptime.md b/docs/cloud/reference/06_service-uptime.md similarity index 95% rename from docs/cloud/manage/service-uptime.md rename to docs/cloud/reference/06_service-uptime.md index 3a31e459eaf..33397a626be 100644 --- a/docs/cloud/manage/service-uptime.md +++ b/docs/cloud/reference/06_service-uptime.md @@ -1,7 +1,7 @@ --- sidebar_label: 'Service Uptime and SLA' slug: /cloud/manage/service-uptime -title: 'Service Uptime' +title: 'Service uptime' description: 'Users can now see regional uptimes on the status page and subscribe to alerts on service disruptions.' 
--- diff --git a/docs/cloud/manage/settings.md b/docs/cloud/reference/08_settings.md similarity index 94% rename from docs/cloud/manage/settings.md rename to docs/cloud/reference/08_settings.md index a766ef59c13..9926c5833cb 100644 --- a/docs/cloud/manage/settings.md +++ b/docs/cloud/reference/08_settings.md @@ -1,7 +1,7 @@ --- -sidebar_label: 'Configuring Settings' +sidebar_label: 'Configuring settings' slug: /manage/settings -title: 'Configuring Settings' +title: 'Configuring settings' description: 'How to configure settings for your ClickHouse Cloud service for a specific user or role' --- diff --git a/docs/cloud/reference/09_security/_category_.json b/docs/cloud/reference/09_security/_category_.json new file mode 100644 index 00000000000..aed26fa7f7a --- /dev/null +++ b/docs/cloud/reference/09_security/_category_.json @@ -0,0 +1,5 @@ +{ + "label": "Security", + "collapsible": true, + "collapsed": true, +} \ No newline at end of file diff --git a/docs/cloud/security/audit-logging.md b/docs/cloud/reference/09_security/audit-logging.md similarity index 100% rename from docs/cloud/security/audit-logging.md rename to docs/cloud/reference/09_security/audit-logging.md diff --git a/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json b/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json new file mode 100644 index 00000000000..99beeb3e924 --- /dev/null +++ b/docs/cloud/reference/09_security/privacy_and_compliance/_category_.json @@ -0,0 +1,6 @@ +{ + "label": "Privacy and compliance", + "collapsible": true, + "collapsed": true, + "link": { "type": "doc", "id": "cloud/reference/security/privacy_and_compliance/index" } +} \ No newline at end of file diff --git a/docs/cloud/security/compliance-overview.md b/docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md similarity index 96% rename from docs/cloud/security/compliance-overview.md rename to docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md index 4653c0f09c1..de8b7a9b90e 100644 --- a/docs/cloud/security/compliance-overview.md +++ b/docs/cloud/reference/09_security/privacy_and_compliance/compliance-overview.md @@ -1,8 +1,7 @@ --- -sidebar_label: 'Security and Compliance' -slug: /cloud/security/security-and-compliance -title: 'Security and Compliance' -description: 'This page describes the security and compliance measures implemented by ClickHouse Cloud to protect customer data.' +title: 'Security and compliance reports' +slug: /cloud/security/compliance-overview +description: 'Overview of ClickHouse Cloud security and compliance certifications including SOC 2, ISO 27001, U.S. DPF, and HIPAA' --- import BetaBadge from '@theme/badges/BetaBadge'; diff --git a/docs/cloud/security/privacy-compliance-overview.md b/docs/cloud/reference/09_security/privacy_and_compliance/index.md similarity index 83% rename from docs/cloud/security/privacy-compliance-overview.md rename to docs/cloud/reference/09_security/privacy_and_compliance/index.md index e47d422c0a8..9b0b8594fc8 100644 --- a/docs/cloud/security/privacy-compliance-overview.md +++ b/docs/cloud/reference/09_security/privacy_and_compliance/index.md @@ -11,5 +11,5 @@ This section contains the following pages: | Page | Description | |----------------------------------------------------------------------------|--------------------------------------------------------------| -| [Security and Compliance](/cloud/security/security-and-compliance) | Security reports and privacy compliance of ClickHouse Cloud. 
| +| [Security and Compliance](/cloud/security/compliance-overview) | Security reports and privacy compliance of ClickHouse Cloud. | | [Personal Data Access](/cloud/security/personal-data-access) | Information on how to access your personal data. | diff --git a/docs/cloud/security/personal-data-access.md b/docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md similarity index 98% rename from docs/cloud/security/personal-data-access.md rename to docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md index bcf4514b301..3bdc8ca3302 100644 --- a/docs/cloud/security/personal-data-access.md +++ b/docs/cloud/reference/09_security/privacy_and_compliance/personal-data-access.md @@ -1,7 +1,7 @@ --- -sidebar_label: 'Personal Data Access' +sidebar_label: 'Personal data access' slug: /cloud/security/personal-data-access -title: 'Personal Data Access' +title: 'Personal data access' description: 'As a registered user, ClickHouse allows you to view and manage your personal account data, including contact information.' --- diff --git a/docs/cloud/manage/account-close.md b/docs/cloud/reference/10_account-close.md similarity index 98% rename from docs/cloud/manage/account-close.md rename to docs/cloud/reference/10_account-close.md index ac9a79eeeea..021345d4a94 100644 --- a/docs/cloud/manage/account-close.md +++ b/docs/cloud/reference/10_account-close.md @@ -1,11 +1,12 @@ --- -sidebar_label: 'Delete Account' +sidebar_label: 'Account closure' slug: /cloud/manage/close_account -title: 'Account Close & Deletion' +title: 'Account closure and deletion' description: 'We know there are circumstances that sometimes necessitate account closure. This guide will help you through the process.' --- ## Account closure and deletion {#account-close--deletion} + Our goal is to help you be successful in your project. If you have questions that are not answered on this site or need help evaluating a unique use case, please contact us at [support@clickhouse.com](mailto:support@clickhouse.com). 
diff --git a/docs/cloud/manage/_snippets/_network_transfer_rates.md b/docs/cloud/reference/_snippets/_network_transfer_rates.md similarity index 100% rename from docs/cloud/manage/_snippets/_network_transfer_rates.md rename to docs/cloud/reference/_snippets/_network_transfer_rates.md diff --git a/docs/cloud/reference/release-notes-index.md b/docs/cloud/reference/release-notes-index.md deleted file mode 100644 index c7e32f843b5..00000000000 --- a/docs/cloud/reference/release-notes-index.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -slug: /cloud/reference/changelogs/release-notes -title: 'Cloud Release Notes' -description: 'Landing page for Cloud release notes' ---- - - - - - diff --git a/docs/cloud/security/_category_.yml b/docs/cloud/security/_category_.yml deleted file mode 100644 index b7253753fd5..00000000000 --- a/docs/cloud/security/_category_.yml +++ /dev/null @@ -1,6 +0,0 @@ -label: 'Cloud Security' -collapsible: true -collapsed: true -link: - type: generated-index - title: Cloud Security diff --git a/docs/cloud/security/index.md b/docs/cloud/security/index.md deleted file mode 100644 index b6a2d56ab1b..00000000000 --- a/docs/cloud/security/index.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -slug: /cloud/security -keywords: ['Cloud', 'Security'] -title: 'Overview' -hide_title: true -description: 'Landing page for ClickHouse Cloud Security' ---- - -# ClickHouse Cloud security - -This section delves into security in ClickHouse Cloud and contains the following pages: - -| Page | Description | -|---------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Shared Responsibility Model](shared-responsibility-model.md) | Information on the security features offered for each service type. | -| [Cloud Access Management](cloud-access-management/index.md) | Information on access control, authentication, SSO setup, common access management queries and how to invite new users. | -| [Connectivity](connectivity-overview.md) | Information on setting IP filters, private networking, secure access of S3 data and Cloud IP addresses. | -| [Enhanced Encryption](cmek.md) | Data at rest is encrypted by default using cloud provider-managed AES 256 keys. Customers may enable Transparent Data Encryption (TDE) to provide an additional layer of protection for service data. | -| [Audit Logging](audit-logging.md) | A guide to audit logging in ClickHouse Cloud. | -| [Privacy and Compliance](privacy-compliance-overview.md) | Information on security and compliance of ClickHouse Cloud, a guide on how to view and correct your personal information. | diff --git a/docs/getting-started/index.md b/docs/getting-started/index.md index ca8b2ee1c18..90d6ae54ca0 100644 --- a/docs/getting-started/index.md +++ b/docs/getting-started/index.md @@ -23,4 +23,31 @@ functions in ClickHouse. 
The sample datasets include: by https://github.com/ClickHouse/clickhouse-docs/blob/main/scripts/autogenerate-table-of-contents.sh --> +| Page | Description | +|-----|-----| +| [NOAA Global Historical Climatology Network](/getting-started/example-datasets/noaa) | 2.5 billion rows of climate data for the last 120 yrs | +| [Writing Queries in ClickHouse using GitHub Data](/getting-started/example-datasets/github) | Dataset containing all of the commits and changes for the ClickHouse repository | +| [Analyzing Stack Overflow data with ClickHouse](/getting-started/example-datasets/stackoverflow) | Analyzing Stack Overflow data with ClickHouse | +| [The UK property prices dataset](/getting-started/example-datasets/uk-price-paid) | Learn how to use projections to improve the performance of queries that you run frequently using the UK property dataset, which contains data about prices paid for real-estate property in England and Wales | +| [Taiwan Historical Weather Datasets](/getting-started/example-datasets/tw-weather) | 131 million rows of weather observation data for the last 128 yrs | +| [New York Taxi Data](/getting-started/example-datasets/nyc-taxi) | Data for billions of taxi and for-hire vehicle (Uber, Lyft, etc.) trips originating in New York City since 2009 | +| [Geo Data using the Cell Tower Dataset](/getting-started/example-datasets/cell-towers) | Learn how to load OpenCelliD data into ClickHouse, connect Apache Superset to ClickHouse and build a dashboard based on data | +| [Amazon Customer Review](/getting-started/example-datasets/amazon-reviews) | Over 150M customer reviews of Amazon products | +| [AMPLab Big Data Benchmark](/getting-started/example-datasets/amplab-benchmark) | A benchmark dataset used for comparing the performance of data warehousing solutions. | +| [Anonymized Web Analytics](/getting-started/example-datasets/metrica) | Dataset consisting of two tables containing anonymized web analytics data with hits and visits | +| [Brown University Benchmark](/getting-started/example-datasets/brown-benchmark) | A new analytical benchmark for machine-generated log data | +| [COVID-19 Open-Data](/getting-started/example-datasets/covid19) | COVID-19 Open-Data is a large, open-source database of COVID-19 epidemiological data and related factors like demographics, economics, and government responses | +| [Environmental Sensors Data](/getting-started/example-datasets/environmental-sensors) | Over 20 billion records of data from Sensor.Community, a contributors-driven global sensor network that creates Open Environmental Data. | +| [Foursquare places](/getting-started/example-datasets/foursquare-places) | Dataset with over 100 million records containing information about places on a map, such as shops, restaurants, parks, playgrounds, and monuments. | +| [GitHub Events Dataset](/getting-started/example-datasets/github-events) | Dataset containing all events on GitHub from 2011 to Dec 6 2020, with a size of 3.1 billion records. | +| [Laion-400M dataset](/getting-started/example-datasets/laion-400m-dataset) | Dataset containing 400 million images with English image captions | +| [New York Public Library "What's on the Menu?" Dataset](/getting-started/example-datasets/menus) | Dataset containing 1.3 million records of historical data on the menus of hotels, restaurants and cafes with the dishes along with their prices. 
|
+| [NYPD Complaint Data](/getting-started/example-datasets/nypd_complaint_data) | Ingest and query Tab Separated Value data in 5 steps |
+| [OnTime](/getting-started/example-datasets/ontime) | Dataset containing the on-time performance of airline flights |
+| [Star Schema Benchmark (SSB, 2009)](/getting-started/example-datasets/star-schema) | The Star Schema Benchmark (SSB) data set and queries |
+| [Terabyte Click Logs from Criteo](/getting-started/example-datasets/criteo) | A terabyte of Click Logs from Criteo |
+| [TPC-DS (2012)](/getting-started/example-datasets/tpcds) | The TPC-DS benchmark data set and queries. |
+| [TPC-H (1999)](/getting-started/example-datasets/tpch) | The TPC-H benchmark data set and queries. |
+| [WikiStat](/getting-started/example-datasets/wikistat) | Explore the WikiStat dataset containing 0.5 trillion records. |
+| [YouTube dataset of dislikes](/getting-started/example-datasets/youtube-dislikes) | A collection of dislikes of YouTube videos. |
diff --git a/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md b/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md
new file mode 100644
index 00000000000..3cbd396bb55
--- /dev/null
+++ b/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md
@@ -0,0 +1,17 @@
+| Topic | Description |
+|----------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [Query optimization guide](/optimize/query-optimization) | Start here for query optimization fundamentals, covering common scenarios and performance techniques to improve query execution speed. |
+| [Primary indexes advanced guide](/guides/best-practices/sparse-primary-indexes) | Deep dive into ClickHouse's unique sparse primary indexing system, how it differs from traditional databases, and best practices for optimal indexing strategies. |
+| [Query parallelism](/optimize/query-parallelism) | Learn how ClickHouse parallelizes query execution using processing lanes and `max_threads` settings, including how to inspect and optimize parallel execution. |
+| [Partitioning key](/optimize/partitioning-key) | Master partition key selection to dramatically improve query performance by enabling efficient data segment pruning and avoiding common partitioning pitfalls. |
+| [Data skipping indexes](/optimize/skipping-indexes) | Apply secondary indexes strategically to skip irrelevant data blocks and accelerate filtered queries on non-primary key columns. |
+| [`PREWHERE` optimization](/optimize/prewhere) | Understand how `PREWHERE` automatically reduces I/O by filtering data before reading unnecessary columns, plus how to monitor its effectiveness. |
+| [Bulk inserts](/optimize/bulk-inserts) | Maximize ingestion throughput and reduce resource overhead by batching data insertions effectively. |
+| [Asynchronous inserts](/optimize/asynchronous-inserts) | Improve insert performance by leveraging server-side batching to reduce client-side complexity and increase throughput for high-frequency insertions. |
+| [Avoid mutations](/optimize/avoid-mutations) | Design append-only workflows that eliminate costly `UPDATE` and `DELETE` operations while maintaining data accuracy and performance.
| +| [Avoid nullable columns](/optimize/avoid-nullable-columns) | Reduce storage overhead and improve query performance by using default values instead of nullable columns where possible. | +| [Avoid `OPTIMIZE FINAL`](/optimize/avoidoptimizefinal) | Understand when you should and should not use `OPTIMIZE TABLE FINAL` | +| [Analyzer](/operations/analyzer) | Leverage ClickHouse's new query analyzer to identify performance bottlenecks and optimize query execution plans for better efficiency. | +| [Query profiling](/operations/optimizing-performance/sampling-query-profiler) | Use the sampling query profiler to analyze query execution patterns, identify performance hot spots, and optimize resource usage. | +| [Query cache](/operations/query-cache) | Accelerate frequently executed `SELECT` queries by enabling and configuring ClickHouse's built-in query result caching. | +| [Testing hardware](/operations/performance-test) | Run ClickHouse performance benchmarks on any server without installation to evaluate hardware capabilities. (Not applicable to ClickHouse Cloud) | \ No newline at end of file diff --git a/docs/guides/best-practices/index.md b/docs/guides/best-practices/index.md index 0c52281492f..ef320eaf03c 100644 --- a/docs/guides/best-practices/index.md +++ b/docs/guides/best-practices/index.md @@ -5,26 +5,12 @@ description: 'Overview page of Performance and Optimizations' title: 'Performance and Optimizations' --- -# Performance and optimizations +import TableOfContents from '@site/docs/guides/best-practices/_snippets/_performance_optimizations_table_of_contents.md'; + +# Performance and Optimizations This section contains tips and best practices for improving performance with ClickHouse. We recommend users read [Core Concepts](/parts) as a precursor to this section, which covers the main concepts required to improve performance. -| Topic | Description | -|---------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| [Query Optimization Guide](/optimize/query-optimization) | A good place to start for query optimization, this simple guide describes common scenarios of how to use different performance and optimization techniques to improve query performance. | -| [Primary Indexes Advanced Guide](/guides/best-practices/sparse-primary-indexes) | A deep dive into ClickHouse indexing including how it differs from other DB systems, how ClickHouse builds and uses a table's spare primary index and what some of the best practices are for indexing in ClickHouse. | -| [Query Parallelism](/optimize/query-parallelism) | Explains how ClickHouse parallelizes query execution using processing lanes and the max_threads setting. Covers how data is distributed across lanes, how max_threads is applied, when it isn't fully used, and how to inspect execution with tools like EXPLAIN and trace logs. | -| [Partitioning Key](/optimize/partitioning-key) | Delves into ClickHouse partition key optimization. Explains how choosing the right partition key can significantly improve query performance by allowing ClickHouse to quickly locate relevant data segments. 
Covers best practices for selecting efficient partition keys and potential pitfalls to avoid. | -| [Data Skipping Indexes](/optimize/skipping-indexes) | Explains data skipping indexes as a way to optimize performance. | -| [PREWHERE Optimization](/optimize/prewhere) | Explains how PREWHERE reduces I/O by avoiding reading unnecessary column data. Shows how it's applied automatically, how the filtering order is chosen, and how to monitor it using EXPLAIN and logs. | -| [Bulk Inserts](/optimize/bulk-inserts) | Explains the benefits of using bulk inserts in ClickHouse. | -| [Asynchronous Inserts](/optimize/asynchronous-inserts) | Focuses on ClickHouse's asynchronous inserts feature. It likely explains how asynchronous inserts work (batching data on the server for efficient insertion) and their benefits (improved performance by offloading insert processing). It might also cover enabling asynchronous inserts and considerations for using them effectively in your ClickHouse environment. | -| [Avoid Mutations](/optimize/avoid-mutations) | Discusses the importance of avoiding mutations (updates and deletes) in ClickHouse. It recommends using append-only inserts for optimal performance and suggests alternative approaches for handling data changes. | -| [Avoid nullable columns](/optimize/avoid-nullable-columns) | Discusses why you may want to avoid nullable columns to save space and increase performance. Demonstrates how to set a default value for a column. | -| [Avoid `OPTIMIZE FINAL`](/optimize/avoidoptimizefinal) | Explains how the `OPTIMIZE TABLE ... FINAL` query is resource-intensive and suggests alternative approaches to optimize ClickHouse performance. | -| [Analyzer](/operations/analyzer) | Looks at the ClickHouse Analyzer, a tool for analyzing and optimizing queries. Discusses how the Analyzer works, its benefits (e.g., identifying performance bottlenecks), and how to use it to improve your ClickHouse queries' efficiency. | -| [Query Profiling](/operations/optimizing-performance/sampling-query-profiler) | Explains ClickHouse's Sampling Query Profiler, a tool that helps analyze query execution. | -| [Query Cache](/operations/query-cache) | Details ClickHouse's Query Cache, a feature that aims to improve performance by caching the results of frequently executed `SELECT` queries. | -| [Testing Hardware](/operations/performance-test) | How to run a basic ClickHouse performance test on any server without installation of ClickHouse packages. (Not applicable to ClickHouse Cloud) | + \ No newline at end of file diff --git a/docs/integrations/data-ingestion/clickpipes/index.md b/docs/integrations/data-ingestion/clickpipes/index.md index 45526f155cb..852d0363e5f 100644 --- a/docs/integrations/data-ingestion/clickpipes/index.md +++ b/docs/integrations/data-ingestion/clickpipes/index.md @@ -101,7 +101,7 @@ If ClickPipes cannot connect to a data source after 15 min or to a destination a - **Does using ClickPipes incur an additional cost?** - ClickPipes is billed on two dimensions: Ingested Data and Compute. The full details of the pricing are available on [this page](/cloud/manage/jan-2025-faq/pricing-dimensions#clickpipes-pricing-faq). Running ClickPipes might also generate an indirect compute and storage cost on the destination ClickHouse Cloud service similar to any ingest workload. + ClickPipes is billed on two dimensions: Ingested Data and Compute. Running ClickPipes might also generate an indirect compute and storage cost on the destination ClickHouse Cloud service similar to any ingest workload. 
- **Is there a way to handle errors or failures when using ClickPipes for Kafka?** diff --git a/docs/integrations/data-ingestion/clickpipes/kafka/index.md b/docs/integrations/data-ingestion/clickpipes/kafka/index.md index 830168afc93..fd4a2833514 100644 --- a/docs/integrations/data-ingestion/clickpipes/kafka/index.md +++ b/docs/integrations/data-ingestion/clickpipes/kafka/index.md @@ -6,4 +6,11 @@ title: 'Kafka ClickPipes' --- +| Page | Description | +|-----|-----| +| [Reference](/integrations/clickpipes/kafka/reference) | Details supported formats, sources, delivery semantics, authentication and experimental features supported by Kafka ClickPipes | +| [Schema registries for Kafka ClickPipe](/integrations/clickpipes/kafka/schema-registries) | Information on schema registries for Kafka ClickPipe | +| [Creating your first Kafka ClickPipe](/integrations/clickpipes/kafka/create-your-first-kafka-clickpipe) | Step-by-step guide to creating your first Kafka ClickPipe. | +| [Kafka ClickPipes FAQ](/integrations/clickpipes/kafka/faq) | Frequently asked questions about Kafka ClickPipes | +| [Best practices](/integrations/clickpipes/kafka/best-practices) | Details best practices to follow when working with Kafka ClickPipes | \ No newline at end of file diff --git a/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md b/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md index 981a11bad97..8ac322ecd5d 100644 --- a/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md +++ b/docs/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md @@ -17,7 +17,7 @@ import Jdbc03 from '@site/static/images/integrations/data-ingestion/dbms/jdbc-03 # Connecting ClickHouse to external data sources with JDBC :::note -Using JDBC requires the ClickHouse JDBC bridge, so you will need to use `clickhouse-local` on a local machine to stream the data from your database to ClickHouse Cloud. Visit the [**Using clickhouse-local**](/integrations/migration/clickhouse-local-etl.md#example-2-migrating-from-mysql-to-clickhouse-cloud-with-the-jdbc-bridge) page in the **Migrate** section of the docs for details. +Using JDBC requires the ClickHouse JDBC bridge, so you will need to use `clickhouse-local` on a local machine to stream the data from your database to ClickHouse Cloud. Visit the [**Using clickhouse-local**](/cloud/migration/clickhouse-local#example-2-migrating-from-mysql-to-clickhouse-cloud-with-the-jdbc-bridge) page in the **Migrate** section of the docs for details. ::: **Overview:** The ClickHouse JDBC Bridge in combination with the [jdbc table function](/sql-reference/table-functions/jdbc.md) or the [JDBC table engine](/engines/table-engines/integrations/jdbc.md) allows ClickHouse to access data from any external data source for which a JDBC driver is available: diff --git a/docs/integrations/data-ingestion/kafka/index.md b/docs/integrations/data-ingestion/kafka/index.md index bde0ca73608..ef85ea20227 100644 --- a/docs/integrations/data-ingestion/kafka/index.md +++ b/docs/integrations/data-ingestion/kafka/index.md @@ -37,7 +37,7 @@ This is the recommended option if you're a ClickHouse Cloud user. 
ClickPipes is * Optimized for ClickHouse Cloud, delivering blazing-fast performance * Horizontal and vertical scalability for high-throughput workloads * Built-in fault tolerance with configurable replicas and automatic retries -* Deployment and management via ClickHouse Cloud UI, [Open API](../../../cloud/manage/api/api-overview.md), or [Terraform](https://registry.terraform.io/providers/ClickHouse/clickhouse/3.3.3-alpha2/docs/resources/clickpipe) +* Deployment and management via ClickHouse Cloud UI, [Open API](/cloud/manage/api/api-overview), or [Terraform](https://registry.terraform.io/providers/ClickHouse/clickhouse/3.3.3-alpha2/docs/resources/clickpipe) * Enterprise-grade security with support for cloud-native authorization (IAM) and private connectivity (PrivateLink) * Supports a wide range of [data sources](/integrations/clickpipes/kafka/reference/), including Confluent Cloud, Amazon MSK, Redpanda Cloud, and Azure Event Hubs * Supports most common serialization formats (JSON, Avro, Protobuf coming soon!) @@ -100,6 +100,6 @@ To get started using the Kafka table engine, see the [reference documentation](. * **Custom code** - Custom code using Kafka and ClickHouse [client libraries](../../language-clients/index.md) may be appropriate in cases where custom processing of events is required. -[BYOC]: ../../../cloud/reference/byoc.md -[Cloud]: ../../../cloud-index.md +[BYOC]: /cloud/reference/byoc +[Cloud]: /cloud/get-started [Self-hosted]: ../../../intro.md diff --git a/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md b/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md new file mode 100644 index 00000000000..960120aa751 --- /dev/null +++ b/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md @@ -0,0 +1,254 @@ +import redshiftToClickhouse from '@site/static/images/integrations/data-ingestion/redshift/redshift-to-clickhouse.png'; +import push from '@site/static/images/integrations/data-ingestion/redshift/push.png'; +import pull from '@site/static/images/integrations/data-ingestion/redshift/pull.png'; +import pivot from '@site/static/images/integrations/data-ingestion/redshift/pivot.png'; +import s3_1 from '@site/static/images/integrations/data-ingestion/redshift/s3-1.png'; +import s3_2 from '@site/static/images/integrations/data-ingestion/redshift/s3-2.png'; +import Image from '@theme/IdealImage'; + +## Introduction {#introduction} + +[Amazon Redshift](https://aws.amazon.com/redshift/) is a popular cloud data warehousing solution that is part of the Amazon Web Services offerings. This guide presents different approaches to migrating data from a Redshift instance to ClickHouse. We will cover three options: + +Redshift to ClickHouse Migration Options + +From the ClickHouse instance standpoint, you can either: + +1. **[PUSH](#push-data-from-redshift-to-clickhouse)** data to ClickHouse using a third party ETL/ELT tool or service + +2. **[PULL](#pull-data-from-redshift-to-clickhouse)** data from Redshift leveraging the ClickHouse JDBC Bridge + +3. **[PIVOT](#pivot-data-from-redshift-to-clickhouse-using-s3)** using S3 object storage using an "Unload then load" logic + +:::note +We used Redshift as a data source in this tutorial. However, the migration approaches presented here are not exclusive to Redshift, and similar steps can be derived for any compatible data source. 
+:::
+
+## Push Data from Redshift to ClickHouse {#push-data-from-redshift-to-clickhouse}
+
+In the push scenario, the idea is to leverage a third-party tool or service (either custom code or an [ETL/ELT](https://en.wikipedia.org/wiki/Extract,_transform,_load#ETL_vs._ELT)) to send your data to your ClickHouse instance. For example, you can use a tool like [Airbyte](https://www.airbyte.com/) to move data between your Redshift instance (as a source) and ClickHouse as a destination ([see our integration guide for Airbyte](/integrations/data-ingestion/etl-tools/airbyte-and-clickhouse.md)).
+
+PUSH Redshift to ClickHouse
+
+### Pros {#pros}
+
+* It can leverage the existing catalog of connectors from the ETL/ELT software.
+* Built-in capabilities to keep data in sync (append/overwrite/increment logic).
+* Enables data transformation scenarios (for example, see our [integration guide for dbt](/integrations/data-ingestion/etl-tools/dbt/index.md)).
+
+### Cons {#cons}
+
+* Users need to set up and maintain an ETL/ELT infrastructure.
+* Introduces a third-party element in the architecture which can turn into a potential scalability bottleneck.
+
+## Pull Data from Redshift to ClickHouse {#pull-data-from-redshift-to-clickhouse}
+
+In the pull scenario, the idea is to leverage the ClickHouse JDBC Bridge to connect to a Redshift cluster directly from a ClickHouse instance and perform `INSERT INTO ... SELECT` queries:
+
+PULL from Redshift to ClickHouse
+
+### Pros {#pros-1}
+
+* Generic to all JDBC-compatible tools.
+* Elegant solution to allow querying multiple external data sources from within ClickHouse.
+
+### Cons {#cons-1}
+
+* Requires a ClickHouse JDBC Bridge instance, which can turn into a potential scalability bottleneck.
+
+:::note
+Even though Redshift is based on PostgreSQL, using the ClickHouse PostgreSQL table function or table engine is not possible since ClickHouse requires PostgreSQL version 9 or above and the Redshift API is based on an earlier version (8.x).
+:::
+
+### Tutorial {#tutorial}
+
+To use this option, you need to set up a ClickHouse JDBC Bridge. The ClickHouse JDBC Bridge is a standalone Java application that handles JDBC connectivity and acts as a proxy between the ClickHouse instance and the data sources. For this tutorial, we used a pre-populated Redshift instance with a [sample database](https://docs.aws.amazon.com/redshift/latest/dg/c_sampledb.html).
+
+
+
+#### Deploy ClickHouse JDBC Bridge {#deploy-clickhouse-jdbc-bridge}
+
+Deploy the ClickHouse JDBC Bridge. For more details, see our user guide on [JDBC for External Data sources](/integrations/data-ingestion/dbms/jdbc-with-clickhouse.md).
+
+:::note
+If you are using ClickHouse Cloud, you will need to run your ClickHouse JDBC Bridge on a separate environment and connect to ClickHouse Cloud using the [remoteSecure](/sql-reference/table-functions/remote/) function.
+:::
+
+#### Configure your Redshift datasource {#configure-your-redshift-datasource}
+
+Configure your Redshift datasource for ClickHouse JDBC Bridge.
+For example, `/etc/clickhouse-jdbc-bridge/config/datasources/redshift.json`:
+
+```json
+{
+  "redshift-server": {
+    "aliases": [
+      "redshift"
+    ],
+    "driverUrls": [
+      "https://s3.amazonaws.com/redshift-downloads/drivers/jdbc/2.1.0.4/redshift-jdbc42-2.1.0.4.jar"
+    ],
+    "driverClassName": "com.amazon.redshift.jdbc.Driver",
+    "jdbcUrl": "jdbc:redshift://redshift-cluster-1.ckubnplpz1uv.us-east-1.redshift.amazonaws.com:5439/dev",
+    "username": "awsuser",
+    "password": "",
+    "maximumPoolSize": 5
+  }
+}
+```
+
+#### Query your Redshift instance from ClickHouse {#query-your-redshift-instance-from-clickhouse}
+
+Once the ClickHouse JDBC Bridge is deployed and running, you can start querying your Redshift instance from ClickHouse:
+
+```sql
+SELECT *
+FROM jdbc('redshift', 'select username, firstname, lastname from users limit 5')
+```
+
+```response
+Query id: 1b7de211-c0f6-4117-86a2-276484f9f4c0
+
+┌─username─┬─firstname─┬─lastname─┐
+│ PGL08LJI │ Vladimir  │ Humphrey │
+│ XDZ38RDD │ Barry     │ Roy      │
+│ AEB55QTM │ Reagan    │ Hodge    │
+│ OWY35QYB │ Tamekah   │ Juarez   │
+│ MSD36KVR │ Mufutau   │ Watkins  │
+└──────────┴───────────┴──────────┘
+
+5 rows in set. Elapsed: 0.438 sec.
+```
+
+```sql
+SELECT *
+FROM jdbc('redshift', 'select count(*) from sales')
+```
+
+```response
+Query id: 2d0f957c-8f4e-43b2-a66a-cc48cc96237b
+
+┌──count─┐
+│ 172456 │
+└────────┘
+
+1 rows in set. Elapsed: 0.304 sec.
+```
+
+#### Import Data from Redshift to ClickHouse {#import-data-from-redshift-to-clickhouse}
+
+The following shows how to import data using an `INSERT INTO ... SELECT` statement:
+
+```sql
+# TABLE CREATION with 3 columns
+CREATE TABLE users_imported
+(
+    `username` String,
+    `firstname` String,
+    `lastname` String
+)
+ENGINE = MergeTree
+ORDER BY firstname
+```
+
+```response
+Query id: c7c4c44b-cdb2-49cf-b319-4e569976ab05
+
+Ok.
+
+0 rows in set. Elapsed: 0.233 sec.
+```
+
+```sql
+INSERT INTO users_imported (*) SELECT *
+FROM jdbc('redshift', 'select username, firstname, lastname from users')
+```
+
+```response
+Query id: 9d3a688d-b45a-40f4-a7c7-97d93d7149f1
+
+Ok.
+
+0 rows in set. Elapsed: 4.498 sec. Processed 49.99 thousand rows, 2.49 MB (11.11 thousand rows/s., 554.27 KB/s.)
+```
+
+
+
+## Pivot Data from Redshift to ClickHouse using S3 {#pivot-data-from-redshift-to-clickhouse-using-s3}
+
+In this scenario, we export data to S3 in an intermediary pivot format and, in a second step, load the data from S3 into ClickHouse.
+
+PIVOT from Redshift using S3
+
+### Pros {#pros-2}
+
+* Both Redshift and ClickHouse have powerful S3 integration features.
+* Leverages existing features such as the Redshift `UNLOAD` command and the ClickHouse S3 table function / table engine.
+* Scales seamlessly thanks to parallel reads and high throughput capabilities from/to S3 in ClickHouse.
+* Can leverage sophisticated and compressed formats like Apache Parquet.
+
+### Cons {#cons-2}
+
+* Two steps in the process (unload from Redshift then load into ClickHouse).
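In outline, the two steps look like the minimal sketch below. The bucket path `s3://your-bucket/unload/users/`, the IAM role ARN, and the S3 credentials are hypothetical placeholders, and the target `users` table is the one created in the tutorial that follows:

```sql
-- Step 1: run in Redshift. Unload the query result as CSV part files into S3.
-- The bucket and IAM role below are placeholders; substitute your own values.
UNLOAD ('SELECT username, firstname, lastname FROM users')
TO 's3://your-bucket/unload/users/'
IAM_ROLE 'arn:aws:iam::123456789012:role/redshift-unload-role'
CSV;

-- Step 2: run in ClickHouse. Load the unloaded files with the s3 table function.
-- Pass your own access key and secret, or leave them empty for a public bucket.
INSERT INTO users
SELECT *
FROM s3('https://your-bucket.s3.amazonaws.com/unload/users/*', '<access-key-id>', '<secret-access-key>', 'CSV');
```

The tutorial below walks through each step in more detail, including letting ClickHouse infer the table structure and the recommendation to use Parquet rather than CSV as the pivot format for large migrations.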
+ +### Tutorial {#tutorial-1} + + + +#### Export data into an S3 bucket using UNLOAD {#export-data-into-an-s3-bucket-using-unload} + +Using Redshift's [UNLOAD](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html) feature, export the data into an existing private S3 bucket: + +UNLOAD from Redshift to S3 + +It will generate part files containing the raw data in S3 + +Data in S3 + +#### Create the table in ClickHouse {#create-the-table-in-clickhouse} + +Create the table in ClickHouse: + +```sql +CREATE TABLE users +( + username String, + firstname String, + lastname String +) +ENGINE = MergeTree +ORDER BY username +``` + +Alternatively, ClickHouse can try to infer the table structure using `CREATE TABLE ... EMPTY AS SELECT`: + +```sql +CREATE TABLE users +ENGINE = MergeTree ORDER BY username +EMPTY AS +SELECT * FROM s3('https://your-bucket.s3.amazonaws.com/unload/users/*', '', '', 'CSV') +``` + +This works especially well when the data is in a format that contains information about data types, like Parquet. + +#### Load S3 files into ClickHouse {#load-s3-files-into-clickhouse} + +Load the S3 files into ClickHouse using an `INSERT INTO ... SELECT` statement: + +```sql +INSERT INTO users SELECT * +FROM s3('https://your-bucket.s3.amazonaws.com/unload/users/*', '', '', 'CSV') +``` + +```response +Query id: 2e7e219a-6124-461c-8d75-e4f5002c8557 + +Ok. + +0 rows in set. Elapsed: 0.545 sec. Processed 49.99 thousand rows, 2.34 MB (91.72 thousand rows/s., 4.30 MB/s.) +``` + +:::note +This example used CSV as the pivot format. However, for production workloads we recommend Apache Parquet as the best option for large migrations since it comes with compression and can save some storage costs while reducing transfer times. (By default, each row group is compressed using SNAPPY). ClickHouse also leverages Parquet's column orientation to speed up data ingestion. +::: + + \ No newline at end of file diff --git a/docs/integrations/data-ingestion/redshift/index.md b/docs/integrations/data-ingestion/redshift/index.md index 3e936cec37b..217609acecd 100644 --- a/docs/integrations/data-ingestion/redshift/index.md +++ b/docs/integrations/data-ingestion/redshift/index.md @@ -7,17 +7,11 @@ keywords: ['Redshift'] show_related_blogs: true --- -import redshiftToClickhouse from '@site/static/images/integrations/data-ingestion/redshift/redshift-to-clickhouse.png'; -import push from '@site/static/images/integrations/data-ingestion/redshift/push.png'; -import pull from '@site/static/images/integrations/data-ingestion/redshift/pull.png'; -import pivot from '@site/static/images/integrations/data-ingestion/redshift/pivot.png'; -import s3_1 from '@site/static/images/integrations/data-ingestion/redshift/s3-1.png'; -import s3_2 from '@site/static/images/integrations/data-ingestion/redshift/s3-2.png'; -import Image from '@theme/IdealImage'; +import MigrationGuide from '@site/docs/integrations/data-ingestion/redshift/_snippets/_migration_guide.md'; -# Migrating data from Redshift to ClickHouse +# Migrating Data from Redshift to ClickHouse -## Related content {#related-content} +## Related Content {#related-content}