From 4b9dda3e379ba65cb5ded46311f48e6562081f6e Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 9 Aug 2025 22:47:34 +0200 Subject: [PATCH 1/3] ETL/CDC: Guidance, Layout --- docs/_include/links.md | 4 + docs/_include/styles.html | 7 + docs/connect/index.md | 5 +- docs/ingest/cdc/index.md | 96 ++++++++++-- docs/ingest/etl/index.md | 185 ++++++++++++++++++++++- docs/ingest/index.md | 1 + docs/ingest/telemetry/index.md | 2 +- docs/integrate/apache-airflow/index.md | 6 +- docs/integrate/apache-iceberg/index.md | 4 + docs/integrate/azure-functions/index.md | 4 +- docs/integrate/azure-functions/learn.rst | 10 +- docs/integrate/index.md | 2 +- docs/integrate/marquez/index.md | 3 +- 13 files changed, 303 insertions(+), 26 deletions(-) diff --git a/docs/_include/links.md b/docs/_include/links.md index eff688c3..45ba0a27 100644 --- a/docs/_include/links.md +++ b/docs/_include/links.md @@ -1,7 +1,10 @@ [Amazon DynamoDB Streams]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html [Amazon Kinesis Data Streams]: https://docs.aws.amazon.com/streams/latest/dev/introduction.html +[Apache Airflow]: https://airflow.apache.org/ +[Astronomer]: https://www.astronomer.io/ [AWS Database Migration Service (AWS DMS)]: https://aws.amazon.com/dms/ [AWS DMS Integration with CrateDB]: https://cratedb-toolkit.readthedocs.io/io/dms/ +[AWS Lambda]: https://aws.amazon.com/lambda/ [BM25]: https://en.wikipedia.org/wiki/Okapi_BM25 [cloud-datashader-colab]: https://colab.research.google.com/github/crate/cratedb-examples/blob/amo/cloud-datashader/topic/timeseries/explore/cloud-datashader.ipynb [cloud-datashader-github]: https://github.com/crate/cratedb-examples/blob/amo/cloud-datashader/topic/timeseries/explore/cloud-datashader.ipynb @@ -17,6 +20,7 @@ [dask-weather-data-github]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/dask-weather-data-import.ipynb [Datashader]: https://datashader.org/ [Dynamic Database Schemas]: 
https://cratedb.com/product/features/dynamic-schemas +[DynamoDB]: https://aws.amazon.com/dynamodb/ [DynamoDB CDC Relay]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/cdc.html [DynamoDB CDC Relay with AWS Lambda]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/cdc-lambda.html [DynamoDB Table Loader]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/loader.html diff --git a/docs/_include/styles.html b/docs/_include/styles.html index a3b794b9..e5ff3327 100644 --- a/docs/_include/styles.html +++ b/docs/_include/styles.html @@ -46,4 +46,11 @@ height: 0; } +/* On tiled link overview index pages, give ul/li elements more space */ +.ul-li-wide { + ul li { + margin-bottom: 1rem; + } +} + diff --git a/docs/connect/index.md b/docs/connect/index.md index a6caddac..eabefec2 100644 --- a/docs/connect/index.md +++ b/docs/connect/index.md @@ -3,6 +3,8 @@ :::{include} /_include/links.md ::: +:::{include} /_include/styles.html +::: :::::{grid} :padding: 0 @@ -71,6 +73,7 @@ protocol. :gutter: 2 ::::{grid-item-card} {material-outlined}`link;2em` How to connect +:class-body: ul-li-wide - {ref}`connect-configure` To connect to CrateDB, your application or driver needs to be configured @@ -87,7 +90,7 @@ protocol. Database connectivity options and tools. :::: -::::{grid-item-card} {material-outlined}`not_started;2em` How to use database drivers +::::{grid-item-card} {material-outlined}`link;2em` How to use database drivers - {ref}`connect-java` - {ref}`connect-javascript` - {ref}`connect-php` diff --git a/docs/ingest/cdc/index.md b/docs/ingest/cdc/index.md index 019e41ad..754c59e6 100644 --- a/docs/ingest/cdc/index.md +++ b/docs/ingest/cdc/index.md @@ -5,20 +5,96 @@ ::: :::{div} -CrateDB provides many options to connect and integrate with third-party +Options to connect and integrate CrateDB with third-party CDC applications, mostly using [CrateDB's PostgreSQL interface]. CrateDB also provides native adapter components to leverage advanced features. 
-This documentation section lists corresponding CDC applications and -frameworks which can be used together with CrateDB, and outlines how -to use them optimally. +This documentation section lists CDC applications, +frameworks, and solutions, which can be used together with CrateDB, +and outlines how to use them optimally. Please also take a look at support for {ref}`generic ETL ` solutions. ::: -- {ref}`aws-dms` -- {ref}`aws-dynamodb` -- {ref}`aws-kinesis` -- {ref}`debezium` -- {ref}`mongodb` -- {ref}`streamsets` + +## Connectors + +Native and specialized connectors for CrateDB, both managed and unmanaged. + +:::::{grid} 1 +:gutter: 2 + +::::{grid-item-card} Amazon DynamoDB +:link: aws-dynamodb +:link-type: ref +Load data from DynamoDB, a fully managed NoSQL database service provided by +Amazon Web Services (AWS), which is designed for high-performance, scalable +applications and offers key-value and document data structures. +:::: + +::::{grid-item-card} Amazon Kinesis +:link: aws-kinesis +:link-type: ref +Load data from Amazon Kinesis Data Streams, a serverless streaming data service +that simplifies the capture, processing, and storage of data streams at any scale. +:::: + +::::{grid-item-card} MongoDB +:link: mongodb +:link-type: ref +Load data from MongoDB or MongoDB Atlas, a document database, self-hosted +or multi-cloud. +:::: + +::::: + + +## Platforms + +Support for data integration frameworks and platforms, both managed and unmanaged. + +:::::{grid} 1 +:gutter: 2 + +::::{grid-item-card} AWS DMS +:link: aws-dms +:link-type: ref +Use AWS Database Migration Service (AWS DMS), a managed migration and replication +service that helps move your database and analytics workloads between different +kinds of databases. +:::: + +::::{grid-item-card} Debezium +:link: debezium +:link-type: ref +Use, Debezium an open source distributed platform for change data capture for +loading data into CrateDB. 
+It is used as a building block by a number of downstream third-party projects and products. +:::: + +::::{grid-item-card} Estuary +:link: estuary +:link-type: ref +Use Estuary Flow, a managed, real-time, reliable change data capture (CDC) solution, +to load data into CrateDB. +It combines agentless CDC, zero-code pipelines, and enterprise-grade governance to +simplify data integration. +:::: + +::::{grid-item-card} RisingWave +:link: risingwave +:link-type: ref +Use RisingWave, a stream processing and management platform, to load data into CrateDB. +It provides a Postgres-compatible SQL interface, like CrateDB, and a DataFrame-style +Python interface. It is available on-premises and as a managed service. +:::: + +::::{grid-item-card} StreamSets +:link: streamsets +:link-type: ref +Use the StreamSets Data Collector Engine to ingest and transform data from a variety +of sources into CrateDB. It runs on-premises or in any cloud. +:::: + +::::: + diff --git a/docs/ingest/etl/index.md b/docs/ingest/etl/index.md index 070fc533..5c016417 100644 --- a/docs/ingest/etl/index.md +++ b/docs/ingest/etl/index.md @@ -5,29 +5,207 @@ :::{include} /_include/links.md ::: +:::{include} /_include/styles.html +::: :::{div} -CrateDB provides many options to connect and integrate with third-party +Options to connect and integrate CrateDB with third-party ETL applications, mostly using [CrateDB's PostgreSQL interface]. CrateDB also provides native adapter components to leverage advanced features. -This documentation section lists corresponding ETL applications and +This documentation section lists ETL applications and frameworks which can be used together with CrateDB, and outlines how to use them optimally. Please also take a look at support for {ref}`cdc` solutions. 
::: +:::{rubric} Grouped by category +::: + +:::::{grid} 1 2 2 2 +:margin: 4 4 0 0 +:padding: 0 +:gutter: 2 +:class-container: ul-li-wide + + +::::{grid-item-card} {material-outlined}`air;2em` Dataflow / Pipeline / Code-first +- {ref}`apache-airflow` + + Apache Airflow is an open source software platform to programmatically author, + schedule, and monitor workflows. Pipelines are defined in Python, allowing for + dynamic pipeline generation and on-demand, code-driven pipeline invocation. + +- {ref}`apache-flink` + + Apache Flink is a programming framework and distributed processing engine for + stateful computations over unbounded and bounded data streams, written in Java. + +- {ref}`apache-nifi` + + Apache NiFi is a dataflow system based on the concepts of flow-based programming. + It supports powerful and scalable directed graphs of data routing, transformation, + and system mediation logic. + +- {ref}`dbt` + + dbt is an SQL-first platform for transforming data in data warehouses using + Python and SQL. The data abstraction layer provided by dbt-core allows the + decoupling of the models on which reports and dashboards rely from the source data. + +- {ref}`kestra` + + Kestra is an open source workflow automation and orchestration toolkit with a rich + plugin ecosystem. It enables users to automate and manage complex workflows in a + streamlined and efficient manner, defining them both declaratively, or imperatively + using any scripting language like Python, Bash, or JavaScript. + +- {ref}`meltano` + + Meltano is a declarative code-first polyglot data integration engine adhering to + the Singer specification. Singer is a composable open source ETL framework and + specification, including powerful data extraction and consolidation elements. + ++++ +Data pipeline programming frameworks and platforms. 
+:::: + + +::::{grid-item-card} {material-outlined}`all_inclusive;2em` Low-code / No-code / Visual +- {ref}`apache-hop` + + Apache Hop aims to be the future of data integration. Visual development enables + developers to be more productive than they can be through code. + +- {ref}`estuary` + + Estuary provides real-time data integration and modern ETL and ELT data pipelines + as a fully managed solution. Estuary Flow is a real-time, reliable change data + capture (CDC) solution. + +- {ref}`node-red` + + Node-RED is an open-source programming tool for wiring together hardware devices, + APIs and online services within a low-code programming environment for event-driven + applications. + ++++ +Visual data flow and integration frameworks and platforms. +:::: + + +::::{grid-item-card} {material-outlined}`storage;2em` Databases +- {ref}`aws-dms` + + AWS DMS is a managed migration and replication service that helps move your + database and analytics workloads between different kinds of databases quickly, + securely, and with minimal downtime and zero data loss. + +- {ref}`aws-dynamodb` + + DynamoDB is a fully managed NoSQL database service provided by Amazon Web Services (AWS). + +- {ref}`influxdb` + + InfluxDB is a scalable datastore for metrics, events, and real-time analytics to + collect, process, transform, and store event and time series data. + +- {ref}`mongodb` + + MongoDB is a document database designed for ease of application development and scaling. + +- {ref}`mysql` + + MySQL and MariaDB are well-known free and open-source relational database management + systems (RDBMS), available as standalone and managed variants. + +- {ref}`sql-server` + + Microsoft SQL Server Integration Services (SSIS) is a component of the Microsoft SQL + Server database software that can be used to perform a broad range of data migration tasks. + ++++ +Load data from database systems. 
+:::: + + +::::{grid-item-card} {material-outlined}`fast_forward;2em` Streams +- {ref}`apache-kafka` + + Apache Kafka is an open-source distributed event streaming platform + for high-performance data pipelines, streaming analytics, data integration, + and mission-critical applications. + +- {ref}`aws-kinesis` + + Amazon Kinesis Data Streams is a serverless streaming data service that simplifies + the capture, processing, and storage of data streams at any scale, such as + application logs, website clickstreams, and IoT telemetry data, for machine + learning (ML), analytics, and other applications. + +- {ref}`risingwave` + + RisingWave is a stream processing and management platform that allows configuring + data sources, views on that data, and destinations where results are materialized. + It provides both a Postgres-compatible SQL interface, like CrateDB, and a + DataFrame-style Python interface. + It delivers low-latency insights from real-time streams, database CDC, and + time-series data, bringing streaming and batch together. + +- {ref}`streamsets` + + The StreamSets Data Collector is a lightweight and powerful engine that allows you + to build streaming, batch and change-data-capture (CDC) pipelines that can ingest + and transform data from a variety of sources. + ++++ +Load data from streaming platforms. +:::: + + +::::{grid-item-card} {material-outlined}`add_to_queue;2em` Serverless Compute + +- {ref}`azure-functions` + + An Azure Function is a short-lived, serverless computation that is triggered by + external events. The trigger produces an input payload, which is delivered to + the Azure Function. The Azure Function then does computation with this payload + and subsequently outputs its result to other Azure Functions, computation + services, or storage services. ++++ +Use serverless compute units for custom import tasks. 
+:::: + + +::::{grid-item-card} {material-outlined}`dataset;2em` Datasets + +- {ref}`apache-iceberg` + + Apache Iceberg is an open table format for analytic datasets. + ++++ +Load data from datasets and open table formats. +:::: + + +::::: + + +:::{rubric} Alphabetically sorted +::: + +:::{div} - {ref}`apache-airflow` - {ref}`apache-flink` - {ref}`apache-hop` - {ref}`apache-iceberg` - {ref}`apache-kafka` - {ref}`apache-nifi` -- {ref}`aws-dms` - {ref}`aws-dynamodb` - {ref}`aws-kinesis` +- {ref}`aws-dms` - {ref}`azure-functions` - {ref}`dbt` - {ref}`estuary` @@ -40,3 +218,4 @@ Please also take a look at support for {ref}`cdc` solutions. - {ref}`risingwave` - {ref}`sql-server` - {ref}`streamsets` +::: diff --git a/docs/ingest/index.md b/docs/ingest/index.md index 9ca48032..56230b19 100644 --- a/docs/ingest/index.md +++ b/docs/ingest/index.md @@ -10,6 +10,7 @@ All data ingestion methods for CrateDB at a glance. :margin: 4 4 0 0 :padding: 0 :gutter: 2 +:class-container: ul-li-wide ::::{grid-item-card} {material-outlined}`file_upload;2em` Load data using CrateDB - {ref}`Import files ` diff --git a/docs/ingest/telemetry/index.md b/docs/ingest/telemetry/index.md index 45b17b08..1ab34b70 100644 --- a/docs/ingest/telemetry/index.md +++ b/docs/ingest/telemetry/index.md @@ -4,7 +4,7 @@ # Telemetry data :::{div} -CrateDB integrations with metrics collection agents, brokers, and stores. +CrateDB integrates with metrics collection agents, brokers, and stores. This documentation section lists applications and daemons which can be used together with CrateDB, and educates about how to use them optimally. 
diff --git a/docs/integrate/apache-airflow/index.md b/docs/integrate/apache-airflow/index.md index e8db6077..8f4eaa5c 100644 --- a/docs/integrate/apache-airflow/index.md +++ b/docs/integrate/apache-airflow/index.md @@ -3,6 +3,9 @@ (astronomer)= # Apache Airflow / Astronomer +:::{include} /_include/links.md +::: + :::{rubric} About ::: @@ -12,6 +15,7 @@ [![](https://logowik.com/content/uploads/images/astronomer2824.jpg){w=180px}](https://www.astronomer.io/) ``` +:::{div} [Apache Airflow] is an open source software platform to programmatically author, schedule, and monitor workflows, written in Python. [Astronomer] offers managed Airflow services on the cloud of your choice, in @@ -24,6 +28,7 @@ dynamic pipeline generation and on-demand, code-driven pipeline invocation. Pipeline parametrization is using the powerful Jinja templating engine. To extend the system, you can define your own operators and extend libraries to fit the level of abstraction that suits your environment. +::: ```{div} :style: "clear: both" ``` @@ -84,7 +89,6 @@ Tutorials and resources about configuring the managed variants, Astro and CrateD -[Apache Airflow]: https://airflow.apache.org/ [Automating export of CrateDB data to S3 using Apache Airflow]: https://community.cratedb.com/t/cratedb-and-apache-airflow-automating-data-export-to-s3/901 [Automating stock data collection and storage with CrateDB and Apache Airflow]: https://community.cratedb.com/t/automating-stock-data-collection-and-storage-with-cratedb-and-apache-airflow/990 [Automating the import of Parquet files with Apache Airflow]: https://community.cratedb.com/t/automating-the-import-of-parquet-files-with-apache-airflow/1247 diff --git a/docs/integrate/apache-iceberg/index.md b/docs/integrate/apache-iceberg/index.md index 606d34f0..12368135 100644 --- a/docs/integrate/apache-iceberg/index.md +++ b/docs/integrate/apache-iceberg/index.md @@ -3,6 +3,9 @@ :::{rubric} About ::: + +[Apache Iceberg] is an open table format for analytic 
datasets. + The [Iceberg table format] is designed to manage a large, slow-changing collection of files in a distributed file system or key-value store as a database table. @@ -16,4 +19,5 @@ see {ref}`risingwave-iceberg`. ::: +[Apache Iceberg]: https://iceberg.apache.org/ [Iceberg table format]: https://iceberg.apache.org/spec/ diff --git a/docs/integrate/azure-functions/index.md b/docs/integrate/azure-functions/index.md index 1b3fd982..7c57fa7d 100644 --- a/docs/integrate/azure-functions/index.md +++ b/docs/integrate/azure-functions/index.md @@ -4,11 +4,11 @@ :::{include} /_include/links.md ::: +_Execute event-driven serverless code with an end-to-end development experience._ + :::{rubric} About ::: -_Execute event-driven serverless code with an end-to-end development experience._ - [Azure Functions] is a serverless solution that allows you to build robust apps while using less code, and with less infrastructure and lower costs. Instead of worrying about deploying and maintaining servers, you can use the cloud diff --git a/docs/integrate/azure-functions/learn.rst b/docs/integrate/azure-functions/learn.rst index 8c83d89b..6edd04df 100644 --- a/docs/integrate/azure-functions/learn.rst +++ b/docs/integrate/azure-functions/learn.rst @@ -1,8 +1,8 @@ .. _azure-functions-learn: -=========================================================== -Data Enrichment using IoT Hubs, Azure Functions and CrateDB -=========================================================== +========================================================== +Data Enrichment using IoT Hub, Azure Functions and CrateDB +========================================================== This integration document details how to create an enrichment pipeline between data ingested into an Azure IoT Hub and CrateDB, using @@ -231,7 +231,7 @@ account, selecting "Access keys", and copying one of the shown connection strings. 
The ``EventHubConnectionString`` can be copied from the -"Event Hub-compatible endpoint" field under the IoT hub's "Built-in endpoints" +"Event Hub-compatible endpoint" field under the IoT Hub's "Built-in endpoints" section. Ensure that the event hub connection string includes the ``EntityPath=EVENTHUBNAME`` at the end of it. @@ -363,7 +363,7 @@ The following JSON document can be used as a test message: {"input": "{\"id\": \"Zero Gravitas\", \"type\": \"unmanned\", \"location\": {\"longitude\": -164.5984,\"latitude\": -24.9734},\"timestamp\": 1588240576000}"} -To test the deployed Azure Function against an actual IoT hub, you can install +To test the deployed Azure Function against an actual IoT Hub, you can install VSCode's `Azure IoT Hub extension`_. Its documentation describes how to create a new device and send a device-to-cloud (D2C) message for testing purposes. diff --git a/docs/integrate/index.md b/docs/integrate/index.md index ea665ee6..63531aea 100644 --- a/docs/integrate/index.md +++ b/docs/integrate/index.md @@ -23,10 +23,10 @@ apache-iceberg/index apache-kafka/index apache-nifi/index apache-superset/index -azure-functions/index aws-dms/index aws-dynamodb/index aws-kinesis/index +azure-functions/index cluvio/index datagrip/index dbeaver/index diff --git a/docs/integrate/marquez/index.md b/docs/integrate/marquez/index.md index 6cd66e5b..0f0d35be 100644 --- a/docs/integrate/marquez/index.md +++ b/docs/integrate/marquez/index.md @@ -257,5 +257,4 @@ You can now see all lineage graphs and events for this setup. 
[Marquez]: https://github.com/MarquezProject/marquez [OpenLineage]: https://openlineage.io/ -[Apache Airflow]: https://airflow.apache.org/ -[the Docker documentation on this topic]: https://docs.docker.com/compose/install/linux/ \ No newline at end of file +[the Docker documentation on this topic]: https://docs.docker.com/compose/install/linux/ From 06718c2a92444509dd0dc378bfc07bd7acc0b4e9 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 9 Aug 2025 22:51:42 +0200 Subject: [PATCH 2/3] ETL: Add `aws-lambda` and `n8n` --- docs/ingest/etl/index.md | 12 ++++++++++++ docs/integrate/aws-lambda/index.md | 23 +++++++++++++++++++++++ docs/integrate/index.md | 2 ++ docs/integrate/n8n/index.md | 18 ++++++++++++++++++ 4 files changed, 55 insertions(+) create mode 100644 docs/integrate/aws-lambda/index.md create mode 100644 docs/integrate/n8n/index.md diff --git a/docs/ingest/etl/index.md b/docs/ingest/etl/index.md index 5c016417..481e9d63 100644 --- a/docs/ingest/etl/index.md +++ b/docs/ingest/etl/index.md @@ -85,6 +85,11 @@ Data pipeline programming frameworks and platforms. as a fully managed solution. Estuary Flow is a real-time, reliable change data capture (CDC) solution. +- {ref}`n8n` + + n8n is a workflow automation tool that helps you connect any app with an API to + any other, and manipulate its data with little or no code. + - {ref}`node-red` Node-RED is an open-source programming tool for wiring together hardware devices, @@ -166,6 +171,11 @@ Load data from streaming platforms. ::::{grid-item-card} {material-outlined}`add_to_queue;2em` Serverless Compute +- {ref}`aws-lambda` + + AWS Lambda is a serverless compute service that runs your code in response to + events and automatically manages the underlying compute resources for you. + Events can include state changes and updates. - {ref}`azure-functions` @@ -206,6 +216,7 @@ Load data from datasets and open table formats. 
- {ref}`aws-dynamodb` - {ref}`aws-kinesis` - {ref}`aws-dms` +- {ref}`aws-lambda` - {ref}`azure-functions` - {ref}`dbt` - {ref}`estuary` @@ -214,6 +225,7 @@ Load data from datasets and open table formats. - {ref}`meltano` - {ref}`mongodb` - {ref}`mysql` +- {ref}`n8n` - {ref}`node-red` - {ref}`risingwave` - {ref}`sql-server` diff --git a/docs/integrate/aws-lambda/index.md b/docs/integrate/aws-lambda/index.md new file mode 100644 index 00000000..de2dcaf1 --- /dev/null +++ b/docs/integrate/aws-lambda/index.md @@ -0,0 +1,23 @@ +(aws-lambda)= +# AWS Lambda + +:::{include} /_include/links.md +::: + +:::{rubric} About +::: + +:::{div} +[AWS Lambda] is a serverless compute service that runs your code in response to +events and automatically manages the underlying compute resources for you. These +events may include changes in state or an update. +::: + +:::{rubric} Learn +::: + +:::{div} +Serverless replication from DynamoDB to CrateDB using AWS Lambda: +- [DynamoDB CDC Relay with AWS Lambda] +- Blog: [Replicating CDC events from DynamoDB to CrateDB] +::: diff --git a/docs/integrate/index.md b/docs/integrate/index.md index 63531aea..5198fa3d 100644 --- a/docs/integrate/index.md +++ b/docs/integrate/index.md @@ -26,6 +26,7 @@ apache-superset/index aws-dms/index aws-dynamodb/index aws-kinesis/index +aws-lambda/index azure-functions/index cluvio/index datagrip/index @@ -44,6 +45,7 @@ meltano/index metabase/index mongodb/index mysql/index +n8n/index node-red/index plotly/index powerbi/index diff --git a/docs/integrate/n8n/index.md b/docs/integrate/n8n/index.md new file mode 100644 index 00000000..93e2fab0 --- /dev/null +++ b/docs/integrate/n8n/index.md @@ -0,0 +1,18 @@ +(n8n)= +# n8n + +:::{rubric} About +::: + +[n8n] is a fair-code licensed workflow automation tool that combines AI capabilities +with business process automation. It helps you to connect any app with an API with +any other, and manipulate its data with little or no code. 
+ +:::{rubric} Learn +::: + +- https://cratedb.com/integrations/cratedb-and-n8n +- https://n8n.io/integrations/cratedb/ + + +[n8n]: https://docs.n8n.io/ From 176cc1eda8131988c9fc5251a29216d69e78d37a Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Sat, 9 Aug 2025 23:19:33 +0200 Subject: [PATCH 3/3] ETL/CDC: Implement suggestions by CodeRabbit --- docs/_include/links.md | 8 +++++--- docs/_include/styles.html | 6 ++---- docs/connect/drivers.md | 2 +- docs/ingest/cdc/index.md | 9 ++++----- docs/ingest/etl/index.md | 12 ++++++------ docs/integrate/apache-airflow/index.md | 4 ++-- docs/integrate/aws-lambda/index.md | 4 ++-- docs/integrate/n8n/index.md | 8 +++++--- 8 files changed, 27 insertions(+), 26 deletions(-) diff --git a/docs/_include/links.md b/docs/_include/links.md index 45ba0a27..543604b5 100644 --- a/docs/_include/links.md +++ b/docs/_include/links.md @@ -1,3 +1,5 @@ + + [Amazon DynamoDB Streams]: https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html [Amazon Kinesis Data Streams]: https://docs.aws.amazon.com/streams/latest/dev/introduction.html [Apache Airflow]: https://airflow.apache.org/ @@ -21,9 +23,9 @@ [Datashader]: https://datashader.org/ [Dynamic Database Schemas]: https://cratedb.com/product/features/dynamic-schemas [DynamoDB]: https://aws.amazon.com/dynamodb/ -[DynamoDB CDC Relay]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/cdc.html -[DynamoDB CDC Relay with AWS Lambda]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/cdc-lambda.html -[DynamoDB Table Loader]: https://cratedb-toolkit.readthedocs.io/io/dynamodb/loader.html +[DynamoDB CDC Relay]: inv:ctk:*:label#dynamodb-cdc +[DynamoDB CDC Relay with AWS Lambda]: inv:ctk:*:doc#io/dynamodb/cdc-lambda +[DynamoDB Table Loader]: inv:ctk:*:label#dynamodb-loader [Executable stack with Apache Kafka, Apache Flink, and CrateDB]: https://github.com/crate/cratedb-examples/tree/main/framework/flink/kafka-jdbcsink-java [Geospatial Data Model]: 
https://cratedb.com/data-model/geospatial [Geospatial Database]: https://cratedb.com/geospatial-spatial-database diff --git a/docs/_include/styles.html b/docs/_include/styles.html index e5ff3327..f3962cc4 100644 --- a/docs/_include/styles.html +++ b/docs/_include/styles.html @@ -47,10 +47,8 @@ } /* On tiled link overview index pages, give ul/li elements more space */ -.ul-li-wide { - ul li { - margin-bottom: 1rem; - } +.ul-li-wide ul li { + margin-bottom: 1rem; } diff --git a/docs/connect/drivers.md b/docs/connect/drivers.md index 1b871f5f..531276fc 100644 --- a/docs/connect/drivers.md +++ b/docs/connect/drivers.md @@ -359,6 +359,6 @@ Ruby on Rails ActiveRecord adapter for CrateDB. ```{tip} -Please visit the :ref:`build-status` page for an overview about the integration +Please visit the {ref}`build-status` page for an overview of the integration status of the client drivers listed above, and more. ``` diff --git a/docs/ingest/cdc/index.md b/docs/ingest/cdc/index.md index 754c59e6..f11a18c8 100644 --- a/docs/ingest/cdc/index.md +++ b/docs/ingest/cdc/index.md @@ -13,7 +13,7 @@ features. This documentation section lists CDC applications, frameworks, and solutions, which can be used together with CrateDB, and outlines how to use them optimally. -Please also take a look at support for {ref}`generic ETL ` solutions. +Additionally, see support for {ref}`generic ETL <etl>` solutions. ::: @@ -67,7 +67,7 @@ kinds of databases. ::::{grid-item-card} Debezium :link: debezium :link-type: ref -Use, Debezium an open source distributed platform for change data capture for +Use Debezium, an open-source distributed platform for change data capture for loading data into CrateDB. It is used as a building block by a number of downstream third-party projects and products. :::: @@ -92,9 +92,8 @@ Python interface. It is available for on-premises and as a managed service. 
::::{grid-item-card} StreamSets :link: streamsets :link-type: ref -Use the StreamSets Data Collector Engine to ingest and transform data from a variety -of sources into CrateDB. It runs on-premises or in any cloud. +Use the StreamSets Data Collector Engine to ingest and transform data from many +sources into CrateDB. It runs on-premises or in any cloud. :::: ::::: - diff --git a/docs/ingest/etl/index.md b/docs/ingest/etl/index.md index 481e9d63..bf864714 100644 --- a/docs/ingest/etl/index.md +++ b/docs/ingest/etl/index.md @@ -17,7 +17,7 @@ features. This documentation section lists ETL applications and frameworks which can be used together with CrateDB, and outlines how to use them optimally. -Please also take a look at support for {ref}`cdc` solutions. +Additionally, see support for {ref}`cdc` solutions. ::: @@ -34,7 +34,7 @@ Please also take a look at support for {ref}`cdc` solutions. ::::{grid-item-card} {material-outlined}`air;2em` Dataflow / Pipeline / Code-first - {ref}`apache-airflow` - Apache Airflow is an open source software platform to programmatically author, + Apache Airflow is an open-source software platform to programmatically author, schedule, and monitor workflows. Pipelines are defined in Python, allowing for dynamic pipeline generation and on-demand, code-driven pipeline invocation. @@ -57,7 +57,7 @@ Please also take a look at support for {ref}`cdc` solutions. - {ref}`kestra` - Kestra is an open source workflow automation and orchestration toolkit with a rich + Kestra is an open-source workflow automation and orchestration toolkit with a rich plugin ecosystem. It enables users to automate and manage complex workflows in a streamlined and efficient manner, defining them both declaratively, or imperatively using any scripting language like Python, Bash, or JavaScript. @@ -65,7 +65,7 @@ Please also take a look at support for {ref}`cdc` solutions. 
- {ref}`meltano` Meltano is a declarative code-first polyglot data integration engine adhering to - the Singer specification. Singer is a composable open source ETL framework and + the Singer specification. Singer is a composable open-source ETL framework and specification, including powerful data extraction and consolidation elements. +++ @@ -162,8 +162,8 @@ Load data from database systems. - {ref}`streamsets` The StreamSets Data Collector is a lightweight and powerful engine that allows you - to build streaming, batch and change-data-capture (CDC) pipelines that can ingest - and transform data from a variety of sources. + to build streaming, batch, and change-data-capture (CDC) pipelines that can ingest + and transform data from many sources. +++ Load data from streaming platforms. diff --git a/docs/integrate/apache-airflow/index.md b/docs/integrate/apache-airflow/index.md index 8f4eaa5c..e8baaecc 100644 --- a/docs/integrate/apache-airflow/index.md +++ b/docs/integrate/apache-airflow/index.md @@ -11,9 +11,9 @@ ```{div} :style: "float: right" -[![](https://19927462.fs1.hubspotusercontent-na1.net/hub/19927462/hubfs/Partner%20Logos/392x140/Apache-Airflow-Logo-392x140.png?width=784&height=280&name=Apache-Airflow-Logo-392x140.png){w=180px}](https://airflow.apache.org/) +[![Apache Airflow logo](https://19927462.fs1.hubspotusercontent-na1.net/hub/19927462/hubfs/Partner%20Logos/392x140/Apache-Airflow-Logo-392x140.png?width=784&height=280&name=Apache-Airflow-Logo-392x140.png){w=180px}](https://airflow.apache.org/) -[![](https://logowik.com/content/uploads/images/astronomer2824.jpg){w=180px}](https://www.astronomer.io/) +[![Astronomer logo](https://logowik.com/content/uploads/images/astronomer2824.jpg){w=180px}](https://www.astronomer.io/) ``` :::{div} [Apache Airflow] is an open source software platform to programmatically author, diff --git a/docs/integrate/aws-lambda/index.md b/docs/integrate/aws-lambda/index.md index de2dcaf1..e1746543 100644 --- 
a/docs/integrate/aws-lambda/index.md +++ b/docs/integrate/aws-lambda/index.md @@ -9,8 +9,8 @@ :::{div} [AWS Lambda] is a serverless compute service that runs your code in response to -events and automatically manages the underlying compute resources for you. These -events may include changes in state or an update. +events and automatically manages the underlying compute resources for you. +Events can include state changes and updates. ::: :::{rubric} Learn diff --git a/docs/integrate/n8n/index.md b/docs/integrate/n8n/index.md index 93e2fab0..73aca93d 100644 --- a/docs/integrate/n8n/index.md +++ b/docs/integrate/n8n/index.md @@ -5,14 +5,16 @@ ::: [n8n] is a fair-code licensed workflow automation tool that combines AI capabilities -with business process automation. It helps you to connect any app with an API with +with business process automation. It helps you connect any app with an API to any other, and manipulate its data with little or no code. :::{rubric} Learn ::: -- https://cratedb.com/integrations/cratedb-and-n8n -- https://n8n.io/integrations/cratedb/ +- [CrateDB and n8n integration] +- [n8n CrateDB integration] +[CrateDB and n8n integration]: https://cratedb.com/integrations/cratedb-and-n8n [n8n]: https://docs.n8n.io/ +[n8n CrateDB integration]: https://n8n.io/integrations/cratedb/