From 3f43cd60b3c857c9a7557c8ea3c11c8bc683fbd0 Mon Sep 17 00:00:00 2001 From: Niklas Schmidtmer Date: Sun, 14 Sep 2025 02:24:44 +0200 Subject: [PATCH 1/3] NiFi: Connecting to CrateDB from Apache NiFi --- docs/conf.py | 2 + docs/integrate/nifi/index.md | 10 +++- docs/integrate/nifi/tutorial.md | 97 +++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 2 deletions(-) create mode 100644 docs/integrate/nifi/tutorial.md diff --git a/docs/conf.py b/docs/conf.py index d6897de0..b2ff30b7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -77,6 +77,8 @@ r"https://openai.com/index/gpt-4/.*", # 403 Client Error: Forbidden for url r"https://www.npmjs.com/", + # Out of service. + r"https://s3.amazonaws.com/nyc-tlc/.*", # 2025-09-29: Phased out CrateDB 3.3 docs r"https://cratedb.com/docs/crate/reference/en/3.3/", # 403 Client Error: Forbidden for url diff --git a/docs/integrate/nifi/index.md b/docs/integrate/nifi/index.md index d6f3e978..26934621 100644 --- a/docs/integrate/nifi/index.md +++ b/docs/integrate/nifi/index.md @@ -39,8 +39,8 @@ worldwide across every industry. ::::{grid} 2 :::{grid-item-card} Connect Apache NiFi and CrateDB -:link: https://community.cratedb.com/t/connecting-to-cratedb-from-apache-nifi/647 -:link-type: url +:link: nifi-tutorial +:link-type: ref Learn how to ingest data into CrateDB using Apache NiFi. ::: @@ -50,6 +50,12 @@ Learn how to ingest data into CrateDB using Apache NiFi. 
[CrateDB and Apache NiFi] ``` +:::{toctree} +:maxdepth: 1 +:hidden: +Tutorial +::: + [Apache NiFi]: https://nifi.apache.org/ [CrateDB and Apache NiFi]: https://cratedb.com/integrations/cratedb-and-apache-nifi diff --git a/docs/integrate/nifi/tutorial.md b/docs/integrate/nifi/tutorial.md new file mode 100644 index 00000000..fced06a4 --- /dev/null +++ b/docs/integrate/nifi/tutorial.md @@ -0,0 +1,97 @@ +(nifi-tutorial)= +# Connecting to CrateDB from Apache NiFi + +This article describes how to connect from [Apache NiFi](http://nifi.apache.org) to CrateDB and ingest data from NiFi into CrateDB. + +## Prerequisites +To follow this article, you will need: +* A CrateDB cluster +* An Apache NiFi installation that can connect to the CrateDB cluster + +## Configure +First, we will set up a connection pool to CrateDB: + 1. On the main NiFi web interface, click the gear icon of your process group ("NiFi Flow" by default). + 2. Switch to "Controller Services" and click the plus icon to add a new controller. + 3. Choose "DBCPConnectionPool" as type and click "Add". + 4. Open the settings of the newly created connection pool and switch to "Properties". The table below describes in more detail which parameters need to be changed. 
+ +| Parameter | Description | Sample value | +| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- | +| Database Connection URL | The JDBC connection string pointing to CrateDB | `jdbc:postgresql://:5432/doc?ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory` | +| Database Driver Class Name | The PostgreSQL JDBC driver class name | `org.postgresql.Driver` | +| Database Driver Location(s)| [Download](https://jdbc.postgresql.org/download/) the latest PostgreSQL JDBC driver and place it on the file system of the NiFi host | `/opt/nifi/nifi-1.13.2/postgresql-42.2.23.jar` | +| Database User | The CrateDB user name | | +| Password | The password of your CrateDB user | | + + 5. After applying the changed properties, click the flash icon to enable the service. + +Now the connection pool is ready to be used in one of NiFi's processors. + +## Example: Read from CSV files +One common use case is to design a process in NiFi that results in data being ingested into CrateDB. As an example, we will take a CSV file from the [NYC Taxi Data](https://github.com/toddwschneider/nyc-taxi-data) repository, process it in NiFi, and then ingest it into Crate DB. + +To achieve high throughput, NiFi uses by default prepared statements with configurable batch size. The optimal batch size depends on your concrete use case, 500 is typically a good starting point. Please also see the documentation on [insert performance](https://crate.io/docs/crate/howtos/en/latest/performance/inserts/index.html) for additional information. 
+ +![Screenshot 2021-04-20 at 13.58.18|576x500](https://us1.discourse-cdn.com/flex020/uploads/crate/original/1X/474e6e5a44eb5df4928599e23b3ca2a00392b56f.png){height=480} + +In CrateDB, we first create the corresponding target table: + +```sql +CREATE TABLE "doc"."yellow_taxi_trips" ( + "vendor_id" TEXT, + "pickup_datetime" TIMESTAMP WITH TIME ZONE, + "dropoff_datetime" TIMESTAMP WITH TIME ZONE, + "passenger_count" INTEGER, + "trip_distance" REAL, + "pickup_longitude" REAL, + "pickup_latitude" REAL, + "rate_code" INTEGER, + "store_and_fwd_flag" TEXT, + "dropoff_longitude" REAL, + "dropoff_latitude" REAL, + "payment_type" TEXT, + "fare_amount" REAL, + "surcharge" REAL, + "mta_tax" REAL, + "tip_amount" REAL, + "tolls_amount" REAL, + "total_amount" REAL +); +``` + +After configuring the processors as described below, click the start icon on the process group window. You should see rows appearing in CrateDB after a short amount of time. If you encounter any issues, please also check NiFi's log files (`log/nifi-bootstrap.log` and `log/nifi-app.log`). + +### GetFile +The `GetFile` processor points to a local directory that contains the file [yellow_tripdata_2013-08.csv](https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-08.csv). + +### PutDatabaseRecord +The PutDatabaseRecord has a couple of properties that need to be configured: +* Record Reader: CSVReader. The CSVReader is configured to use "Use String Fields From Header" as a "Schema Access Strategy". 
+* Database Type: PostgreSQL +* Statement Type: INSERT +* Database Connection Pooling Service: The connection pool created previously +* Schema Name: `doc` +* Table Name: `yellow_taxi_trips` +* Maximum Batch Size: 200 + +## Example: Read from another SQL-based database +Data can be also be read from a SQL database and then be inserted into CrateDB: +![Screenshot 2021-07-15 at 09.59.36|690x229](https://us1.discourse-cdn.com/flex020/uploads/crate/original/1X/ee51baa35eddf540838d7d784cb433a1e16e1b02.png) +### ExecuteSQLRecord +Reads rows from the source database. +* Database Connection Pooling Service: A connection pool pointing to the source database +* SQL select query: The SQL query to retrieve rows as needed +* RecordWriter: JsonRecordSetWriter. JSON files are required by the following processors for conversion into SQL statements. + +### ConvertJSONToSQL +Converts the generated JSON files into SQL statements. +* JDBC Connection Pool: A connection pool pointing to CrateDB +* Statement Type: INSERT +* Table Name: Name of the target table in CrateDB (without schema name) +* Schema Name: The table's schema name in CrateDB + +### PutSQL +Executes the previously generated SQL statements as prepared statements. +* JDBC Connection Pool: A connection pool pointing to CrateDB +* SQL Statement: No value set +* Batch Size: 500 (the optimal value for your use case might vary) From 280b0a6d4c193ace1723bb89fa4ef5c395e33d81 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 16 Sep 2025 23:09:07 +0200 Subject: [PATCH 2/3] NiFi: Implement suggestions by CodeRabbit --- docs/integrate/nifi/index.md | 2 +- docs/integrate/nifi/tutorial.md | 67 +++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 26 deletions(-) diff --git a/docs/integrate/nifi/index.md b/docs/integrate/nifi/index.md index 26934621..fbde18e8 100644 --- a/docs/integrate/nifi/index.md +++ b/docs/integrate/nifi/index.md @@ -41,7 +41,7 @@ worldwide across every industry. 
:::{grid-item-card} Connect Apache NiFi and CrateDB :link: nifi-tutorial :link-type: ref -Learn how to ingest data into CrateDB using Apache NiFi. +Connect Apache NiFi to CrateDB and ingest data. ::: :::: diff --git a/docs/integrate/nifi/tutorial.md b/docs/integrate/nifi/tutorial.md index fced06a4..754801e0 100644 --- a/docs/integrate/nifi/tutorial.md +++ b/docs/integrate/nifi/tutorial.md @@ -1,40 +1,45 @@ (nifi-tutorial)= # Connecting to CrateDB from Apache NiFi -This article describes how to connect from [Apache NiFi](http://nifi.apache.org) to CrateDB and ingest data from NiFi into CrateDB. +Learn how to connect from [Apache NiFi](https://nifi.apache.org) to CrateDB +and ingest data from NiFi into CrateDB. ## Prerequisites -To follow this article, you will need: +You need: * A CrateDB cluster * An Apache NiFi installation that can connect to the CrateDB cluster ## Configure -First, we will set up a connection pool to CrateDB: - 1. On the main NiFi web interface, click the gear icon of your process group ("NiFi Flow" by default). - 2. Switch to "Controller Services" and click the plus icon to add a new controller. - 3. Choose "DBCPConnectionPool" as type and click "Add". - 4. Open the settings of the newly created connection pool and switch to "Properties". The table below describes in more detail which parameters need to be changed. +Set up a connection pool to CrateDB: +1. On the main NiFi web interface, click the gear icon of your process group ("NiFi Flow" by default). +2. Switch to "Controller Services" and click the plus icon to add a new controller. +3. Choose "DBCPConnectionPool" as type and click "Add". +4. 
Open the new connection pool, switch to "Properties", and set the following parameters: -| Parameter | Description | Sample value | -| -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- | -| Database Connection URL | The JDBC connection string pointing to CrateDB | `jdbc:postgresql://:5432/doc?ssl=true&sslfactory=org.postgresql.ssl.NonValidatingFactory` | -| Database Driver Class Name | The PostgreSQL JDBC driver class name | `org.postgresql.Driver` | -| Database Driver Location(s)| [Download](https://jdbc.postgresql.org/download/) the latest PostgreSQL JDBC driver and place it on the file system of the NiFi host | `/opt/nifi/nifi-1.13.2/postgresql-42.2.23.jar` | -| Database User | The CrateDB user name | | -| Password | The password of your CrateDB user | | +| Parameter | Description | Sample value | +| -------------------------- |----------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------| +| Database Connection URL | The JDBC connection string pointing to CrateDB | `jdbc:postgresql://<CrateDB host>:5432/doc?sslmode=verify-full&sslrootcert=/path/to/ca.pem` | +| Database Driver Class Name | The PostgreSQL JDBC driver class name | `org.postgresql.Driver` | +| Database Driver Location(s)| [Download](https://jdbc.postgresql.org/download/) the latest PostgreSQL JDBC driver and place it on the file system of the NiFi host | `${nifi.home}/lib/postgresql-42.7.x.jar` | +| Database User | The CrateDB user name | | +| Password | The password of your CrateDB user | | - 5. After applying the changed properties, click the flash icon to enable the service. +5. 
Apply the properties, then click the lightning bolt to enable the service. -Now the connection pool is ready to be used in one of NiFi's processors. +You can now use the connection pool in NiFi processors. ## Example: Read from CSV files -One common use case is to design a process in NiFi that results in data being ingested into CrateDB. As an example, we will take a CSV file from the [NYC Taxi Data](https://github.com/toddwschneider/nyc-taxi-data) repository, process it in NiFi, and then ingest it into Crate DB. +One common use case is to design a process in NiFi that results in data being +ingested into CrateDB. This example takes a CSV file from the +[NYC Taxi Data](https://github.com/toddwschneider/nyc-taxi-data) repository, +processes it in NiFi, and then ingests it into CrateDB. -To achieve high throughput, NiFi uses by default prepared statements with configurable batch size. The optimal batch size depends on your concrete use case, 500 is typically a good starting point. Please also see the documentation on [insert performance](https://crate.io/docs/crate/howtos/en/latest/performance/inserts/index.html) for additional information. +NiFi uses prepared statements and batching by default. Start with a batch size +of 500 and adjust to your workload. See [insert performance] for details. ![Screenshot 2021-04-20 at 13.58.18|576x500](https://us1.discourse-cdn.com/flex020/uploads/crate/original/1X/474e6e5a44eb5df4928599e23b3ca2a00392b56f.png){height=480} -In CrateDB, we first create the corresponding target table: +Create the corresponding target table in CrateDB: ```sql CREATE TABLE "doc"."yellow_taxi_trips" ( @@ -59,29 +64,38 @@ CREATE TABLE "doc"."yellow_taxi_trips" ( ); ``` -After configuring the processors as described below, click the start icon on the process group window. You should see rows appearing in CrateDB after a short amount of time. If you encounter any issues, please also check NiFi's log files (`log/nifi-bootstrap.log` and `log/nifi-app.log`). 
+After configuring the processors as described below, start the process group. Rows should appear in CrateDB shortly. To verify: + +```sql +SELECT count(*) FROM doc.yellow_taxi_trips; +``` +If you run into issues, check NiFi logs: `log/nifi-bootstrap.log` and +`log/nifi-app.log`. ### GetFile The `GetFile` processor points to a local directory that contains the file [yellow_tripdata_2013-08.csv](https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2013-08.csv). ### PutDatabaseRecord The PutDatabaseRecord has a couple of properties that need to be configured: -* Record Reader: CSVReader. The CSVReader is configured to use "Use String Fields From Header" as a "Schema Access Strategy". +* Record Reader: CSVReader + * Schema Access Strategy: "Use String Fields From Header" + * Treat First Line as Header: true * Database Type: PostgreSQL * Statement Type: INSERT * Database Connection Pooling Service: The connection pool created previously * Schema Name: `doc` * Table Name: `yellow_taxi_trips` -* Maximum Batch Size: 200 +* Maximum Batch Size: 500 ## Example: Read from another SQL-based database -Data can be also be read from a SQL database and then be inserted into CrateDB: +Read data from a SQL database and insert it into CrateDB: ![Screenshot 2021-07-15 at 09.59.36|690x229](https://us1.discourse-cdn.com/flex020/uploads/crate/original/1X/ee51baa35eddf540838d7d784cb433a1e16e1b02.png) + ### ExecuteSQLRecord Reads rows from the source database. * Database Connection Pooling Service: A connection pool pointing to the source database * SQL select query: The SQL query to retrieve rows as needed * RecordWriter: JsonRecordSetWriter. The following processors require JSON files for conversion into SQL statements. ### ConvertJSONToSQL Converts the generated JSON files into SQL statements. * JDBC Connection Pool: A connection pool pointing to CrateDB * Statement Type: INSERT * Table Name: Name of the target table in CrateDB (without schema name) * Schema Name: The table's schema name in CrateDB ### PutSQL @@ -94,4 +108,7 @@ Converts the generated JSON files into SQL statements. 
Executes the previously generated SQL statements as prepared statements. * JDBC Connection Pool: A connection pool pointing to CrateDB * SQL Statement: No value set -* Batch Size: 500 (the optimal value for your use case might vary) +* Batch Size: 500 (the optimal value varies by use case) + + +[insert performance]: https://crate.io/docs/crate/howtos/en/latest/performance/inserts/index.html From 5abfc82e23809c06129368abd5296130b5b2cce1 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Tue, 23 Sep 2025 21:15:05 +0200 Subject: [PATCH 3/3] NiFi: s/tutorial/usage/ --- docs/integrate/nifi/index.md | 4 ++-- docs/integrate/nifi/{tutorial.md => usage.md} | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) rename docs/integrate/nifi/{tutorial.md => usage.md} (99%) diff --git a/docs/integrate/nifi/index.md b/docs/integrate/nifi/index.md index fbde18e8..fbbdbce7 100644 --- a/docs/integrate/nifi/index.md +++ b/docs/integrate/nifi/index.md @@ -39,7 +39,7 @@ worldwide across every industry. ::::{grid} 2 :::{grid-item-card} Connect Apache NiFi and CrateDB -:link: nifi-tutorial +:link: nifi-usage :link-type: ref Connect Apache NiFi to CrateDB and ingest data. ::: @@ -53,7 +53,7 @@ Connect Apache NiFi to CrateDB and ingest data. :::{toctree} :maxdepth: 1 :hidden: -Tutorial +Usage <usage> ::: diff --git a/docs/integrate/nifi/tutorial.md b/docs/integrate/nifi/usage.md similarity index 99% rename from docs/integrate/nifi/tutorial.md rename to docs/integrate/nifi/usage.md index 754801e0..910318e9 100644 --- a/docs/integrate/nifi/tutorial.md +++ b/docs/integrate/nifi/usage.md @@ -1,4 +1,4 @@ -(nifi-tutorial)= +(nifi-usage)= # Connecting to CrateDB from Apache NiFi Learn how to connect from [Apache NiFi](https://nifi.apache.org) to CrateDB