diff --git a/mddocs/docs/_static/images/icon.svg b/mddocs/docs/_static/images/icon.svg
new file mode 100644
index 000000000..a4d737f81
--- /dev/null
+++ b/mddocs/docs/_static/images/icon.svg
@@ -0,0 +1,11 @@
+
diff --git a/mddocs/docs/_static/images/logo.svg b/mddocs/docs/_static/images/logo.svg
new file mode 100644
index 000000000..76527ebf1
--- /dev/null
+++ b/mddocs/docs/_static/images/logo.svg
@@ -0,0 +1,214 @@
+
diff --git a/mddocs/docs/_static/images/logo_wide.svg b/mddocs/docs/_static/images/logo_wide.svg
new file mode 100644
index 000000000..981bf0148
--- /dev/null
+++ b/mddocs/docs/_static/images/logo_wide.svg
@@ -0,0 +1,329 @@
+
diff --git a/mddocs/docs/_static/stylesheets/autodoc_pydantic.css b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css
new file mode 100644
index 000000000..db37fda45
--- /dev/null
+++ b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css
@@ -0,0 +1,11 @@
+.autodoc_pydantic_validator_arrow {
+ padding-left: 8px;
+ }
+
+.autodoc_pydantic_collapsable_json {
+ cursor: pointer;
+ }
+
+.autodoc_pydantic_collapsable_erd {
+ cursor: pointer;
+ }
diff --git a/mddocs/docs/changelog/0.10.0.md b/mddocs/docs/changelog/0.10.0.md
new file mode 100644
index 000000000..69afb2e60
--- /dev/null
+++ b/mddocs/docs/changelog/0.10.0.md
@@ -0,0 +1,533 @@
+# 0.10.0 (2023-12-18) { #DBR-onetl-changelog-0-10-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-10-0-breaking-changes }
+
+- Upgrade `etl-entities` from v1 to v2 ([#172](https://github.com/MTSWebServices/onetl/pull/172)).
+
+    This implies that `HWM` classes now have a different internal structure than they used to.
+
+ Before:
+
+ ```python
+ from etl_entities.old_hwm import IntHWM as OldIntHWM
+ from etl_entities.source import Column, Table
+ from etl_entities.process import Process
+
+ hwm = OldIntHWM(
+ process=Process(name="myprocess", task="abc", dag="cde", host="myhost"),
+ source=Table(name="schema.table", instance="postgres://host:5432/db"),
+ column=Column(name="col1"),
+ value=123,
+ )
+ ```
+
+ After:
+
+ ```python
+ from etl_entities.hwm import ColumnIntHWM
+
+ hwm = ColumnIntHWM(
+ name="some_unique_name",
+ description="any value you want",
+ source="schema.table",
+ expression="col1",
+ value=123,
+ )
+ ```
+
+ **Breaking change:** If you used HWM classes from `etl_entities` module, you should rewrite your code to make it compatible with new version.
+
+??? "More details"
+
+ - `HWM` classes used by previous onETL versions were moved from `etl_entities` to `etl_entities.old_hwm` submodule. They are here for compatibility reasons, but are planned to be removed in `etl-entities` v3 release.
+ - New `HWM` classes have flat structure instead of nested.
+ - New `HWM` classes have mandatory `name` attribute (it was known as `qualified_name` before).
+ - Type aliases used while serializing and deserializing `HWM` objects to `dict` representation were changed too: `int` → `column_int`.
+
+ To make migration simpler, you can use new method:
+
+ ```python
+ old_hwm = OldIntHWM(...)
+ new_hwm = old_hwm.as_new_hwm()
+ ```
+
+ Which automatically converts all fields from old structure to new one, including `qualified_name` → `name`.
+
+- **Breaking changes:**
+
+ - Methods `BaseHWMStore.get()` and `BaseHWMStore.save()` were renamed to `get_hwm()` and `set_hwm()`.
+ - They now can be used only with new HWM classes from `etl_entities.hwm`, **old HWM classes are not supported**.
+
+ If you used them in your code, please update it accordingly.
+
+- YAMLHWMStore **CANNOT read files created by older onETL versions** (0.9.x or older).
+
+??? "Update procedure"
+
+ ```python
+ # pip install onetl==0.9.5
+
+ # Get qualified_name for HWM
+
+
+ # Option 1. HWM is built manually
+ from etl_entities import IntHWM, FileListHWM
+ from etl_entities.source import Column, Table, RemoteFolder
+ from etl_entities.process import Process
+
+ # for column HWM
+ old_column_hwm = IntHWM(
+ process=Process(name="myprocess", task="abc", dag="cde", host="myhost"),
+ source=Table(name="schema.table", instance="postgres://host:5432/db"),
+ column=Column(name="col1"),
+ )
+ qualified_name = old_column_hwm.qualified_name
+ # "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost"
+
+ # for file HWM
+ old_file_hwm = FileListHWM(
+ process=Process(name="myprocess", task="abc", dag="cde", host="myhost"),
+ source=RemoteFolder(name="/absolute/path", instance="ftp://ftp.server:21"),
+ )
+ qualified_name = old_file_hwm.qualified_name
+ # "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost"
+
+
+ # Option 2. HWM is generated automatically (by DBReader/FileDownloader)
+ # See onETL logs and search for string like qualified_name = '...'
+
+ qualified_name = "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost"
+
+
+ # Get .yml file path by qualified_name
+
+ import os
+ from pathlib import PurePosixPath
+ from onetl.hwm.store import YAMLHWMStore
+
+ # here you should pass the same arguments as used on production, if any
+ yaml_hwm_store = YAMLHWMStore()
+ hwm_path = yaml_hwm_store.get_file_path(qualified_name)
+ print(hwm_path)
+
+ # for column HWM
+ # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/col1__schema.table__postgres_host_5432_db__cde.abc.myprocess__myhost.yml')
+
+ # for file HWM
+ # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/file_list__absolute_path__ftp_ftp.server_21__cde.abc.myprocess__myhost.yml')
+
+
+ # Read raw .yml file content
+
+ from yaml import safe_load, dump
+
+ raw_old_hwm_items = safe_load(hwm_path.read_text())
+ print(raw_old_hwm_items)
+
+ # for column HWM
+ # [
+ # {
+ # "column": { "name": "col1", "partition": {} },
+ # "modified_time": "2023-12-18T10: 39: 47.377378",
+ # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" },
+ # "source": { "instance": "postgres: //host:5432/db", "name": "schema.table" },
+ # "type": "int",
+ # "value": "123",
+ # },
+ # ]
+
+ # for file HWM
+ # [
+ # {
+ # "modified_time": "2023-12-18T11:15:36.478462",
+ # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" },
+ # "source": { "instance": "ftp://ftp.server:21", "name": "/absolute/path" },
+ # "type": "file_list",
+ # "value": ["file1.txt", "file2.txt"],
+ # },
+ # ]
+
+
+ # Convert file content to new structure, compatible with onETL 0.10.x
+ raw_new_hwm_items = []
+ for old_hwm in raw_old_hwm_items:
+ new_hwm = {"name": qualified_name, "modified_time": old_hwm["modified_time"]}
+
+ if "column" in old_hwm:
+ new_hwm["expression"] = old_hwm["column"]["name"]
+ new_hwm["entity"] = old_hwm["source"]["name"]
+ old_hwm.pop("process", None)
+
+ if old_hwm["type"] == "int":
+ new_hwm["type"] = "column_int"
+ new_hwm["value"] = old_hwm["value"]
+
+ elif old_hwm["type"] == "date":
+ new_hwm["type"] = "column_date"
+ new_hwm["value"] = old_hwm["value"]
+
+ elif old_hwm["type"] == "datetime":
+ new_hwm["type"] = "column_datetime"
+ new_hwm["value"] = old_hwm["value"]
+
+ elif old_hwm["type"] == "file_list":
+ new_hwm["type"] = "file_list"
+ new_hwm["value"] = [
+ os.fspath(PurePosixPath(old_hwm["source"]["name"]).joinpath(path))
+ for path in old_hwm["value"]
+ ]
+
+ else:
+ raise ValueError("WAT?")
+
+ raw_new_hwm_items.append(new_hwm)
+
+
+ print(raw_new_hwm_items)
+ # for column HWM
+ # [
+ # {
+ # "name": "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost",
+ # "modified_time": "2023-12-18T10:39:47.377378",
+ # "expression": "col1",
+ # "source": "schema.table",
+ # "type": "column_int",
+ # "value": 123,
+ # },
+ # ]
+
+ # for file HWM
+ # [
+ # {
+ # "name": "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost",
+ # "modified_time": "2023-12-18T11:15:36.478462",
+ # "entity": "/absolute/path",
+ # "type": "file_list",
+ # "value": ["/absolute/path/file1.txt", "/absolute/path/file2.txt"],
+ # },
+ # ]
+
+
+ # Save file with new content
+ with open(hwm_path, "w") as file:
+ dump(raw_new_hwm_items, file)
+
+
+ # Stop Python interpreter and update onETL
+ # pip install onetl==0.10.0
+ # Check that new .yml file can be read
+
+ from onetl.hwm.store import YAMLHWMStore
+
+ qualified_name = ...
+
+ # here you should pass the same arguments as used on production, if any
+ yaml_hwm_store = YAMLHWMStore()
+ yaml_hwm_store.get_hwm(qualified_name)
+
+ # for column HWM
+ # ColumnIntHWM(
+ # name='col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost',
+ # description='',
+ # entity='schema.table',
+ # value=123,
+ # expression='col1',
+ # modified_time=datetime.datetime(2023, 12, 18, 10, 39, 47, 377378),
+ # )
+
+ # for file HWM
+ # FileListHWM(
+ # name='file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost',
+ # description='',
+ # entity=AbsolutePath('/absolute/path'),
+ # value=frozenset({AbsolutePath('/absolute/path/file1.txt'), AbsolutePath('/absolute/path/file2.txt')}),
+ # expression=None,
+ # modified_time=datetime.datetime(2023, 12, 18, 11, 15, 36, 478462)
+ # )
+
+
+ # That's all!
+ ```
+
+But most users use other HWM store implementations, which do not have such issues.
+
+- Several classes and functions were moved from `onetl` to `etl_entities`:
+
+=== "onETL `0.9.x` and older"
+
+ ```python
+ from onetl.hwm.store import (
+ detect_hwm_store,
+ BaseHWMStore,
+ HWMStoreClassRegistry,
+ register_hwm_store_class,
+ HWMStoreManager,
+ MemoryHWMStore,
+ )
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python
+ from etl_entities.hwm_store import (
+ detect_hwm_store,
+ BaseHWMStore,
+ HWMStoreClassRegistry,
+ register_hwm_store_class,
+ HWMStoreManager,
+ MemoryHWMStore,
+ )
+ ```
+
+ They still can be imported from old module, but this is deprecated and will be removed in v1.0.0 release.
+
+- Change the way of passing `HWM` to `DBReader` and `FileDownloader` classes:
+
+=== "onETL `0.9.x` and older"
+
+ ```python linenums="1" hl_lines="12-21"
+ # Simple
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm_column="col1",
+ )
+
+
+ # Complex
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm_column=(
+ "col1",
+ "cast(col1 as date)",
+ ),
+ )
+
+
+ # Files
+ downloader = FileDownloader(
+ connection=...,
+ source_path=...,
+ target_path=...,
+ hwm_type="file_list",
+ )
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python linenums="1" hl_lines="12-21"
+ # Simple
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm=DBReader.AutoDetectHWM(
+ # name is mandatory now!
+ name="my_unique_hwm_name",
+ expression="col1",
+ ),
+ )
+
+ # Complex
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm=DBReader.AutoDetectHWM(
+ # name is mandatory now!
+ name="my_unique_hwm_name",
+ expression="cast(col1 as date)",
+ ),
+ )
+
+ # Files
+ downloader = FileDownloader(
+ connection=...,
+ source_path=...,
+ target_path=...,
+ hwm=FileListHWM(
+ # name is mandatory now!
+ name="another_unique_hwm_name",
+ ),
+ )
+ ```
+
+    New HWM classes have a **mandatory** `name` attribute which should be passed explicitly,
+    instead of generating it automatically under the hood.
+
+ Automatic `name` generation using the old `DBReader.hwm_column` / `FileDownloader.hwm_type`
+ syntax is still supported, but will be removed in v1.0.0 release. ([#179](https://github.com/MTSWebServices/onetl/pull/179))
+
+- Performance of read Incremental and Batch strategies has been drastically improved. ([#182](https://github.com/MTSWebServices/onetl/pull/182)).
+
+??? "Before and after in details"
+
+ `DBReader.run()` + incremental/batch strategy behavior in versions 0.9.x and older:
+
+    - Get table schema by making query `SELECT * FROM table WHERE 1=0` (if `DBReader.columns` has `*`)
+    - Expand `*` to real column names from the table, add `hwm_column` here, remove duplicates (as some RDBMS do not allow that).
+ - Create dataframe from query like `SELECT hwm_expression AS hwm_column, ...other table columns... FROM table WHERE hwm_expression > prev_hwm.value`.
+ - Determine HWM class using dataframe schema: `df.schema[hwm_column].dataType`.
+    - Determine max HWM column value using Spark: `df.select(max(hwm_column)).collect()`.
+ - Use `max(hwm_column)` as next HWM value, and save it to HWM Store.
+ - Return dataframe to user.
+
+ This was far from ideal:
+
+ - Dataframe content (all rows or just changed ones) was loaded from the source to Spark only to get min/max values of specific column.
+
+ - Step of fetching table schema and then substituting column names in the next query caused some unexpected errors.
+
+ For example, source contains columns with mixed name case, like `"CamelColumn"` or `"spaced column"`.
+
+ Column names were *not* escaped during query generation, leading to queries that cannot be executed by database.
+
+    So users have to *explicitly* pass column names to `DBReader`, wrapping columns with mixed naming with `"`:
+
+ ```python
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[ # passing '*' here leads to wrong SQL query generation
+ "normal_column",
+ '"CamelColumn"',
+ '"spaced column"',
+ ...,
+ ],
+ )
+ ```
+ - Using `DBReader` with `IncrementalStrategy` could lead to reading rows already read before.
+
+ Dataframe was created from query with WHERE clause like `hwm.expression > prev_hwm.value`,
+ not `hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`.
+
+ So if new rows appeared in the source **after** HWM value is determined,
+ they can be read by accessing dataframe content (because Spark dataframes are lazy),
+ leading to inconsistencies between HWM value and dataframe content.
+
+    This may lead to issues when `DBReader.run()` reads some data, updates the HWM value, and the next call of `DBReader.run()`
+    will read rows that were already read in the previous run.
+
+ `DBReader.run()` + incremental/batch strategy behavior in versions 0.10.x and newer:
+
+ - Detect type of HWM expression: `SELECT hwm.expression FROM table WHERE 1=0`.
+    - Determine corresponding Spark type `df.schema[0]` and then determine matching HWM class (if `DBReader.AutoDetectHWM` is used).
+ - Get min/max values by querying the source: `SELECT MAX(hwm.expression) FROM table WHERE hwm.expression >= prev_hwm.value`.
+ - Use `max(hwm.expression)` as next HWM value, and save it to HWM Store.
+ - Create dataframe from query `SELECT ... table columns ... FROM table WHERE hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`, baking new HWM value into the query.
+ - Return dataframe to user.
+
+ Improvements:
+
+ - Allow source to calculate min/max instead of loading everything to Spark. This should be **faster** on large amounts of data (**up to x2**), because we do not transfer all the data from the source to Spark. This can be even faster if source have indexes for HWM column.
+ - Columns list is passed to source as-is, without any resolving on `DBReader` side. So you can pass `DBReader(columns=["*"])` to read tables with mixed columns naming.
+ - Restrict dataframe content to always match HWM values, which leads to never reading the same row twice.
+
+ **Breaking change**: HWM column is not being implicitly added to dataframe. It was a part of `SELECT` clause, but now it is mentioned only in `WHERE` clause.
+
+ So if you had code like this, you have to rewrite it:
+
+=== "onETL `0.9.x` and older"
+
+ ```python linenums="1" hl_lines="1-16"
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ ],
+ hwm_column="hwm_col",
+ )
+
+ df = reader.run()
+ # hwm_column value is in the dataframe
+ assert df.columns == ["col1", "col2", "hwm_col"]
+
+
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ ],
+ hwm_column=(
+ "hwm_col",
+ "cast(hwm_col as int)",
+ ),
+ )
+
+ df = reader.run()
+ # hwm_expression value is in the dataframe
+ assert df.columns == ["col1", "col2", "hwm_col"]
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python linenums="1" hl_lines="1-16"
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ # add hwm_column explicitly
+ "hwm_col",
+ ],
+ hwm_column="hwm_col",
+ )
+
+ df = reader.run()
+    # if columns list is not updated,
+    # this will fail
+ assert df.columns == ["col1", "col2", "hwm_col"]
+
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ # add hwm_expression explicitly
+ "cast(hwm_col as int) as hwm_col",
+ ],
+ hwm_column=(
+ "hwm_col",
+ "cast(hwm_col as int)",
+ ),
+ )
+ df = reader.run()
+    # if columns list is not updated,
+    # this will fail
+ assert df.columns == ["col1", "col2", "hwm_col"]
+ ```
+
+ But most users just use `columns=["*"]` anyway, they won't see any changes.
+
+- `FileDownloader.run()` now updates HWM in HWM Store not after each file is being successfully downloaded,
+ but after all files were handled.
+
+ This is because:
+
+ - FileDownloader can be used with `DownloadOptions(workers=N)`, which could lead to race condition - one thread can save to HWM store one HWM value when another thread will save different value.
+ - FileDownloader can download hundreds and thousands of files, and issuing a request to HWM Store for each file could potentially DDoS HWM Store. ([#189](https://github.com/MTSWebServices/onetl/pull/189))
+
+    There is an exception handler which tries to save HWM to the HWM store if the download process was interrupted. But if it was interrupted by force, like sending a `SIGKILL` signal,
+    HWM will not be saved to the HWM store, so some already downloaded files may be downloaded again next time.
+
+ But unexpected process kill may produce other negative impact, like some file will be downloaded partially, so this is an expected behavior.
+
+## Features { #DBR-onetl-changelog-0-10-0-features }
+
+- Add Python 3.12 compatibility. ([#167](https://github.com/MTSWebServices/onetl/pull/167))
+- `Excel` file format now can be used with Spark 3.5.0. ([#187](https://github.com/MTSWebServices/onetl/pull/187))
+- `SnapshotBatchStrategy` and `IncrementalBatchStrategy` do not raise exceptions if the source does not contain any data.
+  Instead they stop at the first iteration and return an empty dataframe. ([#188](https://github.com/MTSWebServices/onetl/pull/188))
+- Cache result of `connection.check()` in high-level classes like `DBReader`, `FileDownloader` and so on. This makes logs less verbose. ([#190](https://github.com/MTSWebServices/onetl/pull/190))
+
+## Bug Fixes { #DBR-onetl-changelog-0-10-0-bug-fixes }
+
+- Fix `@slot` and `@hook` decorators returning methods with missing arguments in signature (Pylance, VS Code). ([#183](https://github.com/MTSWebServices/onetl/pull/183))
+- Kafka connector documentation said that it does support reading topic data incrementally by passing `group.id` or `groupIdPrefix`.
+ Actually, this is not true, because Spark does not send information to Kafka which messages were consumed.
+ So currently users can only read the whole topic, no incremental reads are supported.
diff --git a/mddocs/docs/changelog/0.10.1.md b/mddocs/docs/changelog/0.10.1.md
new file mode 100644
index 000000000..58d2512f1
--- /dev/null
+++ b/mddocs/docs/changelog/0.10.1.md
@@ -0,0 +1,29 @@
+# 0.10.1 (2024-02-05) { #DBR-onetl-changelog-0-10-1 }
+
+## Features { #DBR-onetl-changelog-0-10-1-features }
+
+- Add support of `Incremental Strategies` for `Kafka` connection:
+
+ ```python
+ reader = DBReader(
+ connection=Kafka(...),
+ source="topic_name",
+ hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"),
+ )
+
+ with IncrementalStrategy():
+ df = reader.run()
+ ```
+
+ This lets you resume reading data from a Kafka topic starting at the last committed offset from your previous run. ([#202](https://github.com/MTSWebServices/onetl/pull/202))
+
+- Add `has_data`, `raise_if_no_data` methods to `DBReader` class. ([#203](https://github.com/MTSWebServices/onetl/pull/203))
+
+- Update VMware Greenplum connector from `2.1.4` to `2.3.0`. This implies:
+ - Greenplum 7.x support
+ - [Kubernetes support](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg)
+ - New read option [gpdb.matchDistributionPolicy](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#distpolmotion)
+ which allows to match each Spark executor with specific Greenplum segment, avoiding redundant data transfer between Greenplum segments
+ - Allows overriding [Greenplum optimizer parameters](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#greenplum-gucs) in read/write operations ([#208](https://github.com/MTSWebServices/onetl/pull/208))
+
+- `Greenplum.get_packages()` method now accepts optional arg `package_version` which allows to override version of Greenplum connector package. ([#208](https://github.com/MTSWebServices/onetl/pull/208))
diff --git a/mddocs/docs/changelog/0.10.2.md b/mddocs/docs/changelog/0.10.2.md
new file mode 100644
index 000000000..930316a03
--- /dev/null
+++ b/mddocs/docs/changelog/0.10.2.md
@@ -0,0 +1,39 @@
+# 0.10.2 (2024-03-21) { #DBR-onetl-changelog-0-10-2 }
+
+## Features { #DBR-onetl-changelog-0-10-2-features }
+
+- Add support of Pydantic v2. ([#230](https://github.com/MTSWebServices/onetl/pull/230))
+
+## Improvements { #DBR-onetl-changelog-0-10-2-improvements }
+
+- Improve database connections documentation:
+ - Add "Types" section describing mapping between Clickhouse and Spark types
+ - Add "Prerequisites" section describing different aspects of connecting to Clickhouse
+ - Separate documentation of `DBReader` and `.sql()` / `.pipeline(...)`
+ - Add examples for `.fetch()` and `.execute()` ([#211](https://github.com/MTSWebServices/onetl/pull/211), [#228](https://github.com/MTSWebServices/onetl/pull/228), [#229](https://github.com/MTSWebServices/onetl/pull/229), [#233](https://github.com/MTSWebServices/onetl/pull/233), [#234](https://github.com/MTSWebServices/onetl/pull/234), [#235](https://github.com/MTSWebServices/onetl/pull/235), [#236](https://github.com/MTSWebServices/onetl/pull/236), [#240](https://github.com/MTSWebServices/onetl/pull/240))
+
+- Add notes to Greenplum documentation about issues with IP resolution and building `gpfdist` URL ([#228](https://github.com/MTSWebServices/onetl/pull/228))
+
+- Allow calling `MongoDB.pipeline(...)` with passing just collection name, without explicit aggregation pipeline. ([#237](https://github.com/MTSWebServices/onetl/pull/237))
+
+- Update default `Postgres(extra={...})` to include `{"stringtype": "unspecified"}` option.
+ This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities.
+
+ For example, now it is possible to read column of type `money` as Spark's `StringType()`, and write it back to the same column,
+ without using intermediate columns or tables. ([#229](https://github.com/MTSWebServices/onetl/pull/229))
+
+## Bug Fixes { #DBR-onetl-changelog-0-10-2-bug-fixes }
+
+- Return back handling of `DBReader(columns="string")`. This was a valid syntax up to the v0.10 release, but it was removed because
+  most users never used it. It looks like we were wrong, so we are returning this behavior back, but with a deprecation warning. ([#238](https://github.com/MTSWebServices/onetl/pull/238))
+
+- Downgrade Greenplum package version from `2.3.0` to `2.2.0`. ([#239](https://github.com/MTSWebServices/onetl/pull/239))
+
+ This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x.
+ Connector can open transaction with `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks.
+
+ For using this connector with Greenplum 7.x, please pass package version explicitly:
+
+ ```python
+ maven_packages = Greenplum.get_packages(package_version="2.3.0", ...)
+ ```
diff --git a/mddocs/docs/changelog/0.11.0.md b/mddocs/docs/changelog/0.11.0.md
new file mode 100644
index 000000000..b093991ad
--- /dev/null
+++ b/mddocs/docs/changelog/0.11.0.md
@@ -0,0 +1,212 @@
+# 0.11.0 (2024-05-27) { #DBR-onetl-changelog-0-11-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-11-0-breaking-changes }
+
+There can be some changes in connection behavior, related to version upgrades. So we mark these changes as **breaking**, although
+most users will not see any differences.
+
+- Update Clickhouse JDBC driver to latest version ([#249](https://github.com/MTSWebServices/onetl/pull/249)):
+ - Package was renamed `ru.yandex.clickhouse:clickhouse-jdbc` → `com.clickhouse:clickhouse-jdbc`.
+ - Package version changed `0.3.2` → `0.6.0-patch5`.
+ - Driver name changed `ru.yandex.clickhouse.ClickHouseDriver` → `com.clickhouse.jdbc.ClickHouseDriver`.
+
+ This brings up several fixes for Spark <-> Clickhouse type compatibility, and also Clickhouse clusters support.
+
+- Update other JDBC drivers to latest versions:
+ - MSSQL `12.2.0` → `12.6.2` ([#254](https://github.com/MTSWebServices/onetl/pull/254)).
+ - MySQL `8.0.33` → `8.4.0` ([#253](https://github.com/MTSWebServices/onetl/pull/253), [#285](https://github.com/MTSWebServices/onetl/pull/285)).
+ - Oracle `23.2.0.0` → `23.4.0.24.05` ([#252](https://github.com/MTSWebServices/onetl/pull/252), [#284](https://github.com/MTSWebServices/onetl/pull/284)).
+ - Postgres `42.6.0` → `42.7.3` ([#251](https://github.com/MTSWebServices/onetl/pull/251)).
+
+- Update MongoDB connector to latest version: `10.1.1` → `10.3.0` ([#255](https://github.com/MTSWebServices/onetl/pull/255), [#283](https://github.com/MTSWebServices/onetl/pull/283)).
+
+ This brings up Spark 3.5 support.
+
+- Update `XML` package to latest version: `0.17.0` → `0.18.0` ([#259](https://github.com/MTSWebServices/onetl/pull/259)).
+
+ This brings few bugfixes with datetime format handling.
+
+- For JDBC connections add new `SQLOptions` class for `DB.sql(query, options=...)` method ([#272](https://github.com/MTSWebServices/onetl/pull/272)).
+
+    Firstly, to keep naming more consistent.
+
+ Secondly, some of options are not supported by `DB.sql(...)` method, but supported by `DBReader`.
+ For example, `SQLOptions` do not support `partitioning_mode` and require explicit definition of `lower_bound` and `upper_bound` when `num_partitions` is greater than 1.
+ `ReadOptions` does support `partitioning_mode` and allows skipping `lower_bound` and `upper_bound` values.
+
+    This requires some code changes. Before:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+ df = postgres.sql(
+ """
+ SELECT *
+ FROM some.mytable
+ WHERE key = 'something'
+ """,
+ options=Postgres.ReadOptions(
+ partitioning_mode="range",
+ partition_column="id",
+ num_partitions=10,
+ ),
+ )
+ ```
+
+ After:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+ df = postgres.sql(
+ """
+ SELECT *
+ FROM some.mytable
+ WHERE key = 'something'
+ """,
+ options=Postgres.SQLOptions(
+ # partitioning_mode is not supported!
+ partition_column="id",
+ num_partitions=10,
+ lower_bound=0, # <-- set explicitly
+ upper_bound=1000, # <-- set explicitly
+ ),
+ )
+ ```
+
+ For now, `DB.sql(query, options=...)` can accept `ReadOptions` to keep backward compatibility, but emits deprecation warning.
+ The support will be removed in `v1.0.0`.
+
+- Split up `JDBCOptions` class into `FetchOptions` and `ExecuteOptions` ([#274](https://github.com/MTSWebServices/onetl/pull/274)).
+
+ New classes are used by `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` methods respectively.
+ This is mostly to keep naming more consistent.
+
+    This requires some code changes. Before:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+ df = postgres.fetch(
+ "SELECT * FROM some.mytable WHERE key = 'something'",
+ options=Postgres.JDBCOptions(
+ fetchsize=1000,
+ query_timeout=30,
+ ),
+ )
+
+ postgres.execute(
+ "UPDATE some.mytable SET value = 'new' WHERE key = 'something'",
+ options=Postgres.JDBCOptions(query_timeout=30),
+ )
+ ```
+
+ After:
+
+ ```python
+ from onetl.connection import Postgres
+
+ # Using FetchOptions for fetching data
+ postgres = Postgres(...)
+ df = postgres.fetch(
+ "SELECT * FROM some.mytable WHERE key = 'something'",
+ options=Postgres.FetchOptions( # <-- change class name
+ fetchsize=1000,
+ query_timeout=30,
+ ),
+ )
+
+ # Using ExecuteOptions for executing statements
+ postgres.execute(
+ "UPDATE some.mytable SET value = 'new' WHERE key = 'something'",
+ options=Postgres.ExecuteOptions(query_timeout=30), # <-- change class name
+ )
+ ```
+
+ For now, `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` can accept `JDBCOptions`, to keep backward compatibility,
+ but emit a deprecation warning. The old class will be removed in `v1.0.0`.
+
+- Serialize `ColumnDatetimeHWM` to Clickhouse's `DateTime64(6)` (precision up to microseconds) instead of `DateTime` (precision up to seconds) ([#267](https://github.com/MTSWebServices/onetl/pull/267)).
+
+ In previous onETL versions, `ColumnDatetimeHWM` value was rounded to the second, and thus reading some rows that were read in previous runs,
+ producing duplicates.
+
+ For Clickhouse versions below 21.1 comparing column of type `DateTime` with a value of type `DateTime64` is not supported, returning an empty dataframe.
+ To avoid this, replace:
+
+ ```python
+ DBReader(
+ ...,
+ hwm=DBReader.AutoDetectHWM(
+ name="my_hwm",
+ expression="hwm_column", # <--
+ ),
+ )
+ ```
+
+ with:
+
+ ```python
+ DBReader(
+ ...,
+ hwm=DBReader.AutoDetectHWM(
+ name="my_hwm",
+ expression="CAST(hwm_column AS DateTime64)", # <-- add explicit CAST
+ ),
+ )
+ ```
+
+- Pass JDBC connection extra params as `properties` dict instead of URL with query part ([#268](https://github.com/MTSWebServices/onetl/pull/268)).
+
+ This allows passing custom connection parameters like `Clickhouse(extra={"custom_http_options": "option1=value1,option2=value2"})`
+ without need to apply urlencode to parameter value, like `option1%3Dvalue1%2Coption2%3Dvalue2`.
+
+## Features { #DBR-onetl-changelog-0-11-0-features }
+
+Improve user experience with Kafka messages and Database tables with serialized columns, like JSON/XML.
+
+- Allow passing custom package version as argument for `DB.get_packages(...)` method of several DB connectors:
+ - `Clickhouse.get_packages(package_version=..., apache_http_client_version=...)` ([#249](https://github.com/MTSWebServices/onetl/pull/249)).
+ - `MongoDB.get_packages(scala_version=..., spark_version=..., package_version=...)` ([#255](https://github.com/MTSWebServices/onetl/pull/255)).
+ - `MySQL.get_packages(package_version=...)` ([#253](https://github.com/MTSWebServices/onetl/pull/253)).
+ - `MSSQL.get_packages(java_version=..., package_version=...)` ([#254](https://github.com/MTSWebServices/onetl/pull/254)).
+ - `Oracle.get_packages(java_version=..., package_version=...)` ([#252](https://github.com/MTSWebServices/onetl/pull/252)).
+ - `Postgres.get_packages(package_version=...)` ([#251](https://github.com/MTSWebServices/onetl/pull/251)).
+ - `Teradata.get_packages(package_version=...)` ([#256](https://github.com/MTSWebServices/onetl/pull/256)).
+ Now users can downgrade or upgrade connection without waiting for next onETL release. Previously only `Kafka` and `Greenplum` supported this feature.
+- Add `FileFormat.parse_column(...)` method to several classes:
+ - `Avro.parse_column(col)` ([#265](https://github.com/MTSWebServices/onetl/pull/265)).
+ - `JSON.parse_column(col, schema=...)` ([#257](https://github.com/MTSWebServices/onetl/pull/257)).
+ - `CSV.parse_column(col, schema=...)` ([#258](https://github.com/MTSWebServices/onetl/pull/258)).
+ - `XML.parse_column(col, schema=...)` ([#269](https://github.com/MTSWebServices/onetl/pull/269)).
+ This allows parsing data in `value` field of Kafka message or string/binary column of some table as a nested Spark structure.
+- Add `FileFormat.serialize_column(...)` method to several classes:
+ - `Avro.serialize_column(col)` ([#265](https://github.com/MTSWebServices/onetl/pull/265)).
+ - `JSON.serialize_column(col)` ([#257](https://github.com/MTSWebServices/onetl/pull/257)).
+ - `CSV.serialize_column(col)` ([#258](https://github.com/MTSWebServices/onetl/pull/258)).
+ This allows saving Spark nested structures or arrays to `value` field of Kafka message or string/binary column of some table.
+
+## Improvements { #DBR-onetl-changelog-0-11-0-improvements }
+
+Few documentation improvements.
+
+- Replace all `assert` in documentation with doctest syntax. This should make documentation more readable ([#273](https://github.com/MTSWebServices/onetl/pull/273)).
+- Add generic `Troubleshooting` guide ([#275](https://github.com/MTSWebServices/onetl/pull/275)).
+- Improve Kafka documentation:
+ - Add "Prerequisites" page describing different aspects of connecting to Kafka.
+ - Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes.
+ - Add "Troubleshooting" page ([#276](https://github.com/MTSWebServices/onetl/pull/276)).
+- Improve Hive documentation:
+ - Add "Prerequisites" page describing different aspects of connecting to Hive.
+ - Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations.
+ - Improve "Executing statements in Hive" page of Hive documentation. ([#278](https://github.com/MTSWebServices/onetl/pull/278)).
+- Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. ([#279](https://github.com/MTSWebServices/onetl/pull/279)).
+- Add note about connecting to Clickhouse cluster. ([#280](https://github.com/MTSWebServices/onetl/pull/280)).
+- Add notes about versions when specific class/method/attribute/argument was added, renamed or changed behavior ([#282](https://github.com/MTSWebServices/onetl/pull/282)).
+
+## Bug Fixes { #DBR-onetl-changelog-0-11-0-bug-fixes }
+
+- Fix missing `pysmb` package after installing `pip install onetl[files]`.
diff --git a/mddocs/docs/changelog/0.11.1.md b/mddocs/docs/changelog/0.11.1.md
new file mode 100644
index 000000000..59d72a7b0
--- /dev/null
+++ b/mddocs/docs/changelog/0.11.1.md
@@ -0,0 +1,9 @@
+# 0.11.1 (2024-05-29) { #DBR-onetl-changelog-0-11-1 }
+
+## Features { #DBR-onetl-changelog-0-11-1-features }
+
+- Change `MSSQL.port` default from `1433` to `None`, allowing use of `instanceName` to detect port number. ([#287](https://github.com/MTSWebServices/onetl/pull/287))
+
+## Bug Fixes { #DBR-onetl-changelog-0-11-1-bug-fixes }
+
+- Remove `fetchsize` from `JDBC.WriteOptions`. ([#288](https://github.com/MTSWebServices/onetl/pull/288))
diff --git a/mddocs/docs/changelog/0.11.2.md b/mddocs/docs/changelog/0.11.2.md
new file mode 100644
index 000000000..dcacef9f7
--- /dev/null
+++ b/mddocs/docs/changelog/0.11.2.md
@@ -0,0 +1,5 @@
+# 0.11.2 (2024-09-02) { #DBR-onetl-changelog-0-11-2 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-11-2-bug-fixes }
+
+- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. ([#308](https://github.com/MTSWebServices/onetl/pull/308))
diff --git a/mddocs/docs/changelog/0.12.0.md b/mddocs/docs/changelog/0.12.0.md
new file mode 100644
index 000000000..f26bc4505
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.0.md
@@ -0,0 +1,54 @@
+# 0.12.0 (2024-09-03) { #DBR-onetl-changelog-0-12-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-12-0-breaking-changes }
+
+- Change connection URL used for generating HWM names of S3 and Samba sources:
+ - `smb://host:port` -> `smb://host:port/share`
+ - `s3://host:port` -> `s3://host:port/bucket` ([#304](https://github.com/MTSWebServices/onetl/pull/304))
+- Update DB connectors/drivers to latest versions:
+ - Clickhouse `0.6.0-patch5` → `0.6.5`
+ - MongoDB `10.3.0` → `10.4.0`
+ - MSSQL `12.6.2` → `12.8.1`
+ - MySQL `8.4.0` → `9.0.0`
+ - Oracle `23.4.0.24.05` → `23.5.0.24.07`
+ - Postgres `42.7.3` → `42.7.4`
+- Update `Excel` package from `0.20.3` to `0.20.4`, to include Spark 3.5.1 support. ([#306](https://github.com/MTSWebServices/onetl/pull/306))
+
+## Features { #DBR-onetl-changelog-0-12-0-features }
+
+- Add support for specifying file formats (`ORC`, `Parquet`, `CSV`, etc.) in `HiveWriteOptions.format` ([#292](https://github.com/MTSWebServices/onetl/pull/292)):
+
+ ```python
+ Hive.WriteOptions(format=ORC(compression="snappy"))
+ ```
+
+- Collect Spark execution metrics in following methods, and log then in DEBUG mode:
+ - `DBWriter.run()`
+ - `FileDFWriter.run()`
+ - `Hive.sql()`
+ - `Hive.execute()`
+
+ This is implemented using custom `SparkListener` which wraps the entire method call, and
+ then report collected metrics. But these metrics sometimes may be missing due to Spark architecture,
+ so they are not reliable source of information. That's why logs are printed only in DEBUG mode, and
+ are not returned as method call result. ([#303](https://github.com/MTSWebServices/onetl/pull/303))
+
+- Generate default `jobDescription` based on currently executed method. Examples:
+ - `DBWriter.run(schema.table) -> Postgres[host:5432/database]`
+ - `MongoDB[localhost:27017/admin] -> DBReader.has_data(mycollection)`
+ - `Hive[cluster].execute()`
+
+ If user already set custom `jobDescription`, it will left intact. ([#304](https://github.com/MTSWebServices/onetl/pull/304))
+
+- Add log.info about JDBC dialect usage ([#305](https://github.com/MTSWebServices/onetl/pull/305)):
+
+ ```text
+ |MySQL| Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect'
+ ```
+
+- Log estimated size of in-memory dataframe created by `JDBC.fetch` and `JDBC.execute` methods. ([#303](https://github.com/MTSWebServices/onetl/pull/303))
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-0-bug-fixes }
+
+- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. ([#308](https://github.com/MTSWebServices/onetl/pull/308))
+- Do not raise exception if yield-based hook has something past the first (and only) `yield`.
diff --git a/mddocs/docs/changelog/0.12.1.md b/mddocs/docs/changelog/0.12.1.md
new file mode 100644
index 000000000..f3126477c
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.1.md
@@ -0,0 +1,23 @@
+# 0.12.1 (2024-10-28) { #DBR-onetl-changelog-0-12-1 }
+
+## Features { #DBR-onetl-changelog-0-12-1-features }
+
+- Log detected JDBC dialect while using `DBWriter`.
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-1-bug-fixes }
+
+- Fix `SparkMetricsRecorder` failing when receiving
+ `SparkListenerTaskEnd` without `taskMetrics` (e.g. executor was
+ killed by OOM). ([#313](https://github.com/MTSWebServices/onetl/pull/313))
+- Call `kinit` before checking for HDFS active namenode.
+- Wrap `kinit` with `threading.Lock` to avoid multithreading issues.
+- Immediately show `kinit` errors to user, instead of hiding them.
+- Use `AttributeError` instead of `ImportError` in module's
+ `__getattr__` method, to make code compliant with Python spec.
+
+## Doc only Changes { #DBR-onetl-changelog-0-12-1-doc-only-changes }
+
+- Add note about
+ [spark-dialect-extension](https://github.com/MTSWebServices/spark-dialect-extension)
+ package to Clickhouse connector documentation.
+ ([#310](https://github.com/MTSWebServices/onetl/pull/310))
diff --git a/mddocs/docs/changelog/0.12.2.md b/mddocs/docs/changelog/0.12.2.md
new file mode 100644
index 000000000..2391c32ad
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.2.md
@@ -0,0 +1,22 @@
+# 0.12.2 (2024-11-12) { #DBR-onetl-changelog-0-12-2 }
+
+## Improvements { #DBR-onetl-changelog-0-12-2-improvements }
+
+- Change Spark `jobDescription` for DBReader & FileDFReader from
+ `DBReader.run() -> Connection` to `Connection -> DBReader.run()`.
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-2-bug-fixes }
+
+- Fix `log_hwm` result for `KeyValueIntHWM` (used by Kafka).
+ ([#316](https://github.com/MTSWebServices/onetl/pull/316))
+- Fix `log_collection` hiding values of `Kafka.addresses` in logs with
+ `INFO` level. ([#316](https://github.com/MTSWebServices/onetl/pull/316))
+
+## Dependencies { #DBR-onetl-changelog-0-12-2-dependencies }
+
+- Allow using
+ [etl-entities==2.4.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.4.0).
+
+## Doc only Changes { #DBR-onetl-changelog-0-12-2-doc-only-changes }
+
+- Fix links to MSSQL date & time type documentation.
diff --git a/mddocs/docs/changelog/0.12.3.md b/mddocs/docs/changelog/0.12.3.md
new file mode 100644
index 000000000..02c5c07ab
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.3.md
@@ -0,0 +1,5 @@
+# 0.12.3 (2024-11-22) { #DBR-onetl-changelog-0-12-3 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-3-bug-fixes }
+
+- Allow passing table names in format `schema."table.with.dots"` to `DBReader(source=...)` and `DBWriter(target=...)`.
diff --git a/mddocs/docs/changelog/0.12.4.md b/mddocs/docs/changelog/0.12.4.md
new file mode 100644
index 000000000..7489d87b9
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.4.md
@@ -0,0 +1,5 @@
+# 0.12.4 (2024-11-27) { #DBR-onetl-changelog-0-12-4 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-4-bug-fixes }
+
+- Fix `DBReader(conn=oracle, options={"partitioning_mode": "hash"})` led to data skew in last partition due to wrong `ora_hash` usage. ([#319](https://github.com/MTSWebServices/onetl/pull/319))
diff --git a/mddocs/docs/changelog/0.12.5.md b/mddocs/docs/changelog/0.12.5.md
new file mode 100644
index 000000000..62d50c5a1
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.5.md
@@ -0,0 +1,13 @@
+# 0.12.5 (2024-12-03) { #DBR-onetl-changelog-0-12-5 }
+
+## Improvements { #DBR-onetl-changelog-0-12-5-improvements }
+
+- Use `sipHash64` instead of `md5` in Clickhouse for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster.
+- Use `hashtext` instead of `md5` in Postgres for reading data with `{"partitioning_mode": "hash"}`, as it is 3-5 times faster.
+- Use `BINARY_CHECKSUM` instead of `HASHBYTES` in MSSQL for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster.
+
+## Bug Fixes { #DBR-onetl-changelog-0-12-5-bug-fixes }
+
+- In JDBC sources wrap `MOD(partitionColumn, numPartitions)` with `ABS(...)` to make all returned values positive. This prevents data skew.
+- Fix reading table data from MSSQL using `{"partitioning_mode": "hash"}` with `partitionColumn` of integer type.
+- Fix reading table data from Postgres using `{"partitioning_mode": "hash"}` leading to data skew (all the data was read into one Spark partition).
diff --git a/mddocs/docs/changelog/0.13.0.md b/mddocs/docs/changelog/0.13.0.md
new file mode 100644
index 000000000..9d217c8a7
--- /dev/null
+++ b/mddocs/docs/changelog/0.13.0.md
@@ -0,0 +1,273 @@
+# 0.13.0 (2025-02-24) { #DBR-onetl-changelog-0-13-0 }
+
+🎉 3 years since first release 0.1.0 🎉
+
+## Breaking Changes { #DBR-onetl-changelog-0-13-0-breaking-changes }
+
+- Add Python 3.13. support. ([#298](https://github.com/MTSWebServices/onetl/pull/298))
+
+- Change the logic of `FileConnection.walk` and
+ `FileConnection.list_dir`. ([#327](https://github.com/MTSWebServices/onetl/pull/327))
+
+  Previously `limits.stops_at(path) == True` was considered as \"return
+  current file and stop\", and could lead to exceeding some limit. Now
+  it means \"stop immediately\".
+
+- Change default value for `FileDFWriter.Options(if_exists=...)` from
+ `error` to `append`, to make it consistent with other `.Options()`
+ classes within onETL. ([#343](https://github.com/MTSWebServices/onetl/pull/343))
+
+## Features { #DBR-onetl-changelog-0-13-0-features }
+
+- Add support for `FileModifiedTimeHWM` HWM class (see [etl-entities
+ 2.5.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.5.0)):
+
+ ```python
+  from etl_entities.hwm import FileModifiedTimeHWM
+ from onetl.file import FileDownloader
+ from onetl.strategy import IncrementalStrategy
+
+ downloader = FileDownloader(
+ ...,
+ hwm=FileModifiedTimeHWM(name="somename"),
+ )
+
+ with IncrementalStrategy():
+ downloader.run()
+ ```
+
+- Introduce `FileSizeRange(min=..., max=...)` filter class.
+ ([#325](https://github.com/MTSWebServices/onetl/pull/325))
+
+ Now users can set `FileDownloader` / `FileMover` to download/move
+ only files with specific file size range:
+
+ ```python
+ from onetl.file import FileDownloader
+ from onetl.file.filter import FileSizeRange
+
+ downloader = FileDownloader(
+ ...,
+ filters=[FileSizeRange(min="10KiB", max="1GiB")],
+ )
+ ```
+
+- Introduce `TotalFilesSize(...)` limit class.
+ ([#326](https://github.com/MTSWebServices/onetl/pull/326))
+
+ Now users can set `FileDownloader` / `FileMover` to stop
+ downloading/moving files after reaching a certain amount of data:
+
+ ```python
+ from datetime import datetime, timedelta
+ from onetl.file import FileDownloader
+ from onetl.file.limit import TotalFilesSize
+
+ downloader = FileDownloader(
+ ...,
+ limits=[TotalFilesSize("1GiB")],
+ )
+ ```
+
+- Implement `FileModifiedTime(since=..., until=...)` file filter.
+ ([#330](https://github.com/MTSWebServices/onetl/pull/330))
+
+ Now users can set `FileDownloader` / `FileMover` to download/move
+ only files with specific file modification time:
+
+ ```python
+ from datetime import datetime, timedelta
+ from onetl.file import FileDownloader
+ from onetl.file.filter import FileModifiedTime
+
+ downloader = FileDownloader(
+ ...,
+      filters=[FileModifiedTime(until=datetime.now() - timedelta(hours=1))],
+ )
+ ```
+
+- Add `SparkS3.get_exclude_packages()` and
+ `Kafka.get_exclude_packages()` methods. ([#341](https://github.com/MTSWebServices/onetl/pull/341))
+
+ Using them allows to skip downloading dependencies not required by
+ this specific connector, or which are already a part of
+ Spark/PySpark:
+
+ ```python
+ from onetl.connection import SparkS3, Kafka
+
+ maven_packages = [
+ *SparkS3.get_packages(spark_version="3.5.4"),
+ *Kafka.get_packages(spark_version="3.5.4"),
+ ]
+ exclude_packages = SparkS3.get_exclude_packages() + Kafka.get_exclude_packages()
+ spark = (
+ SparkSession.builder.appName("spark_app_onetl_demo")
+ .config("spark.jars.packages", ",".join(maven_packages))
+ .config("spark.jars.excludes", ",".join(exclude_packages))
+ .getOrCreate()
+ )
+ ```
+
+## Improvements { #DBR-onetl-changelog-0-13-0-improvements }
+
+- All DB connections opened by `JDBC.fetch(...)`, `JDBC.execute(...)`
+ or `JDBC.check()` are immediately closed after the statements is
+ executed. ([#334](https://github.com/MTSWebServices/onetl/pull/334))
+
+ Previously Spark session with `master=local[3]` actually opened up
+ to 5 connections to target DB - one for `JDBC.check()`, another for
+ Spark driver interaction with DB to create tables, and one for each
+ Spark executor. Now only max 4 connections are opened, as
+ `JDBC.check()` does not hold opened connection.
+
+ This is important for RDBMS like Postgres or Greenplum where number
+ of connections is strictly limited and limit is usually quite low.
+
+- Set up `ApplicationName` (client info) for Clickhouse, MongoDB,
+ MSSQL, MySQL and Oracle. ([#339](https://github.com/MTSWebServices/onetl/pull/339),
+ [#248](https://github.com/MTSWebServices/onetl/pull/248))
+
+ Also update `ApplicationName` format for Greenplum, Postgres, Kafka
+ and SparkS3. Now all connectors have the same `ApplicationName`
+ format:
+ `${spark.applicationId} ${spark.appName} onETL/${onetl.version} Spark/${spark.version}`
+
+ The only connections not sending `ApplicationName` are Teradata and
+ FileConnection implementations.
+
+- Now `DB.check()` will test connection availability not only on Spark
+ driver, but also from some Spark executor. ([#346](https://github.com/MTSWebServices/onetl/pull/346))
+
+  This allows to fail immediately if Spark driver host has network
+  access to target DB, but Spark executors do not.
+
+ !!! note
+
+ Now `Greenplum.check()` requires the same user grants as
+ `DBReader(connection=greenplum)`:
+
+ ``` sql
+ -- yes, "writable" for reading data from GP, it's not a mistake
+ ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+ -- for both reading and writing to GP
+ -- ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+ ```
+
+ Please ask your Greenplum administrators to provide these grants.
+
+## Bug Fixes { #DBR-onetl-changelog-0-13-0-bug-fixes }
+
+- Avoid suppressing Hive Metastore errors while using `DBWriter`.
+ ([#329](https://github.com/MTSWebServices/onetl/pull/329))
+
+ Previously this was implemented as:
+
+ ```python
+ try:
+ spark.sql(f"SELECT * FROM {table}")
+ table_exists = True
+ except Exception:
+ table_exists = False
+ ```
+
+  If Hive Metastore was overloaded and responded with an exception, it
+  was considered as non-existing table, resulting in full table
+  override instead of append or override only partitions subset.
+
+- Fix using onETL to write data to PostgreSQL or Greenplum instances
+ behind *pgbouncer* with `pool_mode=transaction`.
+ ([#336](https://github.com/MTSWebServices/onetl/pull/336))
+
+ Previously `Postgres.check()` opened a read-only transaction,
+ pgbouncer changed the entire connection type from read-write to
+ read-only, and when `DBWriter.run(df)` executed in read-only
+ connection, producing errors like:
+
+ ```
+ org.postgresql.util.PSQLException: ERROR: cannot execute INSERT in a read-only transaction
+ org.postgresql.util.PSQLException: ERROR: cannot execute TRUNCATE TABLE in a read-only transaction
+ ```
+
+ Added a workaround by passing `readOnly=True` to JDBC params for
+ read-only connections, so pgbouncer may differ read-only and
+ read-write connections properly.
+
+  After upgrading to onETL 0.13.x or higher the same error may still
+  appear if pgbouncer still holds read-only connections and returns
+  them for DBWriter. To fix this, user can manually convert read-only
+  connection to read-write:
+
+ ```python
+ postgres.execute("BEGIN READ WRITE;") # <-- add this line
+ DBWriter(...).run()
+ ```
+
+ After all connections in pgbouncer pool were converted from
+ read-only to read-write, and error fixed, this additional line could
+ be removed.
+
+ See [Postgres JDBC driver
+ documentation](https://jdbc.postgresql.org/documentation/use/).
+
+- Fix `MSSQL.fetch(...)` and `MySQL.fetch(...)` opened a read-write
+ connection instead of read-only. ([#337](https://github.com/MTSWebServices/onetl/pull/337))
+
+ Now this is fixed:
+
+ - `MSSQL.fetch(...)` establishes connection with `ApplicationIntent=ReadOnly`.
+ - `MySQL.fetch(...)` calls `SET SESSION TRANSACTION READ ONLY` statement.
+
+- Fixed passing multiple filters to `FileDownloader` and `FileMover`.
+  ([#338](https://github.com/MTSWebServices/onetl/pull/338)) It was caused by
+  sorting the filters list in an internal logging method, but `FileFilter`
+  subclasses are not sortable.
+
+- Fix a false warning about a lot of parallel connections to Greenplum.
+ ([#342](https://github.com/MTSWebServices/onetl/pull/342))
+
+ Creating Spark session with `.master("local[5]")` may open up to 6
+ connections to Greenplum (=number of Spark executors + 1 for
+ driver), but onETL instead used number of *CPU cores* on the host as
+ a number of parallel connections.
+
+ This lead to showing a false warning that number of Greenplum
+ connections is too high, which actually should be the case only if
+ number of executors is higher than 30.
+
+- Fix MongoDB trying to use current database name as `authSource`.
+ ([#347](https://github.com/MTSWebServices/onetl/pull/347))
+
+ Use default connector value which is `admin` database. Previous
+ onETL versions could be fixed by:
+
+ ```python
+ from onetl.connection import MongoDB
+
+ mongodb = MongoDB(
+ ...,
+ database="mydb",
+ extra={
+ "authSource": "admin",
+ },
+ )
+ ```
+
+## Dependencies { #DBR-onetl-changelog-0-13-0-dependencies }
+
+- Minimal `etl-entities` version is now
+ [2.5.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.5.0).
+ ([#331](https://github.com/MTSWebServices/onetl/pull/331))
+
+- Update DB connectors/drivers to latest versions: ([#345](https://github.com/MTSWebServices/onetl/pull/345))
+
+ - Clickhouse `0.6.5` → `0.7.2`
+ - MongoDB `10.4.0` → `10.4.1`
+ - MySQL `9.0.0` → `9.2.0`
+ - Oracle `23.5.0.24.07` → `23.7.0.25.01`
+ - Postgres `42.7.4` → `42.7.5`
+
+## Doc only Changes { #DBR-onetl-changelog-0-13-0-doc-only-changes }
+
+- Split large code examples to tabs. ([#344](https://github.com/MTSWebServices/onetl/pull/344))
diff --git a/mddocs/docs/changelog/0.13.1.md b/mddocs/docs/changelog/0.13.1.md
new file mode 100644
index 000000000..b025397f5
--- /dev/null
+++ b/mddocs/docs/changelog/0.13.1.md
@@ -0,0 +1,9 @@
+# 0.13.1 (2025-03-06) { #DBR-onetl-changelog-0-13-1 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-13-1-bug-fixes }
+
+In 0.13.0, using `DBWriter(connection=hive, target="SOMEDB.SOMETABLE")` led to executing `df.write.saveAsTable()`
+instead of `df.write.insertInto()` if target table `somedb.sometable` already exists.
+
+This is caused by table name normalization (Hive uses lower-case names), which wasn't properly handled by method used for checking table existence.
+([#350](https://github.com/MTSWebServices/onetl/pull/350))
diff --git a/mddocs/docs/changelog/0.13.3.md b/mddocs/docs/changelog/0.13.3.md
new file mode 100644
index 000000000..7a17b10d8
--- /dev/null
+++ b/mddocs/docs/changelog/0.13.3.md
@@ -0,0 +1,6 @@
+# 0.13.3 (2025-03-11) { #DBR-onetl-changelog-0-13-3 }
+
+## Dependencies { #DBR-onetl-changelog-0-13-3-dependencies }
+
+Allow using [etl-entities
+2.6.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.6.0).
diff --git a/mddocs/docs/changelog/0.13.4.md b/mddocs/docs/changelog/0.13.4.md
new file mode 100644
index 000000000..a1f0f4478
--- /dev/null
+++ b/mddocs/docs/changelog/0.13.4.md
@@ -0,0 +1,10 @@
+# 0.13.4 (2025-03-20) { #DBR-onetl-changelog-0-13-4 }
+
+## Doc only Changes { #DBR-onetl-changelog-0-13-4-doc-only-changes }
+
+- Prefer `ReadOptions(partitionColumn=..., numPartitions=..., queryTimeout=...)`
+ instead of `ReadOptions(partition_column=..., num_partitions=..., query_timeout=...)`,
+ to match Spark documentation. ([#352](https://github.com/MTSWebServices/onetl/pull/352))
+- Prefer `WriteOptions(if_exists=...)` instead of `WriteOptions(mode=...)` for IDE suggestions. ([#354](https://github.com/MTSWebServices/onetl/pull/354))
+- Document all options of supported file formats.
+ ([#355](https://github.com/MTSWebServices/onetl/pull/355), [#356](https://github.com/MTSWebServices/onetl/pull/356), [#357](https://github.com/MTSWebServices/onetl/pull/357), [#358](https://github.com/MTSWebServices/onetl/pull/358), [#359](https://github.com/MTSWebServices/onetl/pull/359), [#360](https://github.com/MTSWebServices/onetl/pull/360), [#361](https://github.com/MTSWebServices/onetl/pull/361), [#362](https://github.com/MTSWebServices/onetl/pull/362))
diff --git a/mddocs/docs/changelog/0.13.5.md b/mddocs/docs/changelog/0.13.5.md
new file mode 100644
index 000000000..a464db4e1
--- /dev/null
+++ b/mddocs/docs/changelog/0.13.5.md
@@ -0,0 +1,11 @@
+# 0.13.5 (2025-04-14) { #DBR-onetl-changelog-0-13-5 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-13-5-bug-fixes }
+
+0.13.0 changed the way `Greenplum.check()` is implemented - it began
+checking DB availability from both Spark driver and executor. But due to
+a misspelling, `SELECT` queries were emitted from all available executors.
+This lead to opening too many connections to Greenplum, which was
+unexpected.
+
+Now only one Spark executor is used to run `Greenplum.check()`.
diff --git a/mddocs/docs/changelog/0.14.0.md b/mddocs/docs/changelog/0.14.0.md
new file mode 100644
index 000000000..31f0bbe84
--- /dev/null
+++ b/mddocs/docs/changelog/0.14.0.md
@@ -0,0 +1,43 @@
+# 0.14.0 (2025-09-08) { #DBR-onetl-changelog-0-14-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-14-0-breaking-changes }
+
+- Drop Spark 2 support. Minimal supported Spark version is 3.2.
+ ([#383](https://github.com/MTSWebServices/onetl/pull/383))
+
+ Also dropped:
+
+ - `Greenplum.package_spark_2_3`
+ - `Greenplum.package_spark_2_4`
+
+- Update DB connectors/drivers to latest versions:
+
+ - MongoDB `10.4.1` → `10.5.0`
+ - MySQL `9.2.0` → `9.4.0`
+ - MSSQL `12.8.10` → `13.2.0`
+ - Oracle `23.7.0.25.01` → `23.9.0.25.07`
+ - Postgres `42.7.5` → `42.7.7`
+
+- Update Excel package name from `com.crealytics:spark-excel` to
+ `dev.mauch:spark-excel`. ([#382](https://github.com/MTSWebServices/onetl/pull/382))
+
+- Now `Excel.get_packages(package_version=...)` parameter is
+ mandatory. ([#382](https://github.com/MTSWebServices/onetl/pull/382))
+
+- Return full file/directory path from `FileConnection.list_dir` and
+ `FileConnection.walk`. ([#381](https://github.com/MTSWebServices/onetl/pull/381))
+ Previously these methods returned only file names.
+
+## Features { #DBR-onetl-changelog-0-14-0-features }
+
+- Add Spark 4.0 support. ([#297](https://github.com/MTSWebServices/onetl/pull/297))
+- Add `Iceberg` connection support. For now this is alpha version, and
+ behavior may change in future. ([#378](https://github.com/MTSWebServices/onetl/pull/378),
+ [#386](https://github.com/MTSWebServices/onetl/pull/386))
+- Treat S3 objects with names ending with a `/` slash as directory
+ marker. ([#379](https://github.com/MTSWebServices/onetl/pull/379))
+
+## Improvements { #DBR-onetl-changelog-0-14-0-improvements }
+
+- Speed up removing S3 and Samba directories with `recursive=True`.
+ ([#380](https://github.com/MTSWebServices/onetl/pull/380))
diff --git a/mddocs/docs/changelog/0.14.1.md b/mddocs/docs/changelog/0.14.1.md
new file mode 100644
index 000000000..c3012a94f
--- /dev/null
+++ b/mddocs/docs/changelog/0.14.1.md
@@ -0,0 +1,17 @@
+# 0.14.1 (2025-11-25) { #DBR-onetl-changelog-0-14-1 }
+
+## Dependencies { #DBR-onetl-changelog-0-14-1-dependencies }
+
+Release [minio==7.2.19](https://github.com/minio/minio-py/issues/1536)
+lead to broken `S3` connector with errors like these:
+
+```
+TypeError: Minio.fget_object() takes 1 positional argument but 3 were given
+TypeError: Minio.fput_object() takes 1 positional argument but 3 were given
+```
+
+Fixed.
+
+Added limit `minio<8.0` to avoid [breaking
+things](https://github.com/minio/minio-py/pull/1530) in next major
+release.
diff --git a/mddocs/docs/changelog/0.15.0.md b/mddocs/docs/changelog/0.15.0.md
new file mode 100644
index 000000000..f70bd5686
--- /dev/null
+++ b/mddocs/docs/changelog/0.15.0.md
@@ -0,0 +1,173 @@
+# 0.15.0 (2025-12-08) { #DBR-onetl-changelog-0-15-0 }
+
+## Removals { #DBR-onetl-changelog-0-15-0-removals }
+
+Drop `Teradata` connector. It is not used in our company anymore, and
+never had proper integration tests.
+
+## Breaking Changes { #DBR-onetl-changelog-0-15-0-breaking-changes }
+
+Add `Iceberg(catalog=..., warehouse=...)` mandatory options
+([#391](https://github.com/MTSWebServices/onetl/pull/391),
+[#393](https://github.com/MTSWebServices/onetl/pull/393),
+[#394](https://github.com/MTSWebServices/onetl/pull/394),
+[#397](https://github.com/MTSWebServices/onetl/pull/397),
+[#399](https://github.com/MTSWebServices/onetl/pull/399),
+[#413](https://github.com/MTSWebServices/onetl/pull/413)).
+
+In 0.14.0 we've implemented very basic `Iceberg` connector configured
+via dictionary:
+
+``` python
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ extra={
+ "type": "rest",
+ "uri": "https://catalog.company.com/rest",
+ "rest.auth.type": "oauth2",
+ "token": "jwt_token",
+ "warehouse": "s3a://mybucket/",
+ "io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
+ "s3.endpoint": "http://localhost:9010",
+ "s3.access-key-id": "access_key",
+ "s3.secret-access-key": "secret_key",
+ "s3.path-style-access": "true",
+ "client.region": "us-east-1",
+ },
+ spark=spark,
+)
+```
+
+Now we've implemented wrapper classes allowing to configure various
+Iceberg catalogs:
+
+```python title="REST Catalog with Bearer token auth"
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ catalog=Iceberg.RESTCatalog(
+ url="https://catalog.company.com/rest",
+ auth=Iceberg.RESTCatalog.BearerAuth(
+ access_token="jwt_token",
+ ),
+ ),
+ warehouse=...,
+)
+```
+
+```python title="REST Catalog with OAuth2 ClientCredentials auth"
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ catalog=Iceberg.RESTCatalog(
+ url="https://catalog.company.com/rest",
+ auth=Iceberg.RESTCatalog.OAuth2ClientCredentials(
+ client_id="my_client",
+ client_secret="my_secret",
+ oauth2_token_endpoint="http://keycloak.company.com/realms/my-realm/protocol/openid-connect/token",
+ scopes=["catalog"],
+ ),
+ ),
+ warehouse=...,
+ spark=spark,
+)
+```
+
+And also set of classes to configure for warehouses:
+
+```python title="S3 warehouse"
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ catalog=...,
+ # using Iceberg AWS integration
+ warehouse=Iceberg.S3Warehouse(
+ path="/",
+ bucket="mybucket",
+ host="localhost",
+ port=9010,
+ protocol="http",
+ path_style_access=True,
+ access_key="access_key",
+ secret_key="secret_key",
+ region="us-east-1",
+ ),
+ spark=spark,
+)
+```
+
+```python title="For Lakekeeper, Polaris, Gravitino"
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ catalog=...,
+ # Delegate warehouse config to REST Catalog
+ warehouse=Iceberg.DelegatedWarehouse(
+ warehouse="some-warehouse",
+ access_delegation="vended-credentials",
+ ),
+ spark=spark,
+)
+```
+
+```python title="HDFS warehouse"
+iceberg = Iceberg(
+ catalog_name="mycatalog",
+ # store both data and metadata on HadoopFilesystem
+ catalog=Iceberg.FilesystemCatalog(),
+ warehouse=Iceberg.FilesystemWarehouse(
+ path="/some/warehouse",
+ connection=SparkHDFS(cluster="dwh"),
+ ),
+ spark=spark,
+)
+```
+
+Having classes instead of dicts brings IDE autocompletion, and allows to
+reuse the same catalog connection options for multiple warehouses.
+
+## Features { #DBR-onetl-changelog-0-15-0-features }
+
+- Added support for `Iceberg.WriteOptions(table_properties={})`
+ ([#401](https://github.com/MTSWebServices/onetl/pull/401)).
+
+ In particular, table's `"location": "/some/warehouse/mytable"` can
+ be set now.
+
+- Added support for `Hive.WriteOptions(table_properties={})`
+ ([#412](https://github.com/MTSWebServices/onetl/pull/412)).
+
+ In particular, table's `"auto.purge": "true"` can be set now.
+
+## Improvements { #DBR-onetl-changelog-0-15-0-improvements }
+
+- Allow to set `SparkS3(path_style_access=True)` instead of
+  `SparkS3(extra={"path.style.access": True})` ([#392](https://github.com/MTSWebServices/onetl/pull/392)).
+
+  This change improves IDE autocompletion and makes it more explicit
+  that the parameter is important for the connector's functionality.
+
+- Add a runtime warning about missing `S3(region=...)` and
+ `SparkS3(region=...)` params ([#418](https://github.com/MTSWebServices/onetl/pull/418)).
+
+ It is recommended to explicitly pass this parameter to avoid
+ potential access errors.
+
+Thanks to [@yabel](https://github.com/yabel)
+
+## Dependencies { #DBR-onetl-changelog-0-15-0-dependencies }
+
+- Update JDBC connectors:
+
+ - MySQL `9.4.0` → `9.5.0`
+ - MSSQL `13.2.0` → `13.2.1`
+ - Oracle `23.9.0.25.07` → `23.26.0.0.0`
+ - Postgres `42.7.7` → `42.7.8`
+
+- Added support for `Clickhouse.get_packages(package_version="0.9.3")`
+ ([#407](https://github.com/MTSWebServices/onetl/pull/407)).
+
+ Versions in range 0.8.0-0.9.2 are not supported due to [issue #2625](https://github.com/ClickHouse/clickhouse-java/issues/2625).
+
+  Version 0.9.3+ is still not the default one because of various
+  compatibility and performance issues. Use it at your own risk.
+
+## Documentation { #DBR-onetl-changelog-0-15-0-documentation }
+
+- Document using Greenplum connector with Spark on `master=k8s`
diff --git a/mddocs/docs/changelog/0.7.0.md b/mddocs/docs/changelog/0.7.0.md
new file mode 100644
index 000000000..75ef943c9
--- /dev/null
+++ b/mddocs/docs/changelog/0.7.0.md
@@ -0,0 +1,239 @@
+# 0.7.0 (2023-05-15) { #DBR-onetl-changelog-0-7-0 }
+
+## 🎉 onETL is now open source 🎉 { #DBR-onetl-changelog-0-7-0-onetl-is-now-open-source }
+
+That was a long road, but we finally did it!
+
+## Breaking Changes { #DBR-onetl-changelog-0-7-0-breaking-changes }
+
+- Changed installation method.
+
+ **TL;DR What should I change to restore previous behavior**
+
+ Simple way:
+
+ | onETL < 0.7.0 | onETL >= 0.7.0 |
+ | ----------------- | --------------------------------- |
+ | pip install onetl | pip install onetl[files,kerberos] |
+
+  The right way - enumerate the connectors that should be installed:
+
+ ```bash
+ pip install onetl[hdfs,ftp,kerberos] # except DB connections
+ ```
+
+ **Details**
+
+ In onetl<0.7 the package installation looks like:
+
+ ```bash title="before"
+
+ pip install onetl
+ ```
+
+  But this includes all dependencies for all connectors, even if the user does not use them.
+  This caused some issues, for example a user had to install Kerberos libraries to be able to install onETL, even if they only use S3 (without Kerberos support).
+
+ Since 0.7.0 installation process was changed:
+
+ ``` bash title="after"
+
+ pip install onetl # minimal installation, only onETL core
+ # there is no extras for DB connections because they are using Java packages which are installed in runtime
+
+ pip install onetl[ftp,ftps,hdfs,sftp,s3,webdav] # install dependencies for specified file connections
+ pip install onetl[files] # install dependencies for all file connections
+
+ pip install onetl[kerberos] # Kerberos auth support
+ pip install onetl[spark] # install PySpark to use DB connections
+
+ pip install onetl[spark,kerberos,files] # all file connections + Kerberos + PySpark
+ pip install onetl[all] # alias for previous case
+ ```
+
+ There are corresponding documentation items for each extras.
+
+ Also onETL checks that some requirements are missing, and raises exception with recommendation how to install them:
+
+ ``` text title="exception while import Clickhouse connection"
+
+ Cannot import module "pyspark".
+
+ Since onETL v0.7.0 you should install package as follows:
+ pip install onetl[spark]
+
+ or inject PySpark to sys.path in some other way BEFORE creating MongoDB instance.
+ ```
+
+ ``` text title="exception while import FTP connection"
+
+ Cannot import module "ftputil".
+
+ Since onETL v0.7.0 you should install package as follows:
+ pip install onetl[ftp]
+
+ or
+ pip install onetl[files]
+ ```
+
+- Added new `cluster` argument to `Hive` and `HDFS` connections.
+
+  `Hive` qualified name (used in HWM) contains cluster name. But in onETL<0.7.0 cluster name had a hard-coded value `rnd-dwh` which was not OK for some users.
+
+ `HDFS` connection qualified name contains host (active namenode of Hadoop cluster), but its value can change over time, leading to creating of new HWM.
+
+ Since onETL 0.7.0 both `Hive` and `HDFS` connections have `cluster` attribute which can be set to a specific cluster name.
+ For `Hive` it is mandatory, for `HDFS` it can be omitted (using host as a fallback).
+
+ But passing cluster name every time could lead to errors.
+
+ Now `Hive` and `HDFS` have nested class named `slots` with methods:
+
+ - `normalize_cluster_name`
+ - `get_known_clusters`
+ - `get_current_cluster`
+ - `normalize_namenode_host` (only `HDFS`)
+ - `get_cluster_namenodes` (only `HDFS`)
+ - `get_webhdfs_port` (only `HDFS`)
+ - `is_namenode_active` (only `HDFS`)
+
+ And new method `HDFS.get_current` / `Hive.get_current`.
+
+ Developers can implement hooks validating user input or substituting values for automatic cluster detection.
+ This should improve user experience while using these connectors.
+
+ See slots documentation.
+
+- Update JDBC connection drivers.
+
+ - Greenplum `2.1.3` → `2.1.4`.
+  - MSSQL `10.2.1.jre8` → `12.2.0.jre8`. Minimal supported version of MSSQL is now 2014 instead of 2021.
+ - MySQL `8.0.30` → `8.0.33`:
+ - Package was renamed `mysql:mysql-connector-java` → `com.mysql:mysql-connector-j`.
+ - Driver class was renamed `com.mysql.jdbc.Driver` → `com.mysql.cj.jdbc.Driver`.
+ - Oracle `21.6.0.0.1` → `23.2.0.0`.
+ - Postgres `42.4.0` → `42.6.0`.
+ - Teradata `17.20.00.08` → `17.20.00.15`:
+ - Package was renamed `com.teradata.jdbc:terajdbc4` → `com.teradata.jdbc:terajdbc`.
+ - Teradata driver is now published to Maven.
+
+ See [#31](https://github.com/MTSWebServices/onetl/pull/31).
+
+## Features { #DBR-onetl-changelog-0-7-0-features }
+
+- Added MongoDB connection.
+
+ Using official [MongoDB connector for Spark v10](https://www.mongodb.com/docs/spark-connector/current/). Only Spark 3.2+ is supported.
+
+ There are some differences between MongoDB and other database sources:
+
+ - Instead of `mongodb.sql` method there is `mongodb.pipeline`.
+ - No methods `mongodb.fetch` and `mongodb.execute`.
+ - `DBReader.hint` and `DBReader.where` have different types than in SQL databases:
+
+ ```python
+ where = {
+ "col1": {
+ "$eq": 10,
+ },
+ }
+
+ hint = {
+ "col1": 1,
+ }
+ ```
+
+ - Because MongoDB does not have schemas of collections, but Spark cannot create dataframe with dynamic schema, new option `DBReader.df_schema` was introduced.
+ It is mandatory for MongoDB, but optional for other sources.
+ - Currently DBReader cannot be used with MongoDB and hwm expression, e.g. `hwm_column=("mycolumn", {"$cast": {"col1": "date"}})`
+
+ Because there are no tables in MongoDB, some options were renamed in core classes:
+
+ - `DBReader(table=...)` → `DBReader(source=...)`
+ - `DBWriter(table=...)` → `DBWriter(target=...)`
+
+ Old names can be used too, they are not deprecated ([#30](https://github.com/MTSWebServices/onetl/pull/30)).
+
+- Added option for disabling some plugins during import.
+
+  Previously if some plugin was failing during import, the only way to import onETL would be to disable all plugins
+  using an environment variable.
+
+ Now there are several variables with different behavior:
+
+ - `ONETL_PLUGINS_ENABLED=false` - disable all plugins autoimport. Previously it was named `ONETL_ENABLE_PLUGINS`.
+ - `ONETL_PLUGINS_BLACKLIST=plugin-name,another-plugin` - set list of plugins which should NOT be imported automatically.
+ - `ONETL_PLUGINS_WHITELIST=plugin-name,another-plugin` - set list of plugins which should ONLY be imported automatically.
+
+ Also we improved exception message with recommendation how to disable a failing plugin:
+
+ ``` text title="exception message example"
+
+ Error while importing plugin 'mtspark' from package 'mtspark' v4.0.0.
+
+ Statement:
+ import mtspark.onetl
+
+ Check if plugin is compatible with current onETL version 0.7.0.
+
+ You can disable loading this plugin by setting environment variable:
+ ONETL_PLUGINS_BLACKLIST='mtspark,failing-plugin'
+
+ You can also define a whitelist of packages which can be loaded by onETL:
+ ONETL_PLUGINS_WHITELIST='not-failing-plugin1,not-failing-plugin2'
+
+ Please take into account that plugin name may differ from package or module name.
+ See package metadata for more details
+ ```
+
+## Improvements { #DBR-onetl-changelog-0-7-0-improvements }
+
+- Added compatibility with Python 3.11 and PySpark 3.4.0.
+
+ File connections were OK, but `jdbc.fetch` and `jdbc.execute` were failing. Fixed in [#28](https://github.com/MTSWebServices/onetl/pull/28).
+
+- Added check for missing Java packages.
+
+  Previously if a DB connection tried to use some Java class which was not loaded into the Spark session, it raised an exception
+  with a long Java stacktrace. Most users failed to interpret this trace.
+
+ Now onETL shows the following error message:
+
+ ``` text title="exception message example"
+
+ |Spark| Cannot import Java class 'com.mongodb.spark.sql.connector.MongoTableProvider'.
+
+ It looks like you've created Spark session without this option:
+ SparkSession.builder.config("spark.jars.packages", MongoDB.package_spark_3_2)
+
+ Please call `spark.stop()`, restart the interpreter,
+ and then create new SparkSession with proper options.
+ ```
+
+- Documentation improvements.
+
+ - Changed documentation site theme - using [furo](https://github.com/pradyunsg/furo)
+ instead of default [ReadTheDocs](https://github.com/readthedocs/sphinx_rtd_theme).
+
+ New theme supports wide screens and dark mode.
+ See [#10](https://github.com/MTSWebServices/onetl/pull/10).
+
+ - Now each connection class have compatibility table for Spark + Java + Python.
+
+ - Added global compatibility table for Spark + Java + Python + Scala.
+
+## Bug Fixes { #DBR-onetl-changelog-0-7-0-bug-fixes }
+
+- Fixed several SFTP issues.
+
+ - If SSH config file `~/.ssh/config` contains some options not recognized by Paramiko (unknown syntax, unknown option name),
+ previous versions were raising exception until fixing or removing this file. Since 0.7.0 exception is replaced with warning.
+
+ - If user passed `host_key_check=False` but server changed SSH keys, previous versions raised exception until new key is accepted.
+ Since 0.7.0 exception is replaced with warning if option value is `False`.
+
+ Fixed in [#19](https://github.com/MTSWebServices/onetl/pull/19).
+
+- Fixed several S3 issues.
+
+ There was a bug in S3 connection which prevented handling files in the root of a bucket - they were invisible for the connector. Fixed in [#29](https://github.com/MTSWebServices/onetl/pull/29).
diff --git a/mddocs/docs/changelog/0.7.1.md b/mddocs/docs/changelog/0.7.1.md
new file mode 100644
index 000000000..c69428444
--- /dev/null
+++ b/mddocs/docs/changelog/0.7.1.md
@@ -0,0 +1,40 @@
+# 0.7.1 (2023-05-23) { #DBR-onetl-changelog-0-7-1 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-7-1-bug-fixes }
+
+- Fixed `setup_logging` function.
+
+ In onETL==0.7.0 calling `onetl.log.setup_logging()` broke the logging:
+
+ ``` text title="exception message"
+
+ Traceback (most recent call last):
+ File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 434, in format
+ return self._format(record)
+ File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 430, in _format
+ return self._fmt % record.dict
+ KeyError: 'levelname:8s'
+ ```
+
+- Fixed installation examples.
+
+ In onETL==0.7.0 there are examples of installing onETL with extras:
+
+ ``` bash title="before"
+
+ pip install onetl[files, kerberos, spark]
+ ```
+
+ But pip fails to install such package:
+
+ ``` text title="exception"
+
+  ERROR: Invalid requirement: 'onetl[files,'
+ ```
+
+ This is because of spaces in extras clause. Fixed:
+
+ ``` bash title="after"
+
+ pip install onetl[files,kerberos,spark]
+ ```
diff --git a/mddocs/docs/changelog/0.7.2.md b/mddocs/docs/changelog/0.7.2.md
new file mode 100644
index 000000000..0796b5cce
--- /dev/null
+++ b/mddocs/docs/changelog/0.7.2.md
@@ -0,0 +1,37 @@
+# 0.7.2 (2023-05-24) { #DBR-onetl-changelog-0-7-2 }
+
+## Dependencies { #DBR-onetl-changelog-0-7-2-dependencies }
+
+- Limited `typing-extensions` version.
+
+ `typing-extensions==4.6.0` release contains some breaking changes causing errors like:
+
+ ``` text title="typing-extensions 4.6.0"
+
+ Traceback (most recent call last):
+ File "/Users/project/lib/python3.9/typing.py", line 852, in __subclasscheck__
+ return issubclass(cls, self.__origin__)
+ TypeError: issubclass() arg 1 must be a class
+ ```
+
+ `typing-extensions==4.6.1` was causing another error:
+
+ ``` text title="typing-extensions 4.6.1"
+
+ Traceback (most recent call last):
+ File "/home/maxim/Repo/typing_extensions/1.py", line 33, in
+ isinstance(file, ContainsException)
+ File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 599, in __instancecheck__
+ if super().__instancecheck__(instance):
+ File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 139, in __instancecheck__
+ return _abc_instancecheck(cls, instance)
+ File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 583, in __subclasscheck__
+ return super().__subclasscheck__(other)
+ File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 143, in __subclasscheck__
+ return _abc_subclasscheck(cls, subclass)
+ File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 661, in _proto_hook
+ and other._is_protocol
+ AttributeError: type object 'PathWithFailure' has no attribute '_is_protocol'
+ ```
+
+ We updated requirements with `typing-extensions<4.6` until fixing compatibility issues.
diff --git a/mddocs/docs/changelog/0.8.0.md b/mddocs/docs/changelog/0.8.0.md
new file mode 100644
index 000000000..00f011aed
--- /dev/null
+++ b/mddocs/docs/changelog/0.8.0.md
@@ -0,0 +1,162 @@
+# 0.8.0 (2023-05-31) { #DBR-onetl-changelog-0-8-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-8-0-breaking-changes }
+
+- Rename methods of `FileConnection` classes:
+
+ - `get_directory` → `resolve_dir`
+ - `get_file` → `resolve_file`
+ - `listdir` → `list_dir`
+ - `mkdir` → `create_dir`
+ - `rmdir` → `remove_dir`
+
+ New naming should be more consistent.
+
+ They were undocumented in previous versions, but someone could use these methods, so this is a breaking change. ([#36](https://github.com/MTSWebServices/onetl/pull/36))
+
+- Deprecate `onetl.core.FileFilter` class, replace it with new classes:
+
+ - `onetl.file.filter.Glob`
+ - `onetl.file.filter.Regexp`
+ - `onetl.file.filter.ExcludeDir`
+
+ Old class will be removed in v1.0.0. ([#43](https://github.com/MTSWebServices/onetl/pull/43))
+
+- Deprecate `onetl.core.FileLimit` class, replace it with new class `onetl.file.limit.MaxFilesCount`.
+
+ Old class will be removed in v1.0.0. ([#44](https://github.com/MTSWebServices/onetl/pull/44))
+
+- Change behavior of `BaseFileLimit.reset` method.
+
+ This method should now return `self` instead of `None`.
+ Return value could be the same limit object or a copy, this is an implementation detail. ([#44](https://github.com/MTSWebServices/onetl/pull/44))
+
+- Replaced `FileDownloader.filter` and `.limit` with new options `.filters` and `.limits`:
+
+ ``` python title="onETL < 0.8.0"
+ FileDownloader(
+ ...,
+ filter=FileFilter(glob="*.txt", exclude_dir="/path"),
+ limit=FileLimit(count_limit=10),
+ )
+ ```
+
+ ``` python title="onETL >= 0.8.0"
+ FileDownloader(
+ ...,
+ filters=[Glob("*.txt"), ExcludeDir("/path")],
+ limits=[MaxFilesCount(10)],
+ )
+ ```
+
+  This allows developers to implement their own filter and limit classes, and combine them with existing ones.
+
+ Old behavior still supported, but it will be removed in v1.0.0. ([#45](https://github.com/MTSWebServices/onetl/pull/45))
+
+- Removed default value for `FileDownloader.limits`, user should pass limits list explicitly. ([#45](https://github.com/MTSWebServices/onetl/pull/45))
+
+- Move classes from module `onetl.core`:
+
+ ``` python title="before"
+ from onetl.core import DBReader
+ from onetl.core import DBWriter
+ from onetl.core import FileDownloader
+ from onetl.core import FileUploader
+ ```
+
+ with new modules `onetl.db` and `onetl.file`:
+
+ ``` python title="after"
+ from onetl.db import DBReader
+ from onetl.db import DBWriter
+
+ from onetl.file import FileDownloader
+ from onetl.file import FileUploader
+ ```
+
+ Imports from old module `onetl.core` still can be used, but marked as deprecated. Module will be removed in v1.0.0. ([#46](https://github.com/MTSWebServices/onetl/pull/46))
+
+## Features { #DBR-onetl-changelog-0-8-0-features }
+
+- Add `rename_dir` method.
+
+ Method was added to following connections:
+
+ - `FTP`
+ - `FTPS`
+ - `HDFS`
+ - `SFTP`
+ - `WebDAV`
+
+ It allows to rename/move directory to new path with all its content.
+
+ `S3` does not have directories, so there is no such method in that class. ([#40](https://github.com/MTSWebServices/onetl/pull/40))
+
+- Add `onetl.file.FileMover` class.
+
+ It allows to move files between directories of remote file system.
+ Signature is almost the same as in `FileDownloader`, but without HWM support. ([#42](https://github.com/MTSWebServices/onetl/pull/42))
+
+## Improvements { #DBR-onetl-changelog-0-8-0-improvements }
+
+- Document all public methods in `FileConnection` classes:
+
+ - `download_file`
+ - `resolve_dir`
+ - `resolve_file`
+ - `get_stat`
+ - `is_dir`
+ - `is_file`
+ - `list_dir`
+ - `create_dir`
+ - `path_exists`
+ - `remove_file`
+ - `rename_file`
+ - `remove_dir`
+ - `upload_file`
+ - `walk` ([#39](https://github.com/MTSWebServices/onetl/pull/39))
+
+- Update documentation of `check` method of all connections - add usage example and document result type. ([#39](https://github.com/MTSWebServices/onetl/pull/39))
+
+- Add new exception type `FileSizeMismatchError`.
+
+ Methods `connection.download_file` and `connection.upload_file` now raise new exception type instead of `RuntimeError`,
+ if target file after download/upload has different size than source. ([#39](https://github.com/MTSWebServices/onetl/pull/39))
+
+- Add new exception type `DirectoryExistsError` - it is raised if target directory already exists. ([#40](https://github.com/MTSWebServices/onetl/pull/40))
+
+- Improved `FileDownloader` / `FileUploader` exception logging.
+
+ If `DEBUG` logging is enabled, print exception with stacktrace instead of
+ printing only exception message. ([#42](https://github.com/MTSWebServices/onetl/pull/42))
+
+- Updated documentation of `FileUploader`.
+
+ - Class does not support read strategies, added note to documentation.
+ - Added examples of using `run` method with explicit files list passing, both absolute and relative paths.
+ - Fix outdated imports and class names in examples. ([#42](https://github.com/MTSWebServices/onetl/pull/42))
+
+- Updated documentation of `DownloadResult` class - fix outdated imports and class names. ([#42](https://github.com/MTSWebServices/onetl/pull/42))
+
+- Improved file filters documentation section.
+
+ Document interface class `onetl.base.BaseFileFilter` and function `match_all_filters`. ([#43](https://github.com/MTSWebServices/onetl/pull/43))
+
+- Improved file limits documentation section.
+
+ Document interface class `onetl.base.BaseFileLimit` and functions `limits_stop_at` / `limits_reached` / `reset_limits`. ([#44](https://github.com/MTSWebServices/onetl/pull/44))
+
+- Added changelog.
+
+ Changelog is generated from separated news files using [towncrier](https://pypi.org/project/towncrier/). ([#47](https://github.com/MTSWebServices/onetl/pull/47))
+
+## Misc { #DBR-onetl-changelog-0-8-0-misc }
+
+- Improved CI workflow for tests.
+
+  - If a developer hasn't changed the source code of a specific connector or its dependencies,
+    run tests only against maximum supported versions of Spark, Python, Java and db/file server.
+  - If a developer made some changes in a specific connector, or in core classes, or in dependencies,
+    run tests for both minimal and maximum versions.
+  - Once a week run all tests against both minimal and latest versions to detect breaking changes in dependencies.
+  - Minimal tested Spark version is 2.3.1 instead of 2.4.8. ([#32](https://github.com/MTSWebServices/onetl/pull/32))
diff --git a/mddocs/docs/changelog/0.8.1.md b/mddocs/docs/changelog/0.8.1.md
new file mode 100644
index 000000000..8d4ef4a9e
--- /dev/null
+++ b/mddocs/docs/changelog/0.8.1.md
@@ -0,0 +1,42 @@
+# 0.8.1 (2023-07-10) { #DBR-onetl-changelog-0-8-1 }
+
+## Features { #DBR-onetl-changelog-0-8-1-features }
+
+- Add `@slot` decorator to public methods of:
+
+ - `DBConnection`
+ - `FileConnection`
+ - `DBReader`
+ - `DBWriter`
+ - `FileDownloader`
+ - `FileUploader`
+ - `FileMover` ([#49](https://github.com/MTSWebServices/onetl/pull/49))
+
+- Add `workers` field to `FileDownloader` / `FileUploader` / `FileMover` `Options` classes.
+
+ This allows to speed up all file operations using parallel threads. ([#57](https://github.com/MTSWebServices/onetl/pull/57))
+
+## Improvements { #DBR-onetl-changelog-0-8-1-improvements }
+
+- Add documentation for HWM store `.get` and `.save` methods. ([#49](https://github.com/MTSWebServices/onetl/pull/49))
+
+- Improve Readme:
+
+ - Move `Quick start` section from documentation
+ - Add `Non-goals` section
+ - Fix code blocks indentation ([#50](https://github.com/MTSWebServices/onetl/pull/50))
+
+- Improve Contributing guide:
+
+ - Move `Develop` section from Readme
+ - Move `docs/changelog/README.rst` content
+ - Add `Limitations` section
+ - Add instruction of creating a fork and building documentation ([#50](https://github.com/MTSWebServices/onetl/pull/50))
+
+- Remove duplicated checks for source file existence in `FileDownloader` / `FileMover`. ([#57](https://github.com/MTSWebServices/onetl/pull/57))
+
+- Update default logging format to include thread name. ([#57](https://github.com/MTSWebServices/onetl/pull/57))
+
+## Bug Fixes { #DBR-onetl-changelog-0-8-1-bug-fixes }
+
+- Fix `S3.list_dir('/')` returns empty list on latest Minio version. ([#58](https://github.com/MTSWebServices/onetl/pull/58))
diff --git a/mddocs/docs/changelog/0.9.0.md b/mddocs/docs/changelog/0.9.0.md
new file mode 100644
index 000000000..b87049ec4
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.0.md
@@ -0,0 +1,122 @@
+# 0.9.0 (2023-08-17) { #DBR-onetl-changelog-0-9-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-9-0-breaking-changes }
+
+- Rename methods:
+
+ - `DBConnection.read_df` → `DBConnection.read_source_as_df`
+ - `DBConnection.write_df` → `DBConnection.write_df_to_target` ([#66](https://github.com/MTSWebServices/onetl/pull/66))
+
+- Rename classes:
+
+ - `HDFS.slots` → `HDFS.Slots`
+ - `Hive.slots` → `Hive.Slots`
+
+ Old names are left intact, but will be removed in v1.0.0 ([#103](https://github.com/MTSWebServices/onetl/pull/103))
+
+- Rename options to make them self-explanatory:
+
+ - `Hive.WriteOptions(mode="append")` → `Hive.WriteOptions(if_exists="append")`
+ - `Hive.WriteOptions(mode="overwrite_table")` → `Hive.WriteOptions(if_exists="replace_entire_table")`
+ - `Hive.WriteOptions(mode="overwrite_partitions")` → `Hive.WriteOptions(if_exists="replace_overlapping_partitions")`
+ - `JDBC.WriteOptions(mode="append")` → `JDBC.WriteOptions(if_exists="append")`
+ - `JDBC.WriteOptions(mode="overwrite")` → `JDBC.WriteOptions(if_exists="replace_entire_table")`
+ - `Greenplum.WriteOptions(mode="append")` → `Greenplum.WriteOptions(if_exists="append")`
+ - `Greenplum.WriteOptions(mode="overwrite")` → `Greenplum.WriteOptions(if_exists="replace_entire_table")`
+  - `MongoDB.WriteOptions(mode="append")` → `MongoDB.WriteOptions(if_exists="append")`
+  - `MongoDB.WriteOptions(mode="overwrite")` → `MongoDB.WriteOptions(if_exists="replace_entire_collection")`
+ - `FileDownloader.Options(mode="error")` → `FileDownloader.Options(if_exists="error")`
+ - `FileDownloader.Options(mode="ignore")` → `FileDownloader.Options(if_exists="ignore")`
+ - `FileDownloader.Options(mode="overwrite")` → `FileDownloader.Options(if_exists="replace_file")`
+ - `FileDownloader.Options(mode="delete_all")` → `FileDownloader.Options(if_exists="replace_entire_directory")`
+ - `FileUploader.Options(mode="error")` → `FileUploader.Options(if_exists="error")`
+ - `FileUploader.Options(mode="ignore")` → `FileUploader.Options(if_exists="ignore")`
+ - `FileUploader.Options(mode="overwrite")` → `FileUploader.Options(if_exists="replace_file")`
+ - `FileUploader.Options(mode="delete_all")` → `FileUploader.Options(if_exists="replace_entire_directory")`
+ - `FileMover.Options(mode="error")` → `FileMover.Options(if_exists="error")`
+ - `FileMover.Options(mode="ignore")` → `FileMover.Options(if_exists="ignore")`
+ - `FileMover.Options(mode="overwrite")` → `FileMover.Options(if_exists="replace_file")`
+ - `FileMover.Options(mode="delete_all")` → `FileMover.Options(if_exists="replace_entire_directory")`
+
+ Old names are left intact, but will be removed in v1.0.0 ([#108](https://github.com/MTSWebServices/onetl/pull/108))
+
+- Rename `onetl.log.disable_clients_logging()` to `onetl.log.setup_clients_logging()`. ([#120](https://github.com/MTSWebServices/onetl/pull/120))
+
+## Features { #DBR-onetl-changelog-0-9-0-features }
+
+- Add new methods returning Maven packages for specific connection class:
+
+ - `Clickhouse.get_packages()`
+ - `MySQL.get_packages()`
+ - `Postgres.get_packages()`
+ - `Teradata.get_packages()`
+ - `MSSQL.get_packages(java_version="8")`
+ - `Oracle.get_packages(java_version="8")`
+ - `Greenplum.get_packages(scala_version="2.12")`
+ - `MongoDB.get_packages(scala_version="2.12")`
+ - `Kafka.get_packages(spark_version="3.4.1", scala_version="2.12")`
+
+ Deprecate old syntax:
+
+ - `Clickhouse.package`
+ - `MySQL.package`
+ - `Postgres.package`
+ - `Teradata.package`
+ - `MSSQL.package`
+ - `Oracle.package`
+ - `Greenplum.package_spark_2_3`
+ - `Greenplum.package_spark_2_4`
+ - `Greenplum.package_spark_3_2`
+ - `MongoDB.package_spark_3_2`
+ - `MongoDB.package_spark_3_3`
+ - `MongoDB.package_spark_3_4` ([#87](https://github.com/MTSWebServices/onetl/pull/87))
+
+- Allow to set client modules log level in `onetl.log.setup_clients_logging()`.
+
+ Allow to enable underlying client modules logging in `onetl.log.setup_logging()` by providing additional argument `enable_clients=True`.
+ This is useful for debug. ([#120](https://github.com/MTSWebServices/onetl/pull/120))
+
+- Added support for reading and writing data to Kafka topics.
+
+ For these operations, new classes were added.
+
+ - `Kafka` ([#54](https://github.com/MTSWebServices/onetl/pull/54), [#60](https://github.com/MTSWebServices/onetl/pull/60), [#72](https://github.com/MTSWebServices/onetl/pull/72), [#84](https://github.com/MTSWebServices/onetl/pull/84), [#87](https://github.com/MTSWebServices/onetl/pull/87), [#89](https://github.com/MTSWebServices/onetl/pull/89), [#93](https://github.com/MTSWebServices/onetl/pull/93), [#96](https://github.com/MTSWebServices/onetl/pull/96), [#102](https://github.com/MTSWebServices/onetl/pull/102), [#104](https://github.com/MTSWebServices/onetl/pull/104))
+ - `Kafka.PlaintextProtocol` ([#79](https://github.com/MTSWebServices/onetl/pull/79))
+ - `Kafka.SSLProtocol` ([#118](https://github.com/MTSWebServices/onetl/pull/118))
+ - `Kafka.BasicAuth` ([#63](https://github.com/MTSWebServices/onetl/pull/63), [#77](https://github.com/MTSWebServices/onetl/pull/77))
+ - `Kafka.KerberosAuth` ([#63](https://github.com/MTSWebServices/onetl/pull/63), [#77](https://github.com/MTSWebServices/onetl/pull/77), [#110](https://github.com/MTSWebServices/onetl/pull/110))
+ - `Kafka.ScramAuth` ([#115](https://github.com/MTSWebServices/onetl/pull/115))
+ - `Kafka.Slots` ([#109](https://github.com/MTSWebServices/onetl/pull/109))
+ - `Kafka.ReadOptions` ([#68](https://github.com/MTSWebServices/onetl/pull/68))
+ - `Kafka.WriteOptions` ([#68](https://github.com/MTSWebServices/onetl/pull/68))
+
+ Currently, Kafka does not support incremental read strategies, this will be implemented in future releases.
+
+- Added support for reading files as Spark DataFrame and saving DataFrame as Files.
+
+ For these operations, new classes were added.
+
+ FileDFConnections:
+
+ - `SparkHDFS` ([#98](https://github.com/MTSWebServices/onetl/pull/98))
+ - `SparkS3` ([#94](https://github.com/MTSWebServices/onetl/pull/94), [#100](https://github.com/MTSWebServices/onetl/pull/100), [#124](https://github.com/MTSWebServices/onetl/pull/124))
+ - `SparkLocalFS` ([#67](https://github.com/MTSWebServices/onetl/pull/67))
+
+ High-level classes:
+
+ - `FileDFReader` ([#73](https://github.com/MTSWebServices/onetl/pull/73))
+ - `FileDFWriter` ([#81](https://github.com/MTSWebServices/onetl/pull/81))
+
+ File formats:
+
+ - `Avro` ([#69](https://github.com/MTSWebServices/onetl/pull/69))
+ - `CSV` ([#92](https://github.com/MTSWebServices/onetl/pull/92))
+ - `JSON` ([#83](https://github.com/MTSWebServices/onetl/pull/83))
+ - `JSONLine` ([#83](https://github.com/MTSWebServices/onetl/pull/83))
+ - `ORC` ([#86](https://github.com/MTSWebServices/onetl/pull/86))
+ - `Parquet` ([#88](https://github.com/MTSWebServices/onetl/pull/88))
+
+## Improvements { #DBR-onetl-changelog-0-9-0-improvements }
+
+- Remove redundant checks for driver availability in Greenplum and MongoDB connections. ([#67](https://github.com/MTSWebServices/onetl/pull/67))
+- Check of Java class availability moved from `.check()` method to connection constructor. ([#97](https://github.com/MTSWebServices/onetl/pull/97))
diff --git a/mddocs/docs/changelog/0.9.1.md b/mddocs/docs/changelog/0.9.1.md
new file mode 100644
index 000000000..40cb722da
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.1.md
@@ -0,0 +1,7 @@
+# 0.9.1 (2023-08-17) { #DBR-onetl-changelog-0-9-1 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-9-1-bug-fixes }
+
+- Fixed bug when the number of threads created by `FileDownloader` / `FileUploader` / `FileMover` was
+  not `min(workers, len(files))`, but `max(workers, len(files))`, leading to creating too many workers
+  on a large files list.
diff --git a/mddocs/docs/changelog/0.9.2.md b/mddocs/docs/changelog/0.9.2.md
new file mode 100644
index 000000000..349e907e1
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.2.md
@@ -0,0 +1,23 @@
+# 0.9.2 (2023-09-06) { #DBR-onetl-changelog-0-9-2 }
+
+## Features { #DBR-onetl-changelog-0-9-2-features }
+
+- Add `if_exists="ignore"` and `error` to `Greenplum.WriteOptions` ([#142](https://github.com/MTSWebServices/onetl/pull/142))
+
+## Improvements { #DBR-onetl-changelog-0-9-2-improvements }
+
+- Improve validation messages while writing dataframe to Kafka. ([#131](https://github.com/MTSWebServices/onetl/pull/131))
+
+- Improve documentation:
+
+ - Add notes about reading and writing to database connections documentation
+ - Add notes about executing statements in JDBC and Greenplum connections
+
+## Bug Fixes { #DBR-onetl-changelog-0-9-2-bug-fixes }
+
+- Fixed validation when `headers` column is written to Kafka with default `Kafka.WriteOptions()` - default value was `False`,
+  but instead of raising an exception, the column value was just ignored. ([#131](https://github.com/MTSWebServices/onetl/pull/131))
+- Fix reading data from Oracle with `partitioningMode="range"` without explicitly set `lowerBound` / `upperBound`. ([#133](https://github.com/MTSWebServices/onetl/pull/133))
+- Update Kafka documentation with SSLProtocol usage. ([#136](https://github.com/MTSWebServices/onetl/pull/136))
+- Raise exception if someone tries to read data from Kafka topic which does not exist. ([#138](https://github.com/MTSWebServices/onetl/pull/138))
+- Allow to pass Kafka topics with name like `some.topic.name` to DBReader. Same for MongoDB collections. ([#139](https://github.com/MTSWebServices/onetl/pull/139))
diff --git a/mddocs/docs/changelog/0.9.3.md b/mddocs/docs/changelog/0.9.3.md
new file mode 100644
index 000000000..c8f24f4ba
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.3.md
@@ -0,0 +1,5 @@
+# 0.9.3 (2023-09-06) { #DBR-onetl-changelog-0-9-3 }
+
+## Bug Fixes { #DBR-onetl-changelog-0-9-3-bug-fixes }
+
+- Fix documentation build
diff --git a/mddocs/docs/changelog/0.9.4.md b/mddocs/docs/changelog/0.9.4.md
new file mode 100644
index 000000000..d74ea2564
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.4.md
@@ -0,0 +1,24 @@
+# 0.9.4 (2023-09-26) { #DBR-onetl-changelog-0-9-4 }
+
+## Features { #DBR-onetl-changelog-0-9-4-features }
+
+- Add `Excel` file format support. ([#148](https://github.com/MTSWebServices/onetl/pull/148))
+- Add `Samba` file connection.
+ It is now possible to download and upload files to Samba shared folders using `FileDownloader`/`FileUploader`. ([#150](https://github.com/MTSWebServices/onetl/pull/150))
+- Add `if_exists="ignore"` and `error` to `Hive.WriteOptions` ([#143](https://github.com/MTSWebServices/onetl/pull/143))
+- Add `if_exists="ignore"` and `error` to `JDBC.WriteOptions` ([#144](https://github.com/MTSWebServices/onetl/pull/144))
+- Add `if_exists="ignore"` and `error` to `MongoDB.WriteOptions` ([#145](https://github.com/MTSWebServices/onetl/pull/145))
+
+## Improvements { #DBR-onetl-changelog-0-9-4-improvements }
+
+- Add documentation about different ways of passing packages to Spark session. ([#151](https://github.com/MTSWebServices/onetl/pull/151))
+- Drastically improve `Greenplum` documentation:
+ - Added information about network ports, grants, `pg_hba.conf` and so on.
+ - Added interaction schemas for reading, writing and executing statements in Greenplum.
+ - Added recommendations about reading data from views and `JOIN` results from Greenplum. ([#154](https://github.com/MTSWebServices/onetl/pull/154))
+- Make `.fetch` and `.execute` methods of DB connections thread-safe. Each thread works with its own connection. ([#156](https://github.com/MTSWebServices/onetl/pull/156))
+- Call `.close()` on `FileConnection` when it is removed by garbage collector. ([#156](https://github.com/MTSWebServices/onetl/pull/156))
+
+## Bug Fixes { #DBR-onetl-changelog-0-9-4-bug-fixes }
+
+- Fix issue when stopping Python interpreter calls `JDBCMixin.close()`, but it is finished with exceptions. ([#156](https://github.com/MTSWebServices/onetl/pull/156))
diff --git a/mddocs/docs/changelog/0.9.5.md b/mddocs/docs/changelog/0.9.5.md
new file mode 100644
index 000000000..b86961d3c
--- /dev/null
+++ b/mddocs/docs/changelog/0.9.5.md
@@ -0,0 +1,14 @@
+# 0.9.5 (2023-10-10) { #DBR-onetl-changelog-0-9-5 }
+
+## Features { #DBR-onetl-changelog-0-9-5-features }
+
+- Add `XML` file format support. ([#163](https://github.com/MTSWebServices/onetl/pull/163))
+- Tested compatibility with Spark 3.5.0. `MongoDB` and `Excel` are not supported yet, but other packages are. ([#159](https://github.com/MTSWebServices/onetl/pull/159))
+
+## Improvements { #DBR-onetl-changelog-0-9-5-improvements }
+
+- Add check to all DB and FileDF connections that Spark session is alive. ([#164](https://github.com/MTSWebServices/onetl/pull/164))
+
+## Bug Fixes { #DBR-onetl-changelog-0-9-5-bug-fixes }
+
+- Fix `Hive.check()` behavior when Hive Metastore is not available. ([#164](https://github.com/MTSWebServices/onetl/pull/164))
diff --git a/mddocs/docs/changelog/DRAFT.md b/mddocs/docs/changelog/DRAFT.md
new file mode 100644
index 000000000..912b7d7f7
--- /dev/null
+++ b/mddocs/docs/changelog/DRAFT.md
@@ -0,0 +1,3 @@
+```{eval-rst}
+.. towncrier-draft-entries:: |release| [UNRELEASED]
+```
diff --git a/mddocs/docs/changelog/NEXT_RELEASE.md b/mddocs/docs/changelog/NEXT_RELEASE.md
new file mode 100644
index 000000000..a9831f9d1
--- /dev/null
+++ b/mddocs/docs/changelog/NEXT_RELEASE.md
@@ -0,0 +1 @@
+% towncrier release notes start
diff --git a/mddocs/docs/changelog/index.md b/mddocs/docs/changelog/index.md
new file mode 100644
index 000000000..01026b43c
--- /dev/null
+++ b/mddocs/docs/changelog/index.md
@@ -0,0 +1,33 @@
+# Changelog { #DBR-onetl-changelog }
+
+- [0.15.0 (2025-12-08)][DBR-onetl-changelog-0-15-0]
+- [0.14.1 (2025-11-25)][DBR-onetl-changelog-0-14-1]
+- [0.14.0 (2025-09-08)][DBR-onetl-changelog-0-14-0]
+- [0.13.5 (2025-04-14)][DBR-onetl-changelog-0-13-5]
+- [0.13.4 (2025-03-20)][DBR-onetl-changelog-0-13-4]
+- [0.13.3 (2025-03-11)][DBR-onetl-changelog-0-13-3]
+- [0.13.1 (2025-03-06)][DBR-onetl-changelog-0-13-1]
+- [0.13.0 (2025-02-24)][DBR-onetl-changelog-0-13-0]
+- [0.12.5 (2024-12-03)][DBR-onetl-changelog-0-12-5]
+- [0.12.4 (2024-11-27)][DBR-onetl-changelog-0-12-4]
+- [0.12.3 (2024-11-22)][DBR-onetl-changelog-0-12-3]
+- [0.12.2 (2024-11-12)][DBR-onetl-changelog-0-12-2]
+- [0.12.1 (2024-10-28)][DBR-onetl-changelog-0-12-1]
+- [0.12.0 (2024-09-03)][DBR-onetl-changelog-0-12-0]
+- [0.11.2 (2024-09-02)][DBR-onetl-changelog-0-11-2]
+- [0.11.1 (2024-05-29)][DBR-onetl-changelog-0-11-1]
+- [0.11.0 (2024-05-27)][DBR-onetl-changelog-0-11-0]
+- [0.10.2 (2024-03-21)][DBR-onetl-changelog-0-10-2]
+- [0.10.1 (2024-02-05)][DBR-onetl-changelog-0-10-1]
+- [0.10.0 (2023-12-18)][DBR-onetl-changelog-0-10-0]
+- [0.9.5 (2023-10-10)][DBR-onetl-changelog-0-9-5]
+- [0.9.4 (2023-09-26)][DBR-onetl-changelog-0-9-4]
+- [0.9.3 (2023-09-06)][DBR-onetl-changelog-0-9-3]
+- [0.9.2 (2023-09-06)][DBR-onetl-changelog-0-9-2]
+- [0.9.1 (2023-08-17)][DBR-onetl-changelog-0-9-1]
+- [0.9.0 (2023-08-17)][DBR-onetl-changelog-0-9-0]
+- [0.8.1 (2023-07-10)][DBR-onetl-changelog-0-8-1]
+- [0.8.0 (2023-05-31)][DBR-onetl-changelog-0-8-0]
+- [0.7.2 (2023-05-24)][DBR-onetl-changelog-0-7-2]
+- [0.7.1 (2023-05-23)][DBR-onetl-changelog-0-7-1]
+- [0.7.0 (2023-05-15)][DBR-onetl-changelog-0-7-0]
diff --git a/mddocs/docs/changelog/next_release/.keep b/mddocs/docs/changelog/next_release/.keep
new file mode 100644
index 000000000..e69de29bb
diff --git a/mddocs/docs/concepts.md b/mddocs/docs/concepts.md
new file mode 100644
index 000000000..98f0d3191
--- /dev/null
+++ b/mddocs/docs/concepts.md
@@ -0,0 +1,369 @@
+# Concepts { #DBR-onetl-concepts }
+
+Here you can find detailed documentation about each one of the onETL concepts and how to use them.
+
+## Connection { #DBR-onetl-concepts-connection }
+
+### Connection basics { #DBR-onetl-concepts-connection-basics }
+
+onETL is used to pull and push data into other systems, and so it has a first-class `Connection` concept for storing credentials that are used to communicate with external systems.
+
+A `Connection` is essentially a set of parameters, such as username, password, hostname.
+
+To create a connection to a specific storage type, you must use a class that matches the storage type. The class name is the same as the storage type name (`Oracle`, `MSSQL`, `SFTP`, etc):
+
+```python
+from onetl.connection import SFTP
+
+sftp = SFTP(
+ host="sftp.test.com",
+ user="onetl",
+ password="onetl",
+)
+```
+
+All connection types are inherited from the parent class `BaseConnection`.
+
+### Connection class diagram { #DBR-onetl-concepts-connection-class-diagram }
+
+```mermaid
+classDiagram
+ BaseConnection <|-- DBConnection
+ DBConnection <|-- Hive
+ DBConnection <|-- Greenplum
+ DBConnection <|-- MongoDB
+ DBConnection <|-- Kafka
+ DBConnection <|-- JDBCConnection
+ JDBCConnection <|-- Clickhouse
+ JDBCConnection <|-- MSSQL
+ JDBCConnection <|-- MySQL
+ JDBCConnection <|-- Postgres
+ JDBCConnection <|-- Oracle
+
+ BaseConnection <|-- FileConnection
+ FileConnection <|-- FTP
+ FileConnection <|-- FTPS
+ FileConnection <|-- HDFS
+ FileConnection <|-- WebDAV
+ FileConnection <|-- Samba
+ FileConnection <|-- SFTP
+ FileConnection <|-- S3
+ BaseConnection <|-- FileDFConnection
+ FileDFConnection <|-- SparkHDFS
+ FileDFConnection <|-- SparkLocalFS
+ FileDFConnection <|-- SparkS3
+```
+
+### DBConnection { #DBR-onetl-concepts-dbconnection }
+
+Classes inherited from `DBConnection` could be used for accessing databases.
+
+A `DBConnection` could be instantiated as follows:
+
+```python
+from onetl.connection import MSSQL
+
+mssql = MSSQL(
+ host="mssqldb.demo.com",
+ user="onetl",
+ password="onetl",
+ database="Telecom",
+ spark=spark,
+)
+```
+
+where **spark** is the current SparkSession.
+`onETL` uses `Spark` and specific Java connectors under the hood to work with databases.
+
+For a description of other parameters, see the documentation for the [available DBConnections][DBR-onetl-connection-db-connection-db-connections].
+
+### FileConnection { #DBR-onetl-concepts-fileconnection }
+
+Classes inherited from `FileConnection` could be used to access files stored on different file systems/file servers.
+
+A `FileConnection` could be instantiated as follows:
+
+```python
+from onetl.connection import SFTP
+
+sftp = SFTP(
+ host="sftp.test.com",
+ user="onetl",
+ password="onetl",
+)
+```
+
+For a description of other parameters, see the documentation for the [available FileConnections][DBR-onetl-connection-file-connection-file-connections].
+
+### FileDFConnection { #DBR-onetl-concepts-filedfconnection }
+
+Classes inherited from `FileDFConnection` could be used for accessing files as Spark DataFrames.
+
+A `FileDFConnection` could be instantiated as follows:
+
+```python
+from onetl.connection import SparkHDFS
+
+spark_hdfs = SparkHDFS(
+ host="namenode1.domain.com",
+ cluster="mycluster",
+ spark=spark,
+)
+```
+
+where **spark** is the current SparkSession.
+`onETL` uses `Spark` and specific Java connectors under the hood to work with DataFrames.
+
+For a description of other parameters, see the documentation for the [available FileDFConnections][DBR-onetl-connection-file-df-connection-file-dataframe-connections].
+
+### Checking connection availability { #DBR-onetl-concepts-checking-connection-availability }
+
+Once you have created a connection, you can check the database/filesystem availability using the method `check()`:
+
+```python
+mssql.check()
+sftp.check()
+spark_hdfs.check()
+```
+
+It will raise an exception if database/filesystem cannot be accessed.
+
+This method returns connection itself, so you can create connection and immediately check its availability:
+
+```python
+mssql = MSSQL(
+ host="mssqldb.demo.com",
+ user="onetl",
+ password="onetl",
+ database="Telecom",
+ spark=spark,
+).check() # <--
+```
+
+## Extract/Load data { #DBR-onetl-concepts-extractload-data }
+
+### Basics { #DBR-onetl-concepts-basics }
+
+As we said above, onETL is used to extract data from and load data into remote systems.
+
+onETL provides several classes for this:
+
+* [DBReader][DBR-onetl-db-reader]
+* [DBWriter][DBR-onetl-db-writer]
+* [FileDFReader][DBR-onetl-file-df-reader-filedf-reader-0]
+* [FileDFWriter][DBR-onetl-file-df-writer-filedf-writer-0]
+* [FileDownloader][DBR-onetl-file-downloader-0]
+* [FileUploader][DBR-onetl-file-uploader-0]
+* [FileMover][DBR-onetl-file-mover-0]
+
+All of these classes have a method `run()` that starts extracting/loading the data:
+
+```python
+from onetl.db import DBReader, DBWriter
+
+reader = DBReader(
+ connection=mssql,
+ source="dbo.demo_table",
+ columns=["column_1", "column_2"],
+)
+
+# Read data as Spark DataFrame
+df = reader.run()
+
+db_writer = DBWriter(
+ connection=hive,
+ target="dl_sb.demo_table",
+)
+
+# Save Spark DataFrame to Hive table
+db_writer.run(df)
+```
+
+### Extract data { #DBR-onetl-concepts-extract-data }
+
+To extract data you can use classes:
+
+| | Use case | Connection | `run()` gets | `run()` returns |
+| -- | - | - | - | --- |
+| [`DBReader`][DBR-onetl-db-reader] | Reading data from a database | Any [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] | - | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) |
+| [`FileDFReader`][DBR-onetl-file-df-reader-filedf-reader-0] | Read data from a file or set of files | Any [`FileDFConnection`][DBR-onetl-connection-file-df-connection-file-dataframe-connections] | No input, or List[File path on FileSystem] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) |
+| [`FileDownloader`][DBR-onetl-file-downloader-0] | Download files from remote FS to local FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | No input, or List[File path on remote FileSystem] | [`DownloadResult`][DBR-onetl-file-downloader-result] |
+
+### Load data { #DBR-onetl-concepts-load-data }
+
+To load data you can use classes:
+
+| | Use case | Connection | `run()` gets | `run()` returns |
+| - | -- | - | --- | -- |
+| [`DBWriter`][DBR-onetl-db-writer] | Writing data from a DataFrame to a database | Any [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None |
+| [`FileDFWriter`][DBR-onetl-file-df-writer-filedf-writer-0] | Writing data from a DataFrame to a folder | Any [`FileDFConnection`][DBR-onetl-connection-file-df-connection-file-dataframe-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None |
+| [`FileUploader`][DBR-onetl-file-uploader-0] | Uploading files from a local FS to remote FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | List[File path on local FileSystem] | [`UploadResult`][DBR-onetl-file-uploader-result] |
+
+### Manipulate data { #DBR-onetl-concepts-manipulate-data }
+
+To manipulate data you can use classes:
+
+| | Use case | Connection | `run()` gets | `run()` returns |
+| - | - | -- | -- | - |
+| [`FileMover`][DBR-onetl-file-mover-0] | Move files between directories in remote FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | List[File path on remote FileSystem] | [`MoveResult`][DBR-onetl-file-mover-result] |
+
+### Options { #DBR-onetl-concepts-options }
+
+Extract and load classes have an `options` parameter, which has a special meaning:
+
+* all other parameters - *WHAT* we extract / *WHERE* we load to
+* `options` parameter - *HOW* we extract/load data
+
+```python
+db_reader = DBReader(
+ # WHAT do we read:
+ connection=mssql,
+ source="dbo.demo_table", # some table from MSSQL
+ columns=["column_1", "column_2"], # but only specific set of columns
+ where="column_2 > 1000", # only rows matching the clause
+ # HOW do we read:
+ options=MSSQL.ReadOptions(
+ numPartitions=10, # read in 10 parallel jobs
+ partitionColumn="id", # balance data read by assigning each job a part of data using `hash(id) mod N` expression
+ partitioningMode="hash",
+ fetchsize=1000, # each job will fetch block of 1000 rows each on every read attempt
+ ),
+)
+
+db_writer = DBWriter(
+ # WHERE do we write to - to some table in Hive
+ connection=hive,
+ target="dl_sb.demo_table",
+ # HOW do we write - overwrite all the data in the existing table
+ options=Hive.WriteOptions(if_exists="replace_entire_table"),
+)
+
+file_downloader = FileDownloader(
+ # WHAT do we download - files from some dir in SFTP
+ connection=sftp,
+ source_path="/source",
+ filters=[Glob("*.csv")], # only CSV files
+ limits=[MaxFilesCount(1000)], # 1000 files max
+ # WHERE do we download to - a specific dir on local FS
+ local_path="/some",
+ # HOW do we download:
+ options=FileDownloader.Options(
+ delete_source=True, # after downloading each file remove it from source_path
+ if_exists="replace_file", # replace existing files in the local_path
+ ),
+)
+
+file_uploader = FileUploader(
+ # WHAT do we upload - files from some local dir
+ local_path="/source",
+ # WHERE do we upload to - a specific remote dir in HDFS
+ connection=hdfs,
+ target_path="/some",
+ # HOW do we upload:
+ options=FileUploader.Options(
+ delete_local=True, # after uploading each file remove it from local_path
+ if_exists="replace_file", # replace existing files in the target_path
+ ),
+)
+
+file_mover = FileMover(
+ # WHAT do we move - files in some remote dir in HDFS
+ source_path="/source",
+ connection=hdfs,
+ # WHERE do we move files to
+ target_path="/some", # a specific remote dir within the same HDFS connection
+ # HOW do we load - replace existing files in the target_path
+ options=FileMover.Options(if_exists="replace_file"),
+)
+
+file_df_reader = FileDFReader(
+ # WHAT do we read - *.csv files from some dir in S3
+ connection=s3,
+ source_path="/source",
+ file_format=CSV(),
+ # HOW do we read - load files from /source/*.csv, not from /source/nested/*.csv
+ options=FileDFReader.Options(recursive=False),
+)
+
+file_df_writer = FileDFWriter(
+ # WHERE do we write to - as .csv files in some dir in S3
+ connection=s3,
+ target_path="/target",
+ file_format=CSV(),
+ # HOW do we write - replace all existing files in /target, if exists
+ options=FileDFWriter.Options(if_exists="replace_entire_directory"),
+)
+```
+
+More information about `options` could be found on [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] and [`FileDownloader`][DBR-onetl-file-downloader-0] / [`FileUploader`][DBR-onetl-file-uploader-0] / [`FileMover`][DBR-onetl-file-mover-0] / [`FileDFReader`][DBR-onetl-file-df-reader-filedf-reader-0] / [`FileDFWriter`][DBR-onetl-file-df-writer-filedf-writer-0] documentation.
+
+### Read Strategies { #DBR-onetl-concepts-read-strategies }
+
+onETL has several builtin strategies for reading data:
+
+1. [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] (default strategy)
+2. [Incremental strategy][DBR-onetl-strategy-incremental-strategy]
+3. [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+4. [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+
+For example, an incremental strategy allows you to get only new data from the table:
+
+```python
+from onetl.strategy import IncrementalStrategy
+
+reader = DBReader(
+ connection=mssql,
+ source="dbo.demo_table",
+ hwm_column="id", # detect new data based on value of "id" column
+)
+
+# first run
+with IncrementalStrategy():
+ df = reader.run()
+
+sleep(3600)
+
+# second run
+with IncrementalStrategy():
+ # only rows, that appeared in the source since previous run
+ df = reader.run()
+```
+
+or get only files which were not downloaded before:
+
+```python
+from onetl.strategy import IncrementalStrategy
+
+file_downloader = FileDownloader(
+ connection=sftp,
+ source_path="/remote",
+ local_path="/local",
+ hwm_type="file_list", # save all downloaded files to a list, and exclude files already present in this list
+)
+
+# first run
+with IncrementalStrategy():
+ files = file_downloader.run()
+
+sleep(3600)
+
+# second run
+with IncrementalStrategy():
+ # only files, that appeared in the source since previous run
+ files = file_downloader.run()
+```
+
+Most strategies are based on [`HWM`][DBR-onetl-hwm-store-hwm]. Please check each strategy's documentation for more details.
+
+### Why just not use Connection class for extract/load? { #DBR-onetl-concepts-why-just-not-use-connection-class-for-extractload }
+
+Connections are very simple, they have only a set of some basic operations,
+like `mkdir`, `remove_file`, `get_table_schema`, and so on.
+
+High-level operations, like
+
+* [`strategy`][DBR-onetl-strategy-read-strategies] support
+* Handling metadata push/pull
+* Handling different options, like `if_exists="replace_file"` in case of file download/upload
+
+are moved to a separate class which calls the connection object methods to perform some complex logic.
diff --git a/mddocs/docs/connection/db_connection/clickhouse/connection.md b/mddocs/docs/connection/db_connection/clickhouse/connection.md
new file mode 100644
index 000000000..225b088a4
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/connection.md
@@ -0,0 +1,8 @@
+# Clickhouse connection { #DBR-onetl-connection-db-connection-clickhouse-connection-0 }
+
+
+::: onetl.connection.db_connection.clickhouse.connection.Clickhouse
+ options:
+ members:
+ - get_packages
+ - check
diff --git a/mddocs/docs/connection/db_connection/clickhouse/execute.md b/mddocs/docs/connection/db_connection/clickhouse/execute.md
new file mode 100644
index 000000000..02c0e720a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/execute.md
@@ -0,0 +1,109 @@
+# Executing statements in Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse }
+
+!!! warning
+
+ Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame.
+
+ Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader] or [Clickhouse.sql][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql] instead.
+
+## How to { #DBR-onetl-connection-db-connection-clickhouse-execute-how-to }
+
+There are two ways to execute a statement in Clickhouse:
+
+### Use `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-use-clickhouse-fetch }
+
+Use this method to perform some `SELECT` query which returns a **small number of rows**, like reading
+Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts [Clickhouse.FetchOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions].
+
+
+!!! warning
+
+ Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping].
+
+#### Syntax support in `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-syntax-support-in-clickhouse-fetch }
+
+This method supports **any** query syntax supported by Clickhouse, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SELECT func(arg1, arg2)` - call function
+- ✅︎ `SHOW ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-examples-for-clickhouse-fetch }
+
+```python
+from onetl.connection import Clickhouse
+
+clickhouse = Clickhouse(...)
+
+df = clickhouse.fetch(
+ "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+ options=Clickhouse.FetchOptions(queryTimeout=10),
+)
+clickhouse.close()
+value = df.collect()[0][0] # get value from first row and first column
+```
+
+### Use `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-use-clickhouse-execute }
+
+Use this method to execute DDL and DML operations. Each method call runs the operation in a separate transaction, and then commits it.
+
+Method accepts [Clickhouse.ExecuteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseExecuteOptions].
+
+
+#### Syntax support in `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-syntax-support-in-clickhouse-execute }
+
+This method supports **any** query syntax supported by Clickhouse, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on
+- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on
+- ✅︎ other statements not mentioned here
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-examples-for-clickhouse-execute }
+
+```python
+from onetl.connection import Clickhouse
+
+clickhouse = Clickhouse(...)
+
+clickhouse.execute("DROP TABLE schema.table")
+clickhouse.execute(
+ """
+ CREATE TABLE schema.table (
+ id UInt8,
+ key String,
+ value Float32
+ )
+ ENGINE = MergeTree()
+ ORDER BY id
+ """,
+ options=Clickhouse.ExecuteOptions(queryTimeout=10),
+)
+```
+
+## Notes { #DBR-onetl-connection-db-connection-clickhouse-execute-notes }
+
+These methods **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame.
+
+So it should **NOT** be used to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader] or [Clickhouse.sql][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql] instead.
+
+## Options { #DBR-onetl-connection-db-connection-clickhouse-execute-options }
+
+
+::: onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
+
+::: onetl.connection.db_connection.clickhouse.options.ClickhouseExecuteOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/clickhouse/index.md b/mddocs/docs/connection/db_connection/clickhouse/index.md
new file mode 100644
index 000000000..25e55cf75
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/index.md
@@ -0,0 +1,17 @@
+# Clickhouse { #DBR-onetl-connection-db-connection-clickhouse }
+
+## Connection { #DBR-onetl-connection-db-connection-clickhouse-connection-1 }
+
+* [Prerequisites][DBR-onetl-connection-db-connection-clickhouse-prerequisites]
+* [Clickhouse connection][DBR-onetl-connection-db-connection-clickhouse-connection-0]
+
+## Operations { #DBR-onetl-connection-db-connection-clickhouse-operations }
+
+* [Reading from Clickhouse using `DBReader`][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader]
+* [Reading from Clickhouse using `Clickhouse.sql`][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql]
+* [Writing to Clickhouse using `DBWriter`][DBR-onetl-connection-db-connection-clickhouse-write-writing-to-clickhouse-using-dbwriter]
+* [Executing statements in Clickhouse][DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse]
+
+## Troubleshooting { #DBR-onetl-connection-db-connection-clickhouse-troubleshooting }
+
+* [Clickhouse <-> Spark type mapping][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping]
diff --git a/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md
new file mode 100644
index 000000000..39770111b
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md
@@ -0,0 +1,71 @@
+# Prerequisites { #DBR-onetl-connection-db-connection-clickhouse-prerequisites }
+
+## Version Compatibility { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-version-compatibility }
+
+- Clickhouse server versions:
+ - Officially declared: 22.8 or higher
+ - Actually tested: 21.1, 25.8
+- Spark versions: 3.2.x - 4.1.x
+- Java versions: 8 - 22
+
+See [official documentation](https://clickhouse.com/docs/en/integrations/java#jdbc-driver).
+
+## Installing PySpark { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-installing-pyspark }
+
+To use Clickhouse connector you should have PySpark installed (or injected to `sys.path`)
+BEFORE creating the connector instance.
+
+See [installation instruction][DBR-onetl-install-spark] for more details.
+
+## Connecting to Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connecting-to-clickhouse }
+
+### Connection port { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connection-port }
+
+Connector can only use **HTTP** (usually `8123` port) or **HTTPS** (usually `8443` port) protocol.
+
+TCP and GRPC protocols are NOT supported.
+
+### Connecting to cluster { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connecting-to-cluster }
+
+It is possible to connect to a Clickhouse cluster, and use its load balancing capabilities to read or write data in parallel.
+Each Spark executor can connect to random Clickhouse nodes, instead of sending all the data to a node specified in connection params.
+
+This requires all Clickhouse servers to run on different hosts, and **listen on the same HTTP port**.
+Set `auto_discovery=True` to enable this feature (disabled by default):
+
+```python
+Clickhouse(
+ host="node1.of.cluster",
+ port=8123,
+ extra={
+ "auto_discovery": True,
+ "load_balancing_policy": "roundRobin",
+ },
+)
+```
+
+See [official documentation](https://clickhouse.com/docs/en/integrations/java#configuring-node-discovery-load-balancing-and-failover).
+
+### Required grants { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-required-grants }
+
+Ask your Clickhouse cluster administrator to set following grants for a user,
+used for creating a connection:
+
+=== "Read + Write"
+
+ ```sql
+ -- allow creating tables in the target schema
+ GRANT CREATE TABLE ON myschema.* TO username;
+
+ -- allow read & write access to specific table
+ GRANT SELECT, INSERT ON myschema.mytable TO username;
+ ```
+
+=== "Read only"
+
+ ```sql
+ -- allow read access to specific table
+ GRANT SELECT ON myschema.mytable TO username;
+ ```
+
+More details can be found in [official documentation](https://clickhouse.com/docs/en/sql-reference/statements/grant).
diff --git a/mddocs/docs/connection/db_connection/clickhouse/read.md b/mddocs/docs/connection/db_connection/clickhouse/read.md
new file mode 100644
index 000000000..f42f94b8a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/read.md
@@ -0,0 +1,85 @@
+# Reading from Clickhouse using `DBReader` { #DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader }
+
+[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading,
+but does not support custom queries, like `JOIN`.
+
+!!! warning
+
+ Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping]
+
+## Supported DBReader features { #DBR-onetl-connection-db-connection-clickhouse-read-supported-dbreader-features }
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+ - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy]
+ - ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy]
+ - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+ - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+- ❌ `hint` (is not supported by Clickhouse)
+- ❌ `df_schema`
+- ✅︎ `options` (see [Clickhouse.ReadOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseReadOptions])
+
+## Examples { #DBR-onetl-connection-db-connection-clickhouse-read-examples }
+
+### Snapshot strategy { #DBR-onetl-connection-db-connection-clickhouse-read-snapshot-strategy }
+
+```python
+from onetl.connection import Clickhouse
+from onetl.db import DBReader
+
+clickhouse = Clickhouse(...)
+
+reader = DBReader(
+ connection=clickhouse,
+ source="schema.table",
+ columns=["id", "key", "CAST(value AS String) value", "updated_dt"],
+ where="key = 'something'",
+ options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10),
+)
+df = reader.run()
+
+```
+
+### Incremental strategy { #DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy }
+
+```python
+from onetl.connection import Clickhouse
+from onetl.db import DBReader
+from onetl.strategy import IncrementalStrategy
+
+clickhouse = Clickhouse(...)
+
+reader = DBReader(
+ connection=clickhouse,
+ source="schema.table",
+ columns=["id", "key", "CAST(value AS String) value", "updated_dt"],
+ where="key = 'something'",
+ hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"),
+ options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10),
+)
+
+with IncrementalStrategy():
+ df = reader.run()
+```
+
+## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-read-recommendations }
+
+### Select only required columns { #DBR-onetl-connection-db-connection-clickhouse-read-select-only-required-columns }
+
+Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Clickhouse to Spark.
+
+### Pay attention to `where` value { #DBR-onetl-connection-db-connection-clickhouse-read-pay-attention-to-where-value }
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause.
+This both reduces the amount of data sent from Clickhouse to Spark, and may also improve performance of the query.
+Especially if there are indexes or partitions for columns used in `where` clause.
+
+## Options { #DBR-onetl-connection-db-connection-clickhouse-read-options }
+
+
+::: onetl.connection.db_connection.clickhouse.options.ClickhouseReadOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/clickhouse/sql.md b/mddocs/docs/connection/db_connection/clickhouse/sql.md
new file mode 100644
index 000000000..3145a42c0
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/sql.md
@@ -0,0 +1,69 @@
+# Reading from Clickhouse using `Clickhouse.sql` { #DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql }
+
+`Clickhouse.sql` allows passing custom SQL query, but does not support incremental strategies.
+
+!!! warning
+
+ Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping]
+
+!!! warning
+
+ Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside,
+ they can change data in your database.
+
+## Syntax support { #DBR-onetl-connection-db-connection-clickhouse-sql-syntax-support }
+
+Only queries with the following syntax are supported:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+## Examples { #DBR-onetl-connection-db-connection-clickhouse-sql-examples }
+
+```python
+from onetl.connection import Clickhouse
+
+clickhouse = Clickhouse(...)
+df = clickhouse.sql(
+ """
+ SELECT
+ id,
+ key,
+ CAST(value AS String) value,
+ updated_at
+ FROM
+ some.mytable
+ WHERE
+ key = 'something'
+ """,
+ options=Clickhouse.SQLOptions(
+ partitionColumn="id",
+ numPartitions=10,
+ lowerBound=0,
+ upperBound=1000,
+ ),
+)
+```
+
+## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-sql-recommendations }
+
+### Select only required columns { #DBR-onetl-connection-db-connection-clickhouse-sql-select-only-required-columns }
+
+Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`.
+This reduces the amount of data passed from Clickhouse to Spark.
+
+### Pay attention to `where` value { #DBR-onetl-connection-db-connection-clickhouse-sql-pay-attention-to-where-value }
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause.
+This both reduces the amount of data sent from Clickhouse to Spark, and may also improve performance of the query.
+Especially if there are indexes or partitions for columns used in `where` clause.
+
+## Options { #DBR-onetl-connection-db-connection-clickhouse-sql-options }
+
+
+::: onetl.connection.db_connection.clickhouse.options.ClickhouseSQLOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/clickhouse/types.md b/mddocs/docs/connection/db_connection/clickhouse/types.md
new file mode 100644
index 000000000..f761e3f94
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/types.md
@@ -0,0 +1,350 @@
+# Clickhouse <-> Spark type mapping { #DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping }
+
+!!! note
+
+ The results below are valid for Spark 3.5.8, and may differ on other Spark versions.
+
+!!! note
+
+ It is recommended to use [spark-dialect-extension](https://github.com/MTSWebServices/spark-dialect-extension) package,
+ which implements writing Arrays from Spark to Clickhouse, fixes dropping fractions of seconds in `TimestampType`,
+ and fixes other type conversion issues.
+
+## Type detection & casting { #DBR-onetl-connection-db-connection-clickhouse-types-type-detection-casting }
+
+Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+### Reading from Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-types-reading-from-clickhouse }
+
+This is how Clickhouse connector performs this:
+
+- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Clickhouse type.
+- Find corresponding `Clickhouse type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Create DataFrame from query with specific column names and Spark types.
+
+### Writing to some existing Clickhouse table { #DBR-onetl-connection-db-connection-clickhouse-types-writing-to-some-existing-clickhouse-table }
+
+This is how Clickhouse connector performs this:
+
+- Get names of columns in DataFrame. [^1]
+- Perform `SELECT * FROM table LIMIT 0` query.
+- Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type.
+- **Find corresponding** `Clickhouse type (read)` → `Spark type` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [^2]
+- Find corresponding `Spark type` → `Clickhouse type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- If `Clickhouse type (write)` match `Clickhouse type (read)`, no additional casts will be performed, DataFrame column will be written to Clickhouse as is.
+- If `Clickhouse type (write)` does not match `Clickhouse type (read)`, DataFrame column will be casted to target column type **on Clickhouse side**. For example, you can write column with text data to `Int32` column, if column contains valid integer values within supported value range and precision.
+
+[^1]: This allows writing data to tables with `DEFAULT` columns - if DataFrame has no such column, it will be populated by Clickhouse.
+
+[^2]: Yes, this is weird.
+
+### Create new table using Spark { #DBR-onetl-connection-db-connection-clickhouse-types-create-new-table-using-spark }
+
+!!! warning
+
+ ABSOLUTELY NOT RECOMMENDED!
+
+This is how Clickhouse connector performs this:
+
+- Find corresponding `Spark type` → `Clickhouse type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Generate DDL for creating table in Clickhouse, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But Spark does not have specific dialect for Clickhouse, so Generic JDBC dialect is used.
+Generic dialect is using SQL ANSI type names while creating tables in target database, not database-specific types.
+
+In some cases this may lead to using wrong column type. For example, Spark creates column of type `TIMESTAMP`
+which corresponds to Clickhouse type `DateTime32` (precision up to seconds)
+instead of more precise `DateTime64` (precision up to nanoseconds).
+This may lead to incidental precision loss, or sometimes data cannot be written to created table at all.
+
+So instead of relying on Spark to create tables:
+
+??? "See example"
+
+ ```python
+ writer = DBWriter(
+ connection=clickhouse,
+ target="default.target_tbl",
+ options=Clickhouse.WriteOptions(
+ if_exists="append",
+ # ENGINE is required by Clickhouse
+ createTableOptions="ENGINE = MergeTree() ORDER BY id",
+ ),
+ )
+ writer.run(df)
+ ```
+
+Always prefer creating tables with specific types **BEFORE WRITING DATA**:
+
+??? "See example"
+
+ ```python
+ clickhouse.execute(
+ """
+ CREATE TABLE default.target_tbl (
+ id UInt8,
+ value DateTime64(6) -- specific type and precision
+ )
+ ENGINE = MergeTree()
+ ORDER BY id
+ """,
+ )
+
+ writer = DBWriter(
+ connection=clickhouse,
+ target="default.target_tbl",
+ options=Clickhouse.WriteOptions(if_exists="append"),
+ )
+ writer.run(df)
+ ```
+
+### References { #DBR-onetl-connection-db-connection-clickhouse-types-references }
+
+Here you can find source code with type conversions:
+
+- [Clickhouse -> JDBC](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L39-L176)
+- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307)
+- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164)
+- [JDBC -> Clickhouse](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L185-L311)
+
+## Supported types { #DBR-onetl-connection-db-connection-clickhouse-types-supported-types }
+
+See [official documentation](https://clickhouse.com/docs/en/sql-reference/data-types)
+
+### Generic types { #DBR-onetl-connection-db-connection-clickhouse-types-generic-types }
+
+- `LowCardinality(T)` is same as `T`
+- `Nullable(T)` is same as `T`, but Spark column is inferred as `nullable=True`
+
+### Numeric types { #DBR-onetl-connection-db-connection-clickhouse-types-numeric-types }
+
+| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) |
+|--------------------------------|-----------------------------------|-------------------------------|------------------------------|
+| `Bool` | `BooleanType()` | `Bool` | `UInt64` |
+| `Decimal` | `DecimalType(P=10, S=0)` | `Decimal(P=10, S=0)` | `Decimal(P=10, S=0)` |
+| `Decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` |
+| `Decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` |
+| `Decimal(P=39..76, S=0..76)` | unsupported [^3] | | |
+| `Decimal32(P=0..9)` | `DecimalType(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` |
+| `Decimal64(S=0..18)` | `DecimalType(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` |
+| `Decimal128(S=0..38)` | `DecimalType(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` |
+| `Decimal256(S=0..76)` | unsupported [^3] | | |
+| `Float32` | `FloatType()` | `Float32` | `Float32` |
+| `Float64` | `DoubleType()` | `Float64` | `Float64` |
+| `Int8` `Int16` `Int32` | `IntegerType()` | `Int32` | `Int32` |
+| `Int64` | `LongType()` | `Int64` | `Int64` |
+| `Int128` `Int256` | unsupported [^3] | | |
+| `-` | `ByteType()` | `Int8` | `Int8` |
+| `-` | `ShortType()` | `Int32` | `Int32` |
+| `UInt8` | `IntegerType()` | `Int32` | `Int32` |
+| `UInt16` | `LongType()` | `Int64` | `Int64` |
+| `UInt32` `UInt64` | `DecimalType(20,0)` | `Decimal(20,0)` | `Decimal(20,0)` |
+| `UInt128` `UInt256` | unsupported [^3] | | |
+
+[^3]: Clickhouse support numeric types up to 256 bit - `Int256`, `UInt256`, `Decimal256(S)`, `Decimal(P=39..76, S=0..76)`.
+
+ But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision,
+ this leads to an exception.
+
+### Temporal types { #DBR-onetl-connection-db-connection-clickhouse-types-temporal-types }
+
+Notes:
+
+- Datetime with timezone has the same precision as without timezone
+- `DateTime` is alias for `DateTime32`
+- `TIMESTAMP` is alias for `DateTime32`, but `TIMESTAMP(N)` is alias for `DateTime64(N)`
+
+| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) |
+|-----------------------------------|--------------------------------------|----------------------------------|-------------------------------|
+| `Date` | `DateType()` | `Date` | `Date` |
+| `Date32` | `DateType()` | `Date` | `Date`, **cannot insert data** [^4] |
+| `DateTime32`, seconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds |
+| `DateTime64(3)`, milliseconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds, **precision loss** [^5] |
+| `DateTime64(6)`, microseconds | `TimestampType()`, microseconds | | `DateTime32`, seconds, **precision loss** [^7] |
+| `DateTime64(7..9)`, nanoseconds | `TimestampType()`, microseconds, **precision loss** [^6] | | |
+| `-` | `TimestampNTZType()`, microseconds | | |
+| `DateTime32(TZ)` `DateTime64(P, TZ)` | unsupported [^7] | | |
+| `IntervalNanosecond` `IntervalMicrosecond` `IntervalMillisecond` `IntervalSecond` `IntervalMinute` `IntervalHour` `IntervalDay` `IntervalMonth` `IntervalQuarter` `IntervalWeek` `IntervalYear` | `LongType()` | `Int64` | `Int64` |
+
+!!! warning
+
+ Note that types in Clickhouse and Spark have different value ranges:
+
+ | Clickhouse type | Min value | Max value | Spark type | Min value | Max value |
+ |------------------------|-----------------------------------|-----------------------------------|---------------------|--------------------------------|--------------------------------|
+    | `Date`                | `1970-01-01`                      | `2149-06-06`                      | `DateType()` {: rowspan=3} | `0001-01-01 00:00:00.000000` {: rowspan=3} | `9999-12-31 23:59:59.999999` {: rowspan=3} |
+ | `DateTime64(P=0..8)` | `1900-01-01 00:00:00.00000000` | `2299-12-31 23:59:59.99999999` | {: style="padding:0"} | {: style="padding:0"} | {: style="padding:0"} |
+ | `DateTime64(P=9)` | `1900-01-01 00:00:00.000000000` | `2262-04-11 23:47:16.999999999` | {: style="padding:0"} | {: style="padding:0"} | {: style="padding:0"} |
+
+ So not all of values in Spark DataFrame can be written to Clickhouse.
+
+ References:
+
+ * [Clickhouse Date documentation](https://clickhouse.com/docs/en/sql-reference/data-types/date)
+ * [Clickhouse Datetime32 documentation](https://clickhouse.com/docs/en/sql-reference/data-types/datetime)
+ * [Clickhouse Datetime64 documentation](https://clickhouse.com/docs/en/sql-reference/data-types/datetime64)
+ * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html)
+ * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html)
+
+[^4]: `Date32` has different bytes representation than `Date`, and inserting value of type `Date32` to `Date` column
+ leads to errors on Clickhouse side, e.g. `Date(106617) should be between 0 and 65535 inclusive of both values`.
+ Although Spark does properly read the `Date32` column as `DateType()`, and there should be no difference at all.
+ Probably this is some bug in Clickhouse driver.
+
+[^5]: Generic JDBC dialect generates DDL with Clickhouse type `TIMESTAMP` which is alias for `DateTime32` with precision up to seconds (`23:59:59`).
+ Inserting data with milliseconds precision (`23:59:59.999`) will lead to **throwing away milliseconds**.
+ Solution: create table manually, with proper column type.
+
+[^6]: Clickhouse support datetime up to nanoseconds precision (`23:59:59.999999999`),
+ but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`).
+ Nanoseconds will be lost during read or write operations.
+ Solution: create table manually, with proper column type.
+
+[^7]: Clickhouse will raise an exception that data in format `2001-01-01 23:59:59.999999` has data `.999999` which does not match format `YYYY-MM-DD hh:mm:ss`
+ of `DateTime32` column type (see [^5]).
+ So Spark can create Clickhouse table, but cannot write data to column of this type.
+ Solution: create table manually, with proper column type.
+
+### String types { #DBR-onetl-connection-db-connection-clickhouse-types-string-types }
+
+| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) |
+|--------------------------------------|------------------|------------------------|--------------------------|
+| `FixedString(N)` `String` `Enum8` `Enum16` `IPv4` `IPv6` `UUID` | `StringType()` | `String` | `String` |
+| `-` | `BinaryType()` | | |
+
+## Unsupported types { #DBR-onetl-connection-db-connection-clickhouse-types-unsupported-types }
+
+Columns of these Clickhouse types cannot be read by Spark:
+
+- `AggregateFunction(func, T)`
+- `Array(T)`
+- `JSON`
+- `Map(K, V)`
+- `MultiPolygon`
+- `Nested(field1 T1, ...)`
+- `Nothing`
+- `Point`
+- `Polygon`
+- `Ring`
+- `SimpleAggregateFunction(func, T)`
+- `Tuple(T1, T2, ...)`
+
+Dataframe with these Spark types cannot be written to Clickhouse:
+
+- `ArrayType(T)`
+- `BinaryType()`
+- `CharType(N)`
+- `DayTimeIntervalType(P, S)`
+- `MapType(K, V)`
+- `NullType()`
+- `StructType([...])`
+- `TimestampNTZType()`
+- `VarcharType(N)`
+
+This is because Spark does not have dedicated Clickhouse dialect, and uses Generic JDBC dialect instead.
+This dialect does not have type conversion between some types, like Clickhouse `Array` -> Spark `ArrayType()`, and vice versa.
+
+There is a way to avoid this - just cast everything to `String`.
+
+## Explicit type cast { #DBR-onetl-connection-db-connection-clickhouse-types-explicit-type-cast }
+
+### `DBReader` { #DBR-onetl-connection-db-connection-clickhouse-types-dbreader }
+
+Use `CAST` or `toJSONString` to get column data as string in JSON format.
+
+For parsing JSON columns in ClickHouse, use the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method.
+
+```python
+from pyspark.sql.types import ArrayType, IntegerType
+
+from onetl.file.format import JSON
+from onetl.connection import ClickHouse
+from onetl.db import DBReader
+
+reader = DBReader(
+ connection=clickhouse,
+    source="default.source_tbl",
+ columns=[
+ "id",
+ "toJSONString(array_column) array_column",
+ ],
+)
+df = reader.run()
+
+# Spark requires all columns to have some specific type, describe it
+column_type = ArrayType(IntegerType())
+
+json = JSON()
+df = df.select(
+ df.id,
+ json.parse_column("array_column", column_type),
+)
+```
+
+### `DBWriter` { #DBR-onetl-connection-db-connection-clickhouse-types-dbwriter }
+
+For writing JSON data to ClickHouse, use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method to convert a DataFrame column to JSON format efficiently and write it as a `String` column in Clickhouse.
+
+```python
+from onetl.file.format import JSON
+from onetl.connection import ClickHouse
+from onetl.db import DBWriter
+
+clickhouse = ClickHouse(...)
+
+clickhouse.execute(
+ """
+ CREATE TABLE default.target_tbl (
+ id Int32,
+        array_column_json String
+ )
+ ENGINE = MergeTree()
+ ORDER BY id
+ """,
+)
+
+json = JSON()
+df = df.select(
+    df.id,
+    json.serialize_column(df.array_column).alias("array_column_json"),
+)
+
+writer = DBWriter(
+    connection=clickhouse,
+    target="default.target_tbl",
+    options=ClickHouse.WriteOptions(if_exists="append"),
+)
+writer.run(df)
+```
+
+Then you can parse this column on Clickhouse side - for example, by creating a view:
+
+```sql
+SELECT
+ id,
+    JSONExtract(array_column_json, 'Array(String)') AS array_column
+FROM default.target_tbl
+```
+
+You can also use [ALIAS](https://clickhouse.com/docs/en/sql-reference/statements/create/table#alias)
+or [MATERIALIZED](https://clickhouse.com/docs/en/sql-reference/statements/create/table#materialized) columns
+to avoid writing such expression in every `SELECT` clause all the time:
+
+```sql
+CREATE TABLE default.target_tbl (
+ id Int32,
+ array_column_json String,
+ -- computed column
+    array_column Array(String) ALIAS JSONExtract(array_column_json, 'Array(String)')
+    -- or materialized column
+    -- array_column Array(String) MATERIALIZED JSONExtract(array_column_json, 'Array(String)')
+)
+ENGINE = MergeTree()
+ORDER BY id
+```
+
+Downsides:
+
+- Using `SELECT JSONExtract(...)` or `ALIAS` column can be expensive, because value is calculated on every row access. This can be especially harmful if such column is used in `WHERE` clause.
+- `ALIAS` and `MATERIALIZED` columns are not included in `SELECT *` clause, they should be added explicitly: `SELECT *, calculated_column FROM table`.
+
+!!! warning
+
+ [EPHEMERAL](https://clickhouse.com/docs/en/sql-reference/statements/create/table#ephemeral) columns are not supported by Spark
+ because they cannot be selected to determine target column type.
diff --git a/mddocs/docs/connection/db_connection/clickhouse/write.md b/mddocs/docs/connection/db_connection/clickhouse/write.md
new file mode 100644
index 000000000..7a86f9b90
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/clickhouse/write.md
@@ -0,0 +1,50 @@
+# Writing to Clickhouse using `DBWriter` { #DBR-onetl-connection-db-connection-clickhouse-write-writing-to-clickhouse-using-dbwriter }
+
+For writing data to Clickhouse, use [DBWriter][DBR-onetl-db-writer].
+
+!!! warning
+
+ Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping]
+
+
+!!! warning
+
+ It is always recommended to create table explicitly using [Clickhouse.execute][DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse]
+ instead of relying on Spark's table DDL generation.
+
+ This is because Spark's DDL generator can create columns with different precision and types than it is expected,
+ causing precision loss or other issues.
+
+## Examples { #DBR-onetl-connection-db-connection-clickhouse-write-examples }
+
+```python
+from onetl.connection import Clickhouse
+from onetl.db import DBWriter
+
+clickhouse = Clickhouse(...)
+
+df = ... # data is here
+
+writer = DBWriter(
+ connection=clickhouse,
+ target="schema.table",
+ options=Clickhouse.WriteOptions(
+ if_exists="append",
+ # ENGINE is required by Clickhouse
+ createTableOptions="ENGINE = MergeTree() ORDER BY id",
+ ),
+)
+
+writer.run(df)
+```
+
+## Options { #DBR-onetl-connection-db-connection-clickhouse-write-options }
+
+Method above accepts [Clickhouse.WriteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions]
+
+
+::: onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/greenplum/connection.md b/mddocs/docs/connection/db_connection/greenplum/connection.md
new file mode 100644
index 000000000..8f6af6b43
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/connection.md
@@ -0,0 +1,8 @@
+# Greenplum connection { #DBR-onetl-connection-db-connection-greenplum-connection-0 }
+
+
+::: onetl.connection.db_connection.greenplum.connection.Greenplum
+ options:
+ members:
+ - get_packages
+ - check
diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md
new file mode 100644
index 000000000..8205daf46
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/execute.md
@@ -0,0 +1,140 @@
+# Executing statements in Greenplum { #DBR-onetl-connection-db-connection-greenplum-execute-executing-statements-in-greenplum }
+
+!!! warning
+
+ Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame.
+
+ Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader] instead.
+
+## How to { #DBR-onetl-connection-db-connection-greenplum-execute-how-to }
+
+There are 2 ways to execute some statement in Greenplum:
+
+### Use `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-use-greenplum-fetch }
+
+Use this method to perform some `SELECT` query which returns **small number of rows**, like reading
+Greenplum config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts [Greenplum.FetchOptions][onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions].
+
+
+!!! warning
+
+ `Greenplum.fetch` is implemented using Postgres JDBC connection, so types are handled a bit differently than in `DBReader`. See [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping].
+
+#### Syntax support in `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-syntax-support-in-greenplum-fetch }
+
+This method supports **any** query syntax supported by Greenplum, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-fetch }
+
+```python
+from onetl.connection import Greenplum
+
+greenplum = Greenplum(...)
+
+df = greenplum.fetch(
+ "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+ options=Greenplum.FetchOptions(queryTimeout=10),
+)
+greenplum.close()
+value = df.collect()[0][0] # get value from first row and first column
+
+```
+
+### Use `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-use-greenplum-execute }
+
+Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it.
+
+Method accepts [Greenplum.ExecuteOptions][onetl.connection.db_connection.greenplum.options.GreenplumExecuteOptions].
+
+
+#### Syntax support in `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-syntax-support-in-greenplum-execute }
+
+This method supports **any** query syntax supported by Greenplum, like:
+
+- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on
+- ✅︎ `ALTER ...`
+- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on
+- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on
+- ✅︎ `CALL procedure(arg1, arg2) ...`
+- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions
+- ✅︎ other statements not mentioned here
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-execute }
+
+```python
+from onetl.connection import Greenplum
+
+greenplum = Greenplum(...)
+
+greenplum.execute("DROP TABLE schema.table")
+greenplum.execute(
+ """
+ CREATE TABLE schema.table (
+ id int,
+ key text,
+ value real
+ )
+    DISTRIBUTED BY (id)
+ """,
+ options=Greenplum.ExecuteOptions(queryTimeout=10),
+)
+```
+
+## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-execute-interaction-schema }
+
+Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node,
+without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case.
+
+The only port used while interacting with Greenplum in this case is `5432` (Greenplum master port).
+
+??? note "Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch()"
+
+ ```mermaid
+ ---
+ title: Greenplum master <—> Spark driver
+ ---
+
+ sequenceDiagram
+ box Spark
+ participant A as Spark driver
+ end
+ box Greenplum
+ participant B as Greenplum master
+ end
+
+ Note over A,B: == Greenplum.check() ==
+
+ A->>B: CONNECT
+
+ Note over A,B: == Greenplum.execute(statement) ==
+
+ A-->>B: EXECUTE statement
+ B-->> A: RETURN result
+
+ Note over A,B: == Greenplum.close() ==
+
+ A ->> B: CLOSE CONNECTION
+ ```
+
+## Options { #DBR-onetl-connection-db-connection-greenplum-execute-options }
+
+
+::: onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
+
+::: onetl.connection.db_connection.greenplum.options.GreenplumExecuteOptions
+ options:
+ inherited_members: true
+ heading_level: 3
+ show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/greenplum/index.md b/mddocs/docs/connection/db_connection/greenplum/index.md
new file mode 100644
index 000000000..741751517
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/index.md
@@ -0,0 +1,16 @@
+# Greenplum { #DBR-onetl-connection-db-connection-greenplum }
+
+## Connection { #DBR-onetl-connection-db-connection-greenplum-connection-1 }
+
+* [Prerequisites][DBR-onetl-connection-db-connection-greenplum-prerequisites]
+* [Greenplum connection][DBR-onetl-connection-db-connection-greenplum-connection-0]
+
+## Operations { #DBR-onetl-connection-db-connection-greenplum-operations }
+
+* [Reading from Greenplum using `DBReader`][DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader]
+* [Writing to Greenplum using `DBWriter`][DBR-onetl-connection-db-connection-greenplum-write-writing-to-greenplum-using-dbwriter]
+* [Executing statements in Greenplum][DBR-onetl-connection-db-connection-greenplum-execute-executing-statements-in-greenplum]
+
+## Troubleshooting { #DBR-onetl-connection-db-connection-greenplum-troubleshooting }
+
+* [Greenplum <-> Spark type mapping][DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping]
diff --git a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md
new file mode 100644
index 000000000..511b43c77
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md
@@ -0,0 +1,434 @@
+# Prerequisites { #DBR-onetl-connection-db-connection-greenplum-prerequisites }
+
+## Version Compatibility { #DBR-onetl-connection-db-connection-greenplum-prerequisites-version-compatibility }
+
+- Greenplum server versions:
+ - Officially declared: 5.x, 6.x, and 7.x (which requires `Greenplum.get_packages(package_version="2.3.0")` or higher)
+ - Actually tested: 6.23, 7.0
+- Spark versions: 3.2.x (Spark 3.3+ is not supported yet)
+- Java versions: 8 - 11
+
+See [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.2/greenplum-connector-spark/release_notes.html).
+
+## Installing PySpark { #DBR-onetl-connection-db-connection-greenplum-prerequisites-installing-pyspark }
+
+To use Greenplum connector you should have PySpark installed (or injected to `sys.path`)
+BEFORE creating the connector instance.
+
+See [installation instruction][DBR-onetl-install-spark] for more details.
+
+## Download VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-download-vmware-package }
+
+To use Greenplum connector you should download connector `.jar` file from
+[VMware website](https://network.tanzu.vmware.com/products/vmware-greenplum#/releases/1413479/file_groups/16966)
+and then pass it to Spark session.
+
+!!! warning
+
+ Please pay attention to [Spark & Scala version compatibility][DBR-onetl-install-spark-compatibility-matrix].
+
+!!! warning
+
+    There are issues with using the package of version 2.3.0/2.3.1 with Greenplum 6.x - the connector can
+    open a transaction with a `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks
+    during write.
+
+There are several ways to do that. See [install Java packages][DBR-onetl-install-spark-injecting-java-packages] for details.
+
+!!! note
+
+ If you're uploading package to private package repo, use `groupId=io.pivotal` and `artifactoryId=greenplum-spark_2.12`
+ (`2.12` is Scala version) to give uploaded package a proper name.
+
+## Interaction Spark ↔ Greenplum { #DBR-onetl-connection-db-connection-greenplum-prerequisites-interaction-spark-greenplum }
+
+This connector is **very** different from regular Postgres connector.
+
+Postgres connector connects directly to Postgres host via JDBC driver:
+
+- Spark driver → Postgres host (get query column names and types, create target table)
+- Spark executors → Postgres host (send/fetch actual data)
+
+Data should **NEVER** be sent via the Greenplum master (coordinator) using the regular Postgres connector, as it's very easy to overload the coordinator
+by sending hundreds and thousands of gigabytes of data.
+
+Instead, Greenplum connector uses [gpfdist protocol](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-external-g-using-the-greenplum-parallel-file-server--gpfdist-.html#about-gpfdist-setup-and-performance-1) with a bit complicated schema:
+
+- Spark driver → Greenplum master (get query column names and types, create target table)
+- Spark executors → Greenplum master (create [EXTERNAL TABLEs](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html))
+- Greenplum segments → Spark executors (send/fetch actual data via `EXTERNAL TABLE`)
+
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/overview.html).
+
+## Configuring the connector { #DBR-onetl-connection-db-connection-greenplum-prerequisites-configuring-the-connector }
+
+Each Spark executor starts a `gpfdist` server, and each Greenplum **segment** connects to this server.
+The Greenplum segment should know the server's IP address/hostname and a port number.
+
+This target IP and port range should be added to firewall `ALLOW` rule on Spark host/cluster with sourceIP = Greenplum network.
+Otherwise connection cannot be established.
+
+More details can be found in official documentation:
+
+- [port requirements](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/sys_reqs.html#network-port-requirements)
+- [format of server.port value](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.port)
+- [port troubleshooting](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#port-errors)
+
+### spark.master=local { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masterlocal }
+
+#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-host-0 }
+
+By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment.
+On some hosts it works as-is, without any additional configuration. On others it does not.
+
+The most common error is that Greenplum segment receives `127.0.0.1` IP address (loopback interface).
+This is usually caused by `/etc/hosts` content like this:
+
+```text
+127.0.0.1 localhost real-host-name
+```
+
+```bash
+$ hostname -f
+localhost
+
+$ hostname -i
+127.0.0.1
+```
+
+Reading/writing data to Greenplum will fail with following exception:
+
+```text
+org.postgresql.util.PSQLException: ERROR: connection with gpfdist failed for
+"gpfdist://127.0.0.1:49152/local-1709739764667/exec/driver",
+effective url: "http://127.0.0.1:49152/local-1709739764667/exec/driver":
+error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=123456)
+```
+
+There are 2 ways to fix that:
+
+- Explicitly pass your host IP address to connector, like this:
+
+ ```python
+ import os
+
+ # host IP, accessible from GP segments
+ os.environ["SPARK_LOCAL_IP"] = "192.168.1.1"
+
+ # !!!SET IP BEFORE CREATING SPARK SESSION!!!
+ spark = ...
+
+ greenplum = Greenplum(
+ ...,
+ extra={
+ # connector will read IP from this environment variable
+ "server.hostEnv": "env.SPARK_LOCAL_IP",
+ },
+ spark=spark,
+ )
+ ```
+
+ More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.hostenv).
+
+- Update `/etc/hosts` file to include real host IP:
+
+ ```text
+ 127.0.0.1 localhost
+ # this IP should be accessible from GP segments
+ 192.168.1.1 real-host-name
+ ```
+
+ This requires root privileges on host, not everyone can do this.
+ Also this doesn't work with dynamic IP addresses.
+
+#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-port-0 }
+
+By default, Spark executors can start `gpfdist` server on *any* random port number.
+You can limit port range using `extra` option:
+
+```python
+greenplum = Greenplum(
+ ...,
+ extra={
+ "server.port": "41000-42000", # !!! JUST AN EXAMPLE !!!
+ },
+)
+```
+
+Number of ports in this range should be at least `number of parallel running Spark sessions on host` * `number of executors per session`.
+
+### spark.master=yarn { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masteryarn }
+
+#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-host-1 }
+
+By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment.
+Usually there are no issues with that, connector just works as-is, without any adjustments.
+
+The most common error is that Greenplum segment receives `127.0.0.1` IP address (loopback interface)
+instead of external IP of Hadoop data/compute node. There are 3 ways to fix it:
+
+- Pass node hostname instead of IP address to Greenplum segment:
+
+ ```python
+ greenplum = Greenplum(
+ ...,
+ extra={
+ "server.useHostname": "true",
+ },
+ )
+ ```
+
+ This may require configuring DNS on each Greenplum segment to properly resolve Hadoop node hostname → some IP.
+
+ More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.usehostname).
+
+- Set network interface name to get IP address from:
+
+ ```python
+ greenplum = Greenplum(
+ ...,
+ extra={
+ "server.nic": "eth0",
+ },
+ )
+ ```
+
+  You can get the list of network interfaces using the following command.
+
+!!! note
+
+ This command should be executed on Hadoop cluster node, **not** Spark driver host!
+
+ ```bash
+ $ ip address
+ 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
+ inet 127.0.0.1/8 scope host lo
+ valid_lft forever preferred_lft forever
+ 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000
+ inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0
+ valid_lft 83457sec preferred_lft 83457sec
+ ```
+
+  Note that in this case **each** Hadoop cluster node should have a network interface named `eth0`,
+  which may not be the case.
+
+ More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.nic).
+
+- Update `/etc/hosts` on each Hadoop cluster node to include its IP address:
+
+ ```text
+ 127.0.0.1 localhost
+ # this IP should be accessible from GP segments
+ 192.168.1.1 real-host-name
+ ```
+
+ This requires root privileges on host, not everyone can do this.
+ Also this doesn't work with dynamic IP addresses.
+
+#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-port-1 }
+
+By default, Spark executors can start `gpfdist` server on *any* random port number.
+You can limit port range using `extra` option:
+
+```python
+greenplum = Greenplum(
+ ...,
+ extra={
+ "server.port": "41000-42000", # !!! JUST AN EXAMPLE !!!
+ },
+)
+```
+
+Number of ports in this range should be at least `number of parallel running Spark sessions per node` * `number of executors per session` / `number of Hadoop nodes`.
+
+### spark.master=k8s { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masterk8s }
+
+Before starting the Spark session, you should create a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object:
+
+```yaml title="ingress.yaml"
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ name: gpfdist-ingress
+ namespace: mynamespace
+ annotations:
+ nginx.ingress.kubernetes.io/ssl-redirect: "false"
+ nginx.ingress.kubernetes.io/force-ssl-redirect: "false"
+spec:
+ rules:
+ - http:
+ paths:
+ - path: /
+ pathType: Prefix
+ backend:
+ service:
+ name: gpfdist-default
+ port:
+ number: 50000
+
+## RETURNED FROM K8S API RESPONSE ##
+# status:
+# loadBalancer:
+# ingress:
+# - ip: 11.22.33.44
+```
+
+Then add special Spark listener to Spark session config, and specify ingress' load balancer IP or domain name with a port number:
+
+```python
+spark = (
+ SparkSession.builder.config("spark.master", "k8s://...")
+ .config("spark.extraListeners", "org.greenplum.GpfdistIngressListener")
+ .config("spark.kubernetes.namespace", "mynamespace")
+ .config("spark.greenplum.k8s.ingress.name", "gpfdist-ingress") # ingress name
+ .config("spark.greenplum.gpfdist.host", "11.22.33.44") # ingress IP/domain name
+ .config("spark.greenplum.gpfdist.listen-port", "50000") # ingress port
+ .config(
+ "spark.greenplum.gpfdist.is-ssl", "false"
+ ) # true for ingress with TLS enabled
+).getOrCreate()
+```
+
+Set fixed port for `gpfdist` server to listen on:
+
+```python
+greenplum = Greenplum(
+ ...,
+ extra={
+ "server.port": "50000", # should match ingress port
+ },
+)
+```
+
+## Set number of connections { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-number-of-connections }
+
+!!! warning
+
+ This is very important!!!
+
+    If you don't limit number of connections, you can exceed the [max_connections](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections-2)
+    limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max,
+ depending on your Greenplum instance settings and using connection balancers like `pgbouncer`.
+
+ Consuming all available connections means **nobody** (even admin users) can connect to Greenplum!
+
+Each task running on the Spark executor makes its own connection to Greenplum master node.
+To avoid opening too many connections to Greenplum master (coordinator), you should limit number of tasks.
+
+- Reading about `5-10Gb` of data requires about `3-5` parallel connections.
+- Reading about `20-30Gb` of data requires about `5-10` parallel connections.
+- Reading about `50Gb` of data requires ~ `10-20` parallel connections.
+- Reading about `100+Gb` of data requires `20-30` parallel connections.
+- Opening more than `30-50` connections is not recommended.
+
+Max number of parallel tasks is `N executors * N cores-per-executor`, so this can be adjusted using Spark session configuration:
+
+=== "Spark with master=local"
+
+ ```python
+ spark = (
+ SparkSession.builder
+ # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks
+ .config("spark.master", "local[5]")
+ ).getOrCreate()
+
+ # Set connection pool size AT LEAST to number of executors + 1 for driver
+ Greenplum(
+ ...,
+ extra={
+ "pool.maxSize": 6, # 5 executors + 1 driver
+ },
+ )
+ ```
+
+=== "Spark with master=yarn or master=k8s, dynamic allocation"
+
+ ```python
+ spark = (
+ SparkSession.builder
+ .config("spark.master", "yarn")
+ # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10
+ .config("spark.dynamicAllocation.maxExecutors", 10)
+ .config("spark.executor.cores", 1)
+ ).getOrCreate()
+ ```
+
+=== "Spark with master=yarn or master=k8s, static allocation"
+
+ ```python
+ spark = (
+ SparkSession.builder
+ .config("spark.master", "yarn")
+ # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10
+ .config("spark.executor.instances", 10)
+ .config("spark.executor.cores", 1)
+ ).getOrCreate()
+ ```
+
+See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool)
+documentation.
+
+## Greenplum side adjustments { #DBR-onetl-connection-db-connection-greenplum-prerequisites-greenplum-side-adjustments }
+
+### Allow connecting to Greenplum master { #DBR-onetl-connection-db-connection-greenplum-prerequisites-allow-connecting-to-greenplum-master }
+
+Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master (coordinator),
+e.g. by updating `pg_hba.conf` file.
+
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#allowing-connections-to-greenplum-database-0).
+
+### Provide required grants { #DBR-onetl-connection-db-connection-greenplum-prerequisites-provide-required-grants }
+
+Ask your Greenplum cluster administrator to set following grants for a user:
+
+=== "Read + Write"
+
+ ```sql
+ -- get access to get tables metadata & cluster information
+ GRANT SELECT ON information_schema.tables TO username;
+ GRANT SELECT ON pg_attribute TO username;
+ GRANT SELECT ON pg_class TO username;
+ GRANT SELECT ON pg_namespace TO username;
+ GRANT SELECT ON pg_settings TO username;
+ GRANT SELECT ON pg_stats TO username;
+ GRANT SELECT ON gp_distributed_xacts TO username;
+ GRANT SELECT ON gp_segment_configuration TO username;
+ -- Greenplum 5.x only
+ GRANT SELECT ON gp_distribution_policy TO username;
+
+ -- allow creating external tables in the same schema as source/target table
+ GRANT USAGE ON SCHEMA myschema TO username;
+ GRANT CREATE ON SCHEMA myschema TO username;
+ ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+ -- allow read access to specific table (to get column types)
+ -- allow write access to specific table
+ GRANT SELECT, INSERT ON myschema.mytable TO username;
+ ```
+
+=== "Read only"
+
+ ```sql
+ -- get access to get tables metadata & cluster information
+ GRANT SELECT ON information_schema.tables TO username;
+ GRANT SELECT ON pg_attribute TO username;
+ GRANT SELECT ON pg_class TO username;
+ GRANT SELECT ON pg_namespace TO username;
+ GRANT SELECT ON pg_settings TO username;
+ GRANT SELECT ON pg_stats TO username;
+ GRANT SELECT ON gp_distributed_xacts TO username;
+ GRANT SELECT ON gp_segment_configuration TO username;
+ -- Greenplum 5.x only
+ GRANT SELECT ON gp_distribution_policy TO username;
+
+ -- allow creating external tables in the same schema as source table
+ GRANT USAGE ON SCHEMA schema_to_read TO username;
+ GRANT CREATE ON SCHEMA schema_to_read TO username;
+ -- yes, `writable` for reading from GP, because data is written from Greenplum to Spark executor.
+ ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist');
+
+ -- allow read access to specific table
+ GRANT SELECT ON schema_to_read.table_to_read TO username;
+ ```
+
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/install_cfg.html#role-privileges).
diff --git a/mddocs/docs/connection/db_connection/greenplum/read.md b/mddocs/docs/connection/db_connection/greenplum/read.md
new file mode 100644
index 000000000..1c7501542
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/read.md
@@ -0,0 +1,359 @@
+# Reading from Greenplum using `DBReader` { #DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader }
+
+Data can be read from Greenplum to Spark using [DBReader][DBR-onetl-db-reader]. It also supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading.
+
+!!! warning
+
+ Please take into account [Greenplum types][DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping].
+
+!!! note
+
+    Unlike JDBC connectors, *Greenplum connector for Spark* does not support
+    executing **custom** SQL queries using the `.sql` method. The connector can only be used to read data from a table or view.
+
+## Supported DBReader features { #DBR-onetl-connection-db-connection-greenplum-read-supported-dbreader-features }
+
+- ✅︎ `columns` (see note below)
+- ✅︎ `where` (see note below)
+- ✅︎ `hwm` (see note below), supported strategies:
+ - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy]
+  - ✅︎ [Incremental strategy][DBR-onetl-strategy-incremental-strategy]
+ - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+ - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+- ❌ `hint` (is not supported by Greenplum)
+- ❌ `df_schema`
+- ✅︎ `options` (see [Greenplum.ReadOptions][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions])
+
+!!! warning
+
+ In case of Greenplum connector, `DBReader` does not generate raw `SELECT` query. Instead it relies on Spark SQL syntax
+ which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL.
+
+ So `columns`, `where` and `hwm.expression` should be specified in [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-syntax.html) syntax,
+ not Greenplum SQL.
+
+ This is OK:
+
+ ```python
+ DBReader(
+ columns=[
+ "some_column",
+ # this cast is executed on Spark side
+ "CAST(another_column AS STRING)",
+ ],
+ # this predicate is parsed by Spark, and can be pushed down to Greenplum
+ where="some_column LIKE 'val1%'",
+ )
+ ```
+
+    This will fail:
+
+ ```python
+ DBReader(
+ columns=[
+ "some_column",
+ # Spark does not have `text` type
+ "CAST(another_column AS text)",
+ ],
+ # Spark does not support ~ syntax for regexp matching
+ where="some_column ~ 'val1.*'",
+ )
+ ```
+
+## Examples { #DBR-onetl-connection-db-connection-greenplum-read-examples }
+
+Snapshot strategy:
+
+```python
+from onetl.connection import Greenplum
+from onetl.db import DBReader
+
+greenplum = Greenplum(...)
+
+reader = DBReader(
+ connection=greenplum,
+ source="schema.table",
+ columns=["id", "key", "CAST(value AS string) value", "updated_dt"],
+ where="key = 'something'",
+)
+df = reader.run()
+```
+
+Incremental strategy:
+
+```python
+from onetl.connection import Greenplum
+from onetl.db import DBReader
+from onetl.strategy import IncrementalStrategy
+
+greenplum = Greenplum(...)
+
+reader = DBReader(
+ connection=greenplum,
+ source="schema.table",
+ columns=["id", "key", "CAST(value AS string) value", "updated_dt"],
+ where="key = 'something'",
+ hwm=DBReader.AutoDetectHWM(name="greenplum_hwm", expression="updated_dt"),
+)
+
+with IncrementalStrategy():
+ df = reader.run()
+```
+
+## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-read-interaction-schema }
+
+High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection-db-connection-greenplum-prerequisites]. You can find detailed interaction schema below.
+
+??? note "Spark <-> Greenplum interaction during DBReader.run()"
+
+ ```mermaid
+ ---
+ title: Greenplum master <-> Spark driver
+ ---
+
+ sequenceDiagram
+ box "Spark"
+ participant A as "Spark driver"
+ participant B as "Spark executor1"
+ participant C as "Spark executor2"
+ participant D as "Spark executorN"
+ end
+
+ box "Greenplum"
+ participant E as "Greenplum master"
+ participant F as "Greenplum segment1"
+ participant G as "Greenplum segment2"
+ participant H as "Greenplum segmentN"
+ end
+
+ note over A,H: == Greenplum.check() ==
+
+ activate A
+ activate E
+ A ->> E: CONNECT
+
+ A -->> E : CHECK IF TABLE EXISTS gp_table
+ E -->> A : TABLE EXISTS
+ A ->> E : SHOW SCHEMA FOR gp_table
+ E -->> A : (id bigint, col1 int, col2 text, ...)
+
+ note over A,H: == DBReader.run() ==
+
+ A ->> B: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1
+ A ->> C: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2
+ A ->> D: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N
+
+        note right of A : This is done in parallel, executors are independent
+ B ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port INSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1
+        note right of E : Each white vertical line here is an open connection to master. Usually, **N+1** connections are created from Spark to Greenplum master
+ activate E
+ E -->> F: SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1
+ note right of F : No direct requests between Greenplum segments & Spark driver. Data transfer is always initiated by Greenplum segments.
+
+
+ C ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port INSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2
+ activate E
+ E -->> G: SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2
+
+ D ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port INSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N
+ activate E
+ E -->> H: SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN
+
+ F -xB: INITIALIZE CONNECTION TO Spark executor1 PUSH DATA TO Spark executor1
+ note left of B : Circle is an open GPFDIST port, listened by executor
+
+ G -xC: INITIALIZE CONNECTION TO Spark executor2 PUSH DATA TO Spark executor2
+ H -xD: INITIALIZE CONNECTION TO Spark executorN PUSH DATA TO Spark executorN
+
+ note over A,H: == Spark.stop() ==
+
+ B -->> E : DROP TABLE spark_executor1
+ deactivate E
+ C -->> E : DROP TABLE spark_executor2
+ deactivate E
+ D -->> E : DROP TABLE spark_executorN
+ deactivate E
+
+ B -->> A: DONE
+ C -->> A: DONE
+ D -->> A: DONE
+
+ A -->> E : CLOSE CONNECTION
+ deactivate E
+ deactivate A
+ ```
+
+## Recommendations { #DBR-onetl-connection-db-connection-greenplum-read-recommendations }
+
+### Select only required columns { #DBR-onetl-connection-db-connection-greenplum-read-select-only-required-columns }
+
+Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Greenplum to Spark.
+
+### Pay attention to `where` value { #DBR-onetl-connection-db-connection-greenplum-read-pay-attention-to-where-value }
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. This both reduces the amount of data send from Greenplum to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in `where` clause.
+
+### Read data in parallel { #DBR-onetl-connection-db-connection-greenplum-read-data-in-parallel }
+
+`DBReader` in case of Greenplum connector requires view or table to have a column which is used by Spark for parallel reads.
+
+Choosing proper column allows each Spark executor to read only part of data stored in the specified segment, avoiding moving large amounts of data between segments, which improves reading performance.
+
+#### Using `gp_segment_id` { #DBR-onetl-connection-db-connection-greenplum-read-using-gp-segment-id }
+
+By default, `DBReader` will use [gp_segment_id](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#reading-from-a-view) column for parallel data reading. Each DataFrame partition will contain data of a specific Greenplum segment.
+
+This allows each Spark executor read only data from specific Greenplum segment, avoiding moving large amounts of data between segments.
+
+If view is used, it is recommended to include `gp_segment_id` column to this view:
+
+??? note "Reading from view with gp_segment_id column"
+
+ ```python
+ from onetl.connection import Greenplum
+ from onetl.db import DBReader
+
+ greenplum = Greenplum(...)
+
+ greenplum.execute(
+ """
+ CREATE VIEW schema.view_with_gp_segment_id AS
+ SELECT
+ id,
+ some_column,
+ another_column,
+ gp_segment_id -- IMPORTANT
+ FROM schema.some_table
+ """,
+ )
+
+ reader = DBReader(
+ connection=greenplum,
+ source="schema.view_with_gp_segment_id",
+ )
+ df = reader.run()
+ ```
+
+#### Using custom `partition_column` { #DBR-onetl-connection-db-connection-greenplum-read-using-custom-partition-column }
+
+Sometimes a table or view lacks the `gp_segment_id` column, but there is some column
+with a value range correlated with the Greenplum segment distribution.
+
+In this case, custom column can be used instead:
+
+??? note "Reading from view with custom partition_column"
+
+ ```python
+ from onetl.connection import Greenplum
+ from onetl.db import DBReader
+
+ greenplum = Greenplum(...)
+
+ greenplum.execute(
+ """
+ CREATE VIEW schema.view_with_partition_column AS
+ SELECT
+ id,
+ some_column,
+ part_column -- correlated to greenplum segment ID
+ FROM schema.some_table
+ """,
+ )
+
+ reader = DBReader(
+ connection=greenplum,
+ source="schema.view_with_partition_column",
+ options=Greenplum.ReadOptions(
+ # parallelize data using specified column
+ partitionColumn="part_column",
+ # create 10 Spark tasks, each will read only part of table data
+ partitions=10,
+ ),
+ )
+ df = reader.run()
+ ```
+
+#### Reading `DISTRIBUTED REPLICATED` tables { #DBR-onetl-connection-db-connection-greenplum-read-reading-distributed-replicated-tables }
+
+Replicated tables do not have `gp_segment_id` column at all, so you need to set `partition_column` to some column name of type integer/bigint/smallint.
+
+### Parallel `JOIN` execution { #DBR-onetl-connection-db-connection-greenplum-read-parallel-join-execution }
+
+In case of using views which require some data motion between Greenplum segments, like `JOIN` queries, another approach should be used.
+
+Each Spark executor N will run the same query, so each of N query will start its own JOIN process, leading to really heavy load on Greenplum segments.
+
+**This should be avoided**.
+
+Instead it is recommended to run the `JOIN` query on the Greenplum side, save the result to an intermediate table, and then read this table using `DBReader`:
+
+??? note "Reading from view using intermediate table"
+
+ ```python
+ from onetl.connection import Greenplum
+ from onetl.db import DBReader
+
+ greenplum = Greenplum(...)
+
+ greenplum.execute(
+ """
+ CREATE UNLOGGED TABLE schema.intermediate_table AS
+ SELECT
+ id,
+ tbl1.col1,
+ tbl1.data,
+ tbl2.another_data
+ FROM
+ schema.table1 as tbl1
+ JOIN
+ schema.table2 as tbl2
+ ON
+ tbl1.col1 = tbl2.col2
+ WHERE ...
+ """,
+ )
+
+ reader = DBReader(
+ connection=greenplum,
+ source="schema.intermediate_table",
+ )
+ df = reader.run()
+
+    # write dataframe somewhere
+
+ greenplum.execute(
+ """
+ DROP TABLE schema.intermediate_table
+ """,
+ )
+ ```
+
+!!! warning
+
+ **NEVER** do that:
+
+ ```python
+ df1 = DBReader(connection=greenplum, target="public.table1", ...).run()
+ df2 = DBReader(connection=greenplum, target="public.table2", ...).run()
+
+ joined_df = df1.join(df2, on="col")
+ ```
+
+ This will lead to sending all the data from both `table1` and `table2` to Spark executor memory, and then `JOIN`
+ will be performed on Spark side, not inside Greenplum. This is **VERY** inefficient.
+
+#### `TEMPORARY` tables notice { #DBR-onetl-connection-db-connection-greenplum-read-temporary-tables-notice }
+
+Someone could think that writing data from view or result of `JOIN` to `TEMPORARY` table, and then passing it to `DBReader`, is an efficient way to read data from Greenplum. This is because temp tables are not generating WAL files, and are automatically deleted after finishing the transaction.
+
+That will **NOT** work. Each Spark executor establishes its own connection to Greenplum. And each connection starts its own transaction which means that every executor will read empty temporary table.
+
+You should use [UNLOGGED](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) tables to write data to intermediate table without generating WAL logs.
+
+## Options { #DBR-onetl-connection-db-connection-greenplum-read-options }
+
+
+::: onetl.connection.db_connection.greenplum.options.GreenplumReadOptions
+ options:
+ show_root_heading: true
+ heading_level: 3
diff --git a/mddocs/docs/connection/db_connection/greenplum/types.md b/mddocs/docs/connection/db_connection/greenplum/types.md
new file mode 100644
index 000000000..f28338a5a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/greenplum/types.md
@@ -0,0 +1,304 @@
+# Greenplum <-> Spark type mapping { #DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping }
+
+!!! note
+
+ The results below are valid for Spark 3.2.4, and may differ on other Spark versions.
+
+## Type detection & casting { #DBR-onetl-connection-db-connection-greenplum-types-type-detection-casting }
+
+Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+### Reading from Greenplum { #DBR-onetl-connection-db-connection-greenplum-types-reading-from-greenplum }
+
+This is how Greenplum connector performs this:
+
+- Execute query `SELECT * FROM table LIMIT 0` [^1].
+- For each column in query result get column name and Greenplum type.
+- Find corresponding `Greenplum type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Use Spark column projection and predicate pushdown features to build a final query.
+- Create DataFrame from generated query with inferred schema.
+
+[^1]: Yes, **all columns of a table**, not just selected ones.
+ This means that if source table **contains** columns with unsupported type, the entire table cannot be read.
+
+### Writing to some existing Greenplum table { #DBR-onetl-connection-db-connection-greenplum-types-writing-to-some-existing-greenplum-table }
+
+This is how Greenplum connector performs this:
+
+- Get names of columns in DataFrame.
+- Perform `SELECT * FROM table LIMIT 0` query.
+- For each column in query result get column name and Greenplum type.
+- Match table columns with DataFrame columns (by name, case insensitive).
+ If some column is present only in target table, but not in DataFrame (like `DEFAULT` or `SERIAL` column), and vice versa, raise an exception.
+ See [Explicit type cast][DBR-onetl-connection-db-connection-greenplum-types-explicit-type-cast].
+- Find corresponding `Spark type` → `Greenplum type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- If `Greenplum type (write)` matches `Greenplum type (read)`, no additional casts will be performed, DataFrame column will be written to Greenplum as is.
+- If `Greenplum type (write)` does not match `Greenplum type (read)`, DataFrame column will be cast to target column type **on Greenplum side**.
+ For example, you can write column with text data to column of `json` type (which Greenplum connector currently does not support).
+
+### Create new table using Spark { #DBR-onetl-connection-db-connection-greenplum-types-create-new-table-using-spark }
+
+!!! warning
+
+ ABSOLUTELY NOT RECOMMENDED!
+
+This is how Greenplum connector performs this:
+
+- Find corresponding `Spark type` → `Greenplum type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Generate DDL for creating table in Greenplum, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+More details [can be found here](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/write_to_gpdb.html).
+
+But the Greenplum connector supports only a limited number of types and almost no custom clauses (like `PARTITION BY`).
+So instead of relying on Spark to create tables:
+
+??? note "See example"
+
+ ```python
+ writer = DBWriter(
+ connection=greenplum,
+ target="public.table",
+ options=Greenplum.WriteOptions(
+ if_exists="append",
+ # by default distribution is random
+ distributedBy="id",
+ # partitionBy is not supported
+ ),
+ )
+ writer.run(df)
+ ```
+
+Always prefer creating table with desired DDL **BEFORE WRITING DATA**:
+
+??? note "See example"
+
+ ```python
+ greenplum.execute(
+ """
+ CREATE TABLE public.table (
+ id integer,
+ business_dt timestamp(6),
+ value json
+ )
+ PARTITION BY RANGE (business_dt)
+ DISTRIBUTED BY (id)
+ """,
+ )
+
+ writer = DBWriter(
+ connection=greenplum,
+ target="public.table",
+ options=Greenplum.WriteOptions(if_exists="append"),
+ )
+ writer.run(df)
+ ```
+
+See Greenplum [CREATE TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) documentation.
+
+## Supported types { #DBR-onetl-connection-db-connection-greenplum-types-supported-types }
+
+See:
+
+- [official connector documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/reference-datatype_mapping.html)
+- [list of Greenplum types](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-data_types.html)
+
+### Numeric types { #DBR-onetl-connection-db-connection-greenplum-types-numeric-types }
+
+| Greenplum type (read) | Spark type | Greenplum type (write) | Greenplum type (create) |
+|---------------------------------- |----------------------------------- |------------------------------- |------------------------- |
+| `decimal` `decimal(P=0..38)` `decimal(P=0..38, S=0..38)` | `DecimalType(P=38, S=18)` `DecimalType(P=0..38, S=0)` `DecimalType(P=0..38, S=0..38)` | `decimal(P=38, S=18)` `decimal(P=0..38, S=0)` `decimal(P=0..38, S=0..38)` | `decimal` (unbounded) |
+| `decimal(P=39.., S=0..)` | unsupported [^2] | | |
+| `real` | `FloatType()` | `real` | `real` |
+| `double precision` | `DoubleType()` | `double precision` | `double precision` |
+| `-` | `ByteType()` | unsupported | unsupported |
+| `smallint` | `ShortType()` | `smallint` | `smallint` |
+| `integer` | `IntegerType()` | `integer` | `integer` |
+| `bigint` | `LongType()` | `bigint` | `bigint` |
+| `money` `int4range` `int8range` `numrange` `int2vector` | unsupported | | |
+
+[^2]: Greenplum supports decimal types with unlimited precision.
+
+ But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision,
+ this leads to an exception.
+
+### Temporal types { #DBR-onetl-connection-db-connection-greenplum-types-temporal-types }
+
+| Greenplum type (read) | Spark type | Greenplum type (write) | Greenplum type (create) |
+|------------------------------------ |------------------------- |----------------------- |------------------------- |
+| `date` | `DateType()` | `date` | `date` |
+| `time` `time(0..6)` `time with time zone` `time(0..6) with time zone` | `TimestampType()`, time format quirks [^3] | `timestamp` | `timestamp` |
+| `timestamp` `timestamp(0..6)` `timestamp with time zone` `timestamp(0..6) with time zone` | `TimestampType()` | `timestamp` | `timestamp` |
+| `interval` of any precision `daterange` `tsrange` `tstzrange` | unsupported | | |
+
+!!! warning
+
+ Note that types in Greenplum and Spark have different value ranges:
+
+
+ | Greenplum type | Min value | Max value | Spark type | Min value | Max value |
+ |----------------|---------------------------------|----------------------------------|---------------------|--------------------------------|--------------------------------|
+ | `date` | `-4713-01-01` | `5874897-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` |
+ | `timestamp` `time` | `-4713-01-01 00:00:00.000000` `00:00:00.000000` | `294276-12-31 23:59:59.999999` `24:00:00.000000` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` |
+
+ So not all values can be read from Greenplum to Spark.
+
+ References:
+
+ * [Greenplum types documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-data_types.html)
+ * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html)
+ * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html)
+
+[^3]: `time` type is handled the same as `timestamp` with date `1970-01-01`. So instead of reading data from Greenplum like `23:59:59`,
+ what is actually read is `1970-01-01 23:59:59`, and vice versa.
+
+### String types { #DBR-onetl-connection-db-connection-greenplum-types-string-types }
+
+| Greenplum type (read) | Spark type | Greenplum type (write) | Greenplum type (create) |
+|----------------------------- |------------------ |----------------------- |------------------------- |
+| `character` `character(N)` `character varying` `character varying(N)` `text` `xml` `CREATE TYPE ... AS ENUM` |