From 8a5ae79ee9f7ccfb0397738ee373c6c7f4274bca Mon Sep 17 00:00:00 2001 From: sga Date: Tue, 7 Apr 2026 10:47:47 +0300 Subject: [PATCH 01/28] add markdown docs into develop --- mddocs/docs/_static/images/icon.svg | 11 + mddocs/docs/_static/images/logo.svg | 214 ++++++ mddocs/docs/_static/images/logo_wide.svg | 329 +++++++++ .../_static/stylesheets/autodoc_pydantic.css | 11 + mddocs/docs/changelog/0.10.0.md | 533 ++++++++++++++ mddocs/docs/changelog/0.10.1.md | 29 + mddocs/docs/changelog/0.10.2.md | 39 ++ mddocs/docs/changelog/0.11.0.md | 212 ++++++ mddocs/docs/changelog/0.11.1.md | 9 + mddocs/docs/changelog/0.11.2.md | 5 + mddocs/docs/changelog/0.12.0.md | 54 ++ mddocs/docs/changelog/0.12.1.md | 23 + mddocs/docs/changelog/0.12.2.md | 22 + mddocs/docs/changelog/0.12.3.md | 5 + mddocs/docs/changelog/0.12.4.md | 5 + mddocs/docs/changelog/0.12.5.md | 13 + mddocs/docs/changelog/0.13.0.md | 273 ++++++++ mddocs/docs/changelog/0.13.1.md | 9 + mddocs/docs/changelog/0.13.3.md | 6 + mddocs/docs/changelog/0.13.4.md | 10 + mddocs/docs/changelog/0.13.5.md | 11 + mddocs/docs/changelog/0.14.0.md | 43 ++ mddocs/docs/changelog/0.14.1.md | 17 + mddocs/docs/changelog/0.15.0.md | 173 +++++ mddocs/docs/changelog/0.7.0.md | 239 +++++++ mddocs/docs/changelog/0.7.1.md | 40 ++ mddocs/docs/changelog/0.7.2.md | 37 + mddocs/docs/changelog/0.8.0.md | 162 +++++ mddocs/docs/changelog/0.8.1.md | 42 ++ mddocs/docs/changelog/0.9.0.md | 122 ++++ mddocs/docs/changelog/0.9.1.md | 7 + mddocs/docs/changelog/0.9.2.md | 23 + mddocs/docs/changelog/0.9.3.md | 5 + mddocs/docs/changelog/0.9.4.md | 24 + mddocs/docs/changelog/0.9.5.md | 14 + mddocs/docs/changelog/DRAFT.md | 3 + mddocs/docs/changelog/NEXT_RELEASE.md | 1 + mddocs/docs/changelog/index.md | 29 + mddocs/docs/changelog/next_release/.keep | 0 mddocs/docs/concepts.md | 369 ++++++++++ .../db_connection/clickhouse/connection.md | 18 + .../db_connection/clickhouse/execute.md | 133 ++++ .../db_connection/clickhouse/index.md | 17 + 
.../db_connection/clickhouse/prerequisites.md | 71 ++ .../db_connection/clickhouse/read.md | 98 +++ .../db_connection/clickhouse/sql.md | 82 +++ .../db_connection/clickhouse/types.md | 350 ++++++++++ .../db_connection/clickhouse/write.md | 63 ++ .../db_connection/greenplum/connection.md | 18 + .../db_connection/greenplum/execute.md | 191 +++++ .../db_connection/greenplum/index.md | 16 + .../db_connection/greenplum/prerequisites.md | 373 ++++++++++ .../db_connection/greenplum/read.md | 441 ++++++++++++ .../db_connection/greenplum/types.md | 303 ++++++++ .../db_connection/greenplum/write.md | 229 ++++++ .../db_connection/hive/connection.md | 19 + .../connection/db_connection/hive/execute.md | 58 ++ .../connection/db_connection/hive/index.md | 17 + .../db_connection/hive/prerequisites.md | 124 ++++ .../connection/db_connection/hive/read.md | 89 +++ .../connection/db_connection/hive/slots.md | 20 + .../docs/connection/db_connection/hive/sql.md | 81 +++ .../connection/db_connection/hive/write.md | 186 +++++ .../db_connection/iceberg/auth_basic.md | 3 + .../db_connection/iceberg/auth_bearer.md | 3 + .../iceberg/auth_oauth2_client_credentials.md | 3 + .../iceberg/catalog_filesystem.md | 3 + .../db_connection/iceberg/catalog_rest.md | 17 + .../db_connection/iceberg/connection.md | 3 + .../db_connection/iceberg/execute.md | 44 ++ .../connection/db_connection/iceberg/index.md | 24 + .../db_connection/iceberg/prerequisites.md | 29 + .../connection/db_connection/iceberg/read.md | 66 ++ .../connection/db_connection/iceberg/sql.md | 46 ++ .../iceberg/warehouse_delegated.md | 3 + .../iceberg/warehouse_filesystem.md | 3 + .../db_connection/iceberg/warehouse_s3.md | 3 + .../connection/db_connection/iceberg/write.md | 28 + mddocs/docs/connection/db_connection/index.md | 12 + .../connection/db_connection/kafka/auth.md | 19 + .../db_connection/kafka/basic_auth.md | 23 + .../db_connection/kafka/connection.md | 18 + .../connection/db_connection/kafka/index.md | 29 + 
.../db_connection/kafka/kerberos_auth.md | 23 + .../db_connection/kafka/plaintext_protocol.md | 23 + .../db_connection/kafka/prerequisites.md | 65 ++ .../db_connection/kafka/protocol.md | 19 + .../connection/db_connection/kafka/read.md | 145 ++++ .../db_connection/kafka/scram_auth.md | 24 + .../connection/db_connection/kafka/slots.md | 19 + .../db_connection/kafka/ssl_protocol.md | 23 + .../db_connection/kafka/troubleshooting.md | 9 + .../connection/db_connection/kafka/write.md | 85 +++ .../db_connection/mongodb/connection.md | 19 + .../connection/db_connection/mongodb/index.md | 16 + .../db_connection/mongodb/pipeline.md | 48 ++ .../db_connection/mongodb/prerequisites.md | 70 ++ .../connection/db_connection/mongodb/read.md | 143 ++++ .../connection/db_connection/mongodb/types.md | 209 ++++++ .../connection/db_connection/mongodb/write.md | 52 ++ .../db_connection/mssql/connection.md | 18 + .../connection/db_connection/mssql/execute.md | 124 ++++ .../connection/db_connection/mssql/index.md | 17 + .../db_connection/mssql/prerequisites.md | 76 ++ .../connection/db_connection/mssql/read.md | 98 +++ .../connection/db_connection/mssql/sql.md | 82 +++ .../connection/db_connection/mssql/types.md | 260 +++++++ .../connection/db_connection/mssql/write.md | 58 ++ .../db_connection/mysql/connection.md | 18 + .../connection/db_connection/mysql/execute.md | 122 ++++ .../connection/db_connection/mysql/index.md | 17 + .../db_connection/mysql/prerequisites.md | 57 ++ .../connection/db_connection/mysql/read.md | 96 +++ .../connection/db_connection/mysql/sql.md | 82 +++ .../connection/db_connection/mysql/types.md | 265 +++++++ .../connection/db_connection/mysql/write.md | 60 ++ .../db_connection/oracle/connection.md | 18 + .../db_connection/oracle/execute.md | 123 ++++ .../connection/db_connection/oracle/index.md | 17 + .../db_connection/oracle/prerequisites.md | 109 +++ .../connection/db_connection/oracle/read.md | 96 +++ .../connection/db_connection/oracle/sql.md | 82 +++ 
.../connection/db_connection/oracle/types.md | 268 +++++++ .../connection/db_connection/oracle/write.md | 56 ++ .../db_connection/postgres/connection.md | 18 + .../db_connection/postgres/execute.md | 120 ++++ .../db_connection/postgres/index.md | 17 + .../db_connection/postgres/prerequisites.md | 66 ++ .../connection/db_connection/postgres/read.md | 94 +++ .../connection/db_connection/postgres/sql.md | 81 +++ .../db_connection/postgres/types.md | 355 ++++++++++ .../db_connection/postgres/write.md | 58 ++ mddocs/docs/connection/file_connection/ftp.md | 33 + .../docs/connection/file_connection/ftps.md | 33 + .../file_connection/hdfs/connection.md | 33 + .../connection/file_connection/hdfs/index.md | 9 + .../connection/file_connection/hdfs/slots.md | 24 + .../docs/connection/file_connection/index.md | 9 + mddocs/docs/connection/file_connection/s3.md | 32 + .../docs/connection/file_connection/samba.md | 31 + .../docs/connection/file_connection/sftp.md | 33 + .../docs/connection/file_connection/webdav.md | 32 + .../connection/file_df_connection/base.md | 20 + .../connection/file_df_connection/index.md | 15 + .../spark_hdfs/connection.md | 18 + .../file_df_connection/spark_hdfs/index.md | 8 + .../spark_hdfs/prerequisites.md | 44 ++ .../file_df_connection/spark_hdfs/slots.md | 24 + .../file_df_connection/spark_local_fs.md | 17 + .../file_df_connection/spark_s3/connection.md | 20 + .../file_df_connection/spark_s3/index.md | 5 + .../spark_s3/prerequisites.md | 60 ++ .../spark_s3/troubleshooting.md | 363 ++++++++++ mddocs/docs/connection/index.md | 34 + mddocs/docs/contributing.md | 398 +++++++++++ mddocs/docs/db/index.md | 6 + mddocs/docs/db/reader.md | 19 + mddocs/docs/db/writer.md | 24 + .../file/file_downloader/file_downloader.md | 27 + mddocs/docs/file/file_downloader/index.md | 5 + mddocs/docs/file/file_downloader/options.md | 3 + mddocs/docs/file/file_downloader/result.md | 40 ++ mddocs/docs/file/file_filters/base.md | 24 + mddocs/docs/file/file_filters/exclude_dir.md 
| 17 + mddocs/docs/file/file_filters/file_filter.md | 17 + .../file/file_filters/file_mtime_filter.md | 17 + .../file/file_filters/file_size_filter.md | 17 + mddocs/docs/file/file_filters/glob.md | 17 + mddocs/docs/file/file_filters/index.md | 16 + .../file/file_filters/match_all_filters.md | 13 + mddocs/docs/file/file_filters/regexp.md | 18 + mddocs/docs/file/file_limits/base.md | 28 + mddocs/docs/file/file_limits/file_limit.md | 19 + mddocs/docs/file/file_limits/index.md | 15 + .../docs/file/file_limits/limits_reached.md | 13 + .../docs/file/file_limits/limits_stop_at.md | 13 + .../docs/file/file_limits/max_files_count.md | 19 + mddocs/docs/file/file_limits/reset_limits.md | 13 + .../docs/file/file_limits/total_files_size.md | 19 + mddocs/docs/file/file_mover/file_mover.md | 27 + mddocs/docs/file/file_mover/index.md | 5 + mddocs/docs/file/file_mover/options.md | 16 + mddocs/docs/file/file_mover/result.md | 39 ++ .../docs/file/file_uploader/file_uploader.md | 27 + mddocs/docs/file/file_uploader/index.md | 5 + mddocs/docs/file/file_uploader/options.md | 20 + mddocs/docs/file/file_uploader/result.md | 39 ++ mddocs/docs/file/index.md | 7 + .../file_df/file_df_reader/file_df_reader.md | 18 + mddocs/docs/file_df/file_df_reader/index.md | 4 + mddocs/docs/file_df/file_df_reader/options.md | 15 + .../file_df/file_df_writer/file_df_writer.md | 18 + mddocs/docs/file_df/file_df_writer/index.md | 4 + mddocs/docs/file_df/file_df_writer/options.md | 15 + mddocs/docs/file_df/file_formats/avro.md | 29 + mddocs/docs/file_df/file_formats/base.md | 31 + mddocs/docs/file_df/file_formats/csv.md | 48 ++ mddocs/docs/file_df/file_formats/excel.md | 30 + mddocs/docs/file_df/file_formats/index.md | 14 + mddocs/docs/file_df/file_formats/json.md | 40 ++ mddocs/docs/file_df/file_formats/jsonline.md | 40 ++ mddocs/docs/file_df/file_formats/orc.md | 20 + mddocs/docs/file_df/file_formats/parquet.md | 20 + mddocs/docs/file_df/file_formats/xml.md | 39 ++ mddocs/docs/file_df/index.md | 5 + 
mddocs/docs/hooks/design.md | 660 ++++++++++++++++++ mddocs/docs/hooks/global_state.md | 49 ++ mddocs/docs/hooks/hook.md | 50 ++ mddocs/docs/hooks/index.md | 9 + mddocs/docs/hooks/slot.md | 30 + mddocs/docs/hooks/support_hooks.md | 48 ++ mddocs/docs/hwm_store/index.md | 9 + mddocs/docs/hwm_store/yaml_hwm_store.md | 19 + mddocs/docs/index.md | 18 + mddocs/docs/install/files.md | 18 + mddocs/docs/install/full.md | 14 + mddocs/docs/install/index.md | 34 + mddocs/docs/install/kerberos.md | 30 + mddocs/docs/install/minimal.md | 22 + mddocs/docs/install/spark.md | 358 ++++++++++ mddocs/docs/logging.md | 156 +++++ mddocs/docs/nav.md | 224 ++++++ mddocs/docs/plugins.md | 143 ++++ mddocs/docs/quickstart.md | 538 ++++++++++++++ mddocs/docs/security.md | 25 + mddocs/docs/snippet_0.md | 43 ++ .../strategy/incremental_batch_strategy.md | 4 +- .../strategy/incremental_strategy.md | 4 +- mddocs/docs/strategy/index.md | 8 + .../strategy/snapshot_batch_strategy.md | 4 +- .../{ => docs}/strategy/snapshot_strategy.md | 4 +- mddocs/docs/troubleshooting/index.md | 17 + mddocs/docs/troubleshooting/spark.md | 68 ++ mddocs/mkdocs.yml | 110 +++ mddocs/strategy/index.md | 8 - 235 files changed, 15566 insertions(+), 16 deletions(-) create mode 100644 mddocs/docs/_static/images/icon.svg create mode 100644 mddocs/docs/_static/images/logo.svg create mode 100644 mddocs/docs/_static/images/logo_wide.svg create mode 100644 mddocs/docs/_static/stylesheets/autodoc_pydantic.css create mode 100644 mddocs/docs/changelog/0.10.0.md create mode 100644 mddocs/docs/changelog/0.10.1.md create mode 100644 mddocs/docs/changelog/0.10.2.md create mode 100644 mddocs/docs/changelog/0.11.0.md create mode 100644 mddocs/docs/changelog/0.11.1.md create mode 100644 mddocs/docs/changelog/0.11.2.md create mode 100644 mddocs/docs/changelog/0.12.0.md create mode 100644 mddocs/docs/changelog/0.12.1.md create mode 100644 mddocs/docs/changelog/0.12.2.md create mode 100644 mddocs/docs/changelog/0.12.3.md create mode 100644 
mddocs/docs/changelog/0.12.4.md create mode 100644 mddocs/docs/changelog/0.12.5.md create mode 100644 mddocs/docs/changelog/0.13.0.md create mode 100644 mddocs/docs/changelog/0.13.1.md create mode 100644 mddocs/docs/changelog/0.13.3.md create mode 100644 mddocs/docs/changelog/0.13.4.md create mode 100644 mddocs/docs/changelog/0.13.5.md create mode 100644 mddocs/docs/changelog/0.14.0.md create mode 100644 mddocs/docs/changelog/0.14.1.md create mode 100644 mddocs/docs/changelog/0.15.0.md create mode 100644 mddocs/docs/changelog/0.7.0.md create mode 100644 mddocs/docs/changelog/0.7.1.md create mode 100644 mddocs/docs/changelog/0.7.2.md create mode 100644 mddocs/docs/changelog/0.8.0.md create mode 100644 mddocs/docs/changelog/0.8.1.md create mode 100644 mddocs/docs/changelog/0.9.0.md create mode 100644 mddocs/docs/changelog/0.9.1.md create mode 100644 mddocs/docs/changelog/0.9.2.md create mode 100644 mddocs/docs/changelog/0.9.3.md create mode 100644 mddocs/docs/changelog/0.9.4.md create mode 100644 mddocs/docs/changelog/0.9.5.md create mode 100644 mddocs/docs/changelog/DRAFT.md create mode 100644 mddocs/docs/changelog/NEXT_RELEASE.md create mode 100644 mddocs/docs/changelog/index.md create mode 100644 mddocs/docs/changelog/next_release/.keep create mode 100644 mddocs/docs/concepts.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/connection.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/execute.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/index.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/read.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/sql.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/types.md create mode 100644 mddocs/docs/connection/db_connection/clickhouse/write.md create mode 100644 
mddocs/docs/connection/db_connection/greenplum/connection.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/execute.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/index.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/read.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/types.md create mode 100644 mddocs/docs/connection/db_connection/greenplum/write.md create mode 100644 mddocs/docs/connection/db_connection/hive/connection.md create mode 100644 mddocs/docs/connection/db_connection/hive/execute.md create mode 100644 mddocs/docs/connection/db_connection/hive/index.md create mode 100644 mddocs/docs/connection/db_connection/hive/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/hive/read.md create mode 100644 mddocs/docs/connection/db_connection/hive/slots.md create mode 100644 mddocs/docs/connection/db_connection/hive/sql.md create mode 100644 mddocs/docs/connection/db_connection/hive/write.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/auth_basic.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/auth_bearer.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/auth_oauth2_client_credentials.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/catalog_filesystem.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/catalog_rest.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/connection.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/execute.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/index.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/read.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/sql.md create mode 100644 
mddocs/docs/connection/db_connection/iceberg/warehouse_delegated.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/warehouse_filesystem.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/warehouse_s3.md create mode 100644 mddocs/docs/connection/db_connection/iceberg/write.md create mode 100644 mddocs/docs/connection/db_connection/index.md create mode 100644 mddocs/docs/connection/db_connection/kafka/auth.md create mode 100644 mddocs/docs/connection/db_connection/kafka/basic_auth.md create mode 100644 mddocs/docs/connection/db_connection/kafka/connection.md create mode 100644 mddocs/docs/connection/db_connection/kafka/index.md create mode 100644 mddocs/docs/connection/db_connection/kafka/kerberos_auth.md create mode 100644 mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md create mode 100644 mddocs/docs/connection/db_connection/kafka/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/kafka/protocol.md create mode 100644 mddocs/docs/connection/db_connection/kafka/read.md create mode 100644 mddocs/docs/connection/db_connection/kafka/scram_auth.md create mode 100644 mddocs/docs/connection/db_connection/kafka/slots.md create mode 100644 mddocs/docs/connection/db_connection/kafka/ssl_protocol.md create mode 100644 mddocs/docs/connection/db_connection/kafka/troubleshooting.md create mode 100644 mddocs/docs/connection/db_connection/kafka/write.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/connection.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/index.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/pipeline.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/read.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/types.md create mode 100644 mddocs/docs/connection/db_connection/mongodb/write.md create mode 100644 
mddocs/docs/connection/db_connection/mssql/connection.md create mode 100644 mddocs/docs/connection/db_connection/mssql/execute.md create mode 100644 mddocs/docs/connection/db_connection/mssql/index.md create mode 100644 mddocs/docs/connection/db_connection/mssql/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/mssql/read.md create mode 100644 mddocs/docs/connection/db_connection/mssql/sql.md create mode 100644 mddocs/docs/connection/db_connection/mssql/types.md create mode 100644 mddocs/docs/connection/db_connection/mssql/write.md create mode 100644 mddocs/docs/connection/db_connection/mysql/connection.md create mode 100644 mddocs/docs/connection/db_connection/mysql/execute.md create mode 100644 mddocs/docs/connection/db_connection/mysql/index.md create mode 100644 mddocs/docs/connection/db_connection/mysql/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/mysql/read.md create mode 100644 mddocs/docs/connection/db_connection/mysql/sql.md create mode 100644 mddocs/docs/connection/db_connection/mysql/types.md create mode 100644 mddocs/docs/connection/db_connection/mysql/write.md create mode 100644 mddocs/docs/connection/db_connection/oracle/connection.md create mode 100644 mddocs/docs/connection/db_connection/oracle/execute.md create mode 100644 mddocs/docs/connection/db_connection/oracle/index.md create mode 100644 mddocs/docs/connection/db_connection/oracle/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/oracle/read.md create mode 100644 mddocs/docs/connection/db_connection/oracle/sql.md create mode 100644 mddocs/docs/connection/db_connection/oracle/types.md create mode 100644 mddocs/docs/connection/db_connection/oracle/write.md create mode 100644 mddocs/docs/connection/db_connection/postgres/connection.md create mode 100644 mddocs/docs/connection/db_connection/postgres/execute.md create mode 100644 mddocs/docs/connection/db_connection/postgres/index.md create mode 100644 
mddocs/docs/connection/db_connection/postgres/prerequisites.md create mode 100644 mddocs/docs/connection/db_connection/postgres/read.md create mode 100644 mddocs/docs/connection/db_connection/postgres/sql.md create mode 100644 mddocs/docs/connection/db_connection/postgres/types.md create mode 100644 mddocs/docs/connection/db_connection/postgres/write.md create mode 100644 mddocs/docs/connection/file_connection/ftp.md create mode 100644 mddocs/docs/connection/file_connection/ftps.md create mode 100644 mddocs/docs/connection/file_connection/hdfs/connection.md create mode 100644 mddocs/docs/connection/file_connection/hdfs/index.md create mode 100644 mddocs/docs/connection/file_connection/hdfs/slots.md create mode 100644 mddocs/docs/connection/file_connection/index.md create mode 100644 mddocs/docs/connection/file_connection/s3.md create mode 100644 mddocs/docs/connection/file_connection/samba.md create mode 100644 mddocs/docs/connection/file_connection/sftp.md create mode 100644 mddocs/docs/connection/file_connection/webdav.md create mode 100644 mddocs/docs/connection/file_df_connection/base.md create mode 100644 mddocs/docs/connection/file_df_connection/index.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_hdfs/index.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_local_fs.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_s3/connection.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_s3/index.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md create mode 100644 mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md create mode 100644 mddocs/docs/connection/index.md create mode 
100644 mddocs/docs/contributing.md create mode 100644 mddocs/docs/db/index.md create mode 100644 mddocs/docs/db/reader.md create mode 100644 mddocs/docs/db/writer.md create mode 100644 mddocs/docs/file/file_downloader/file_downloader.md create mode 100644 mddocs/docs/file/file_downloader/index.md create mode 100644 mddocs/docs/file/file_downloader/options.md create mode 100644 mddocs/docs/file/file_downloader/result.md create mode 100644 mddocs/docs/file/file_filters/base.md create mode 100644 mddocs/docs/file/file_filters/exclude_dir.md create mode 100644 mddocs/docs/file/file_filters/file_filter.md create mode 100644 mddocs/docs/file/file_filters/file_mtime_filter.md create mode 100644 mddocs/docs/file/file_filters/file_size_filter.md create mode 100644 mddocs/docs/file/file_filters/glob.md create mode 100644 mddocs/docs/file/file_filters/index.md create mode 100644 mddocs/docs/file/file_filters/match_all_filters.md create mode 100644 mddocs/docs/file/file_filters/regexp.md create mode 100644 mddocs/docs/file/file_limits/base.md create mode 100644 mddocs/docs/file/file_limits/file_limit.md create mode 100644 mddocs/docs/file/file_limits/index.md create mode 100644 mddocs/docs/file/file_limits/limits_reached.md create mode 100644 mddocs/docs/file/file_limits/limits_stop_at.md create mode 100644 mddocs/docs/file/file_limits/max_files_count.md create mode 100644 mddocs/docs/file/file_limits/reset_limits.md create mode 100644 mddocs/docs/file/file_limits/total_files_size.md create mode 100644 mddocs/docs/file/file_mover/file_mover.md create mode 100644 mddocs/docs/file/file_mover/index.md create mode 100644 mddocs/docs/file/file_mover/options.md create mode 100644 mddocs/docs/file/file_mover/result.md create mode 100644 mddocs/docs/file/file_uploader/file_uploader.md create mode 100644 mddocs/docs/file/file_uploader/index.md create mode 100644 mddocs/docs/file/file_uploader/options.md create mode 100644 mddocs/docs/file/file_uploader/result.md create mode 100644 
mddocs/docs/file/index.md create mode 100644 mddocs/docs/file_df/file_df_reader/file_df_reader.md create mode 100644 mddocs/docs/file_df/file_df_reader/index.md create mode 100644 mddocs/docs/file_df/file_df_reader/options.md create mode 100644 mddocs/docs/file_df/file_df_writer/file_df_writer.md create mode 100644 mddocs/docs/file_df/file_df_writer/index.md create mode 100644 mddocs/docs/file_df/file_df_writer/options.md create mode 100644 mddocs/docs/file_df/file_formats/avro.md create mode 100644 mddocs/docs/file_df/file_formats/base.md create mode 100644 mddocs/docs/file_df/file_formats/csv.md create mode 100644 mddocs/docs/file_df/file_formats/excel.md create mode 100644 mddocs/docs/file_df/file_formats/index.md create mode 100644 mddocs/docs/file_df/file_formats/json.md create mode 100644 mddocs/docs/file_df/file_formats/jsonline.md create mode 100644 mddocs/docs/file_df/file_formats/orc.md create mode 100644 mddocs/docs/file_df/file_formats/parquet.md create mode 100644 mddocs/docs/file_df/file_formats/xml.md create mode 100644 mddocs/docs/file_df/index.md create mode 100644 mddocs/docs/hooks/design.md create mode 100644 mddocs/docs/hooks/global_state.md create mode 100644 mddocs/docs/hooks/hook.md create mode 100644 mddocs/docs/hooks/index.md create mode 100644 mddocs/docs/hooks/slot.md create mode 100644 mddocs/docs/hooks/support_hooks.md create mode 100644 mddocs/docs/hwm_store/index.md create mode 100644 mddocs/docs/hwm_store/yaml_hwm_store.md create mode 100644 mddocs/docs/index.md create mode 100644 mddocs/docs/install/files.md create mode 100644 mddocs/docs/install/full.md create mode 100644 mddocs/docs/install/index.md create mode 100644 mddocs/docs/install/kerberos.md create mode 100644 mddocs/docs/install/minimal.md create mode 100644 mddocs/docs/install/spark.md create mode 100644 mddocs/docs/logging.md create mode 100644 mddocs/docs/nav.md create mode 100644 mddocs/docs/plugins.md create mode 100644 mddocs/docs/quickstart.md create mode 100644 
mddocs/docs/security.md create mode 100644 mddocs/docs/snippet_0.md rename mddocs/{ => docs}/strategy/incremental_batch_strategy.md (78%) rename mddocs/{ => docs}/strategy/incremental_strategy.md (80%) create mode 100644 mddocs/docs/strategy/index.md rename mddocs/{ => docs}/strategy/snapshot_batch_strategy.md (78%) rename mddocs/{ => docs}/strategy/snapshot_strategy.md (80%) create mode 100644 mddocs/docs/troubleshooting/index.md create mode 100644 mddocs/docs/troubleshooting/spark.md create mode 100644 mddocs/mkdocs.yml delete mode 100644 mddocs/strategy/index.md diff --git a/mddocs/docs/_static/images/icon.svg b/mddocs/docs/_static/images/icon.svg new file mode 100644 index 000000000..a4d737f81 --- /dev/null +++ b/mddocs/docs/_static/images/icon.svg @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/mddocs/docs/_static/images/logo.svg b/mddocs/docs/_static/images/logo.svg new file mode 100644 index 000000000..76527ebf1 --- /dev/null +++ b/mddocs/docs/_static/images/logo.svg @@ -0,0 +1,214 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mddocs/docs/_static/images/logo_wide.svg b/mddocs/docs/_static/images/logo_wide.svg new file mode 100644 index 000000000..981bf0148 --- /dev/null +++ b/mddocs/docs/_static/images/logo_wide.svg @@ -0,0 +1,329 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/mddocs/docs/_static/stylesheets/autodoc_pydantic.css b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css new file mode 100644 index 000000000..994a3e548 --- /dev/null +++ b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css @@ -0,0 +1,11 @@ +.autodoc_pydantic_validator_arrow { + padding-left: 8px; + } + +.autodoc_pydantic_collapsable_json { + cursor: pointer; + } + +.autodoc_pydantic_collapsable_erd { + cursor: pointer; + } \ No newline at end of file diff --git a/mddocs/docs/changelog/0.10.0.md b/mddocs/docs/changelog/0.10.0.md new file mode 100644 index 000000000..870cc8d19 --- /dev/null +++ b/mddocs/docs/changelog/0.10.0.md @@ -0,0 +1,533 @@ +# 0.10.0 (2023-12-18) { #DBR-onetl-changelog-0-10-0 } + +## Breaking Changes { #DBR-onetl-changelog-0-10-0-breaking-changes } + +- Upgrade `etl-entities` from v1 to v2 ([#172](https://github.com/MTSWebServices/onetl/pull/172)). + + This implies that `HWM` classes are now have different internal structure than they used to. 
+ + Before: + + ```python + from etl_entities.old_hwm import IntHWM as OldIntHWM + from etl_entities.source import Column, Table + from etl_entities.process import Process + + hwm = OldIntHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=Table(name="schema.table", instance="postgres://host:5432/db"), + column=Column(name="col1"), + value=123, + ) + ``` + + After: + + ```python + from etl_entities.hwm import ColumnIntHWM + + hwm = ColumnIntHWM( + name="some_unique_name", + description="any value you want", + source="schema.table", + expression="col1", + value=123, + ) + ``` + + **Breaking change:** If you used HWM classes from `etl_entities` module, you should rewrite your code to make it compatible with new version. + +??? "More details" + + - `HWM` classes used by previous onETL versions were moved from `etl_entities` to `etl_entities.old_hwm` submodule. They are here for compatibility reasons, but are planned to be removed in `etl-entities` v3 release. + - New `HWM` classes have flat structure instead of nested. + - New `HWM` classes have mandatory `name` attribute (it was known as `qualified_name` before). + - Type aliases used while serializing and deserializing `HWM` objects to `dict` representation were changed too: `int` → `column_int`. + + To make migration simpler, you can use new method: + + ```python + old_hwm = OldIntHWM(...) + new_hwm = old_hwm.as_new_hwm() + ``` + + Which automatically converts all fields from old structure to new one, including `qualified_name` → `name`. + +- **Breaking changes:** + + - Methods `BaseHWMStore.get()` and `BaseHWMStore.save()` were renamed to `get_hwm()` and `set_hwm()`. + - They now can be used only with new HWM classes from `etl_entities.hwm`, **old HWM classes are not supported**. + + If you used them in your code, please update it accordingly. + +- YAMLHWMStore **CANNOT read files created by older onETL versions** (0.9.x or older). + +??? 
"Update procedure" + + ```python + # pip install onetl==0.9.5 + + # Get qualified_name for HWM + + + # Option 1. HWM is built manually + from etl_entities import IntHWM, FileListHWM + from etl_entities.source import Column, Table, RemoteFolder + from etl_entities.process import Process + + # for column HWM + old_column_hwm = IntHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=Table(name="schema.table", instance="postgres://host:5432/db"), + column=Column(name="col1"), + ) + qualified_name = old_column_hwm.qualified_name + # "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + # for file HWM + old_file_hwm = FileListHWM( + process=Process(name="myprocess", task="abc", dag="cde", host="myhost"), + source=RemoteFolder(name="/absolute/path", instance="ftp://ftp.server:21"), + ) + qualified_name = old_file_hwm.qualified_name + # "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost" + + + # Option 2. HWM is generated automatically (by DBReader/FileDownloader) + # See onETL logs and search for string like qualified_name = '...' 
+ + qualified_name = "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost" + + + # Get .yml file path by qualified_name + + import os + from pathlib import PurePosixPath + from onetl.hwm.store import YAMLHWMStore + + # here you should pass the same arguments as used on production, if any + yaml_hwm_store = YAMLHWMStore() + hwm_path = yaml_hwm_store.get_file_path(qualified_name) + print(hwm_path) + + # for column HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/col1__schema.table__postgres_host_5432_db__cde.abc.myprocess__myhost.yml') + + # for file HWM + # LocalPosixPath('/home/maxim/.local/share/onETL/yml_hwm_store/file_list__absolute_path__ftp_ftp.server_21__cde.abc.myprocess__myhost.yml') + + + # Read raw .yml file content + + from yaml import safe_load, dump + + raw_old_hwm_items = safe_load(hwm_path.read_text()) + print(raw_old_hwm_items) + + # for column HWM + # [ + # { + # "column": { "name": "col1", "partition": {} }, + # "modified_time": "2023-12-18T10: 39: 47.377378", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "postgres: //host:5432/db", "name": "schema.table" }, + # "type": "int", + # "value": "123", + # }, + # ] + + # for file HWM + # [ + # { + # "modified_time": "2023-12-18T11:15:36.478462", + # "process": { "dag": "cde", "host": "myhost", "name": "myprocess", "task": "abc" }, + # "source": { "instance": "ftp://ftp.server:21", "name": "/absolute/path" }, + # "type": "file_list", + # "value": ["file1.txt", "file2.txt"], + # }, + # ] + + + # Convert file content to new structure, compatible with onETL 0.10.x + raw_new_hwm_items = [] + for old_hwm in raw_old_hwm_items: + new_hwm = {"name": qualified_name, "modified_time": old_hwm["modified_time"]} + + if "column" in old_hwm: + new_hwm["expression"] = old_hwm["column"]["name"] + new_hwm["entity"] = old_hwm["source"]["name"] + old_hwm.pop("process", None) + + if old_hwm["type"] == "int": + 
new_hwm["type"] = "column_int" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "date": + new_hwm["type"] = "column_date" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "datetime": + new_hwm["type"] = "column_datetime" + new_hwm["value"] = old_hwm["value"] + + elif old_hwm["type"] == "file_list": + new_hwm["type"] = "file_list" + new_hwm["value"] = [ + os.fspath(PurePosixPath(old_hwm["source"]["name"]).joinpath(path)) + for path in old_hwm["value"] + ] + + else: + raise ValueError("WAT?") + + raw_new_hwm_items.append(new_hwm) + + + print(raw_new_hwm_items) + # for column HWM + # [ + # { + # "name": "col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost", + # "modified_time": "2023-12-18T10:39:47.377378", + # "expression": "col1", + # "source": "schema.table", + # "type": "column_int", + # "value": 123, + # }, + # ] + + # for file HWM + # [ + # { + # "name": "file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost", + # "modified_time": "2023-12-18T11:15:36.478462", + # "entity": "/absolute/path", + # "type": "file_list", + # "value": ["/absolute/path/file1.txt", "/absolute/path/file2.txt"], + # }, + # ] + + + # Save file with new content + with open(hwm_path, "w") as file: + dump(raw_new_hwm_items, file) + + + # Stop Python interpreter and update onETL + # pip install onetl==0.10.0 + # Check that new .yml file can be read + + from onetl.hwm.store import YAMLHWMStore + + qualified_name = ... 
+
+ # here you should pass the same arguments as used on production, if any
+ yaml_hwm_store = YAMLHWMStore()
+ yaml_hwm_store.get_hwm(qualified_name)
+
+ # for column HWM
+ # ColumnIntHWM(
+ # name='col1#schema.table@postgres://host:5432/db#cde.abc.myprocess@myhost',
+ # description='',
+ # entity='schema.table',
+ # value=123,
+ # expression='col1',
+ # modified_time=datetime.datetime(2023, 12, 18, 10, 39, 47, 377378),
+ # )
+
+ # for file HWM
+ # FileListHWM(
+ # name='file_list#/absolute/path@ftp://ftp.server:21#cde.abc.myprocess@myhost',
+ # description='',
+ # entity=AbsolutePath('/absolute/path'),
+ # value=frozenset({AbsolutePath('/absolute/path/file1.txt'), AbsolutePath('/absolute/path/file2.txt')}),
+ # expression=None,
+ # modified_time=datetime.datetime(2023, 12, 18, 11, 15, 36, 478462)
+ # )
+
+
+ # That's all!
+ ```
+
+But most of users use other HWM store implementations which do not have such issues.
+
+- Several classes and functions were moved from `onetl` to `etl_entities`:
+
+=== "onETL `0.9.x` and older"
+
+ ```python
+ from onetl.hwm.store import (
+ detect_hwm_store,
+ BaseHWMStore,
+ HWMStoreClassRegistry,
+ register_hwm_store_class,
+ HWMStoreManager,
+ MemoryHWMStore,
+ )
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python
+ from etl_entities.hwm_store import (
+ detect_hwm_store,
+ BaseHWMStore,
+ HWMStoreClassRegistry,
+ register_hwm_store_class,
+ HWMStoreManager,
+ MemoryHWMStore,
+ )
+ ```
+
+ They still can be imported from old module, but this is deprecated and will be removed in v1.0.0 release.
+
+- Change the way of passing `HWM` to `DBReader` and `FileDownloader` classes:
+
+=== "onETL `0.9.x` and older"
+
+ ```python linenums="1" hl_lines="12-21"
+ # Simple
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm_column="col1",
+ )
+
+
+ # Complex
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm_column=(
+ "col1",
+ "cast(col1 as date)",
+ ),
+ )
+
+
+ # Files
+ downloader = FileDownloader(
+ connection=...,
+ source_path=...,
+ target_path=...,
+ hwm_type="file_list",
+ )
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python linenums="1" hl_lines="12-21"
+ # Simple
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm=DBReader.AutoDetectHWM(
+ # name is mandatory now!
+ name="my_unique_hwm_name",
+ expression="col1",
+ ),
+ )
+
+ # Complex
+ reader = DBReader(
+ connection=...,
+ source=...,
+ hwm=DBReader.AutoDetectHWM(
+ # name is mandatory now!
+ name="my_unique_hwm_name",
+ expression="cast(col1 as date)",
+ ),
+ )
+
+ # Files
+ downloader = FileDownloader(
+ connection=...,
+ source_path=...,
+ target_path=...,
+ hwm=FileListHWM(
+ # name is mandatory now!
+ name="another_unique_hwm_name",
+ ),
+ )
+ ```
+
+ New HWM classes have **mandatory** `name` attribute which should be passed explicitly,
+ instead of generating it automatically under the hood.
+
+ Automatic `name` generation using the old `DBReader.hwm_column` / `FileDownloader.hwm_type`
+ syntax is still supported, but will be removed in v1.0.0 release. ([#179](https://github.com/MTSWebServices/onetl/pull/179))
+
+- Performance of read Incremental and Batch strategies has been drastically improved. ([#182](https://github.com/MTSWebServices/onetl/pull/182)).
+
+???
"Before and after in details"
+
+ `DBReader.run()` + incremental/batch strategy behavior in versions 0.9.x and older:
+
+ - Get table schema by making query `SELECT * FROM table WHERE 1=0` (if `DBReader.columns` has `*`)
+ - Expand `*` to real column names from table, add here `hwm_column`, remove duplicates (as some RDBMS does not allow that).
+ - Create dataframe from query like `SELECT hwm_expression AS hwm_column, ...other table columns... FROM table WHERE hwm_expression > prev_hwm.value`.
+ - Determine HWM class using dataframe schema: `df.schema[hwm_column].dataType`.
+ - Determine max HWM column value using Spark: `df.select(max(hwm_column)).collect()`.
+ - Use `max(hwm_column)` as next HWM value, and save it to HWM Store.
+ - Return dataframe to user.
+
+ This was far from ideal:
+
+ - Dataframe content (all rows or just changed ones) was loaded from the source to Spark only to get min/max values of specific column.
+
+ - Step of fetching table schema and then substituting column names in the next query caused some unexpected errors.
+
+ For example, source contains columns with mixed name case, like `"CamelColumn"` or `"spaced column"`.
+
+ Column names were *not* escaped during query generation, leading to queries that cannot be executed by database.
+
+ So users have to *explicitly* pass column names to `DBReader`, wrapping columns with mixed naming with `"`:
+
+ ```python
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[ # passing '*' here leads to wrong SQL query generation
+ "normal_column",
+ '"CamelColumn"',
+ '"spaced column"',
+ ...,
+ ],
+ )
+ ```
+ - Using `DBReader` with `IncrementalStrategy` could lead to reading rows already read before.
+
+ Dataframe was created from query with WHERE clause like `hwm.expression > prev_hwm.value`,
+ not `hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`.
+
+ So if new rows appeared in the source **after** HWM value is determined,
+ they can be read by accessing dataframe content (because Spark dataframes are lazy),
+ leading to inconsistencies between HWM value and dataframe content.
+
+ This may lead to issues when `DBReader.run()` read some data, updated HWM value, and next call of `DBReader.run()`
+ will read rows that were already read in previous run.
+
+ `DBReader.run()` + incremental/batch strategy behavior in versions 0.10.x and newer:
+
+ - Detect type of HWM expression: `SELECT hwm.expression FROM table WHERE 1=0`.
+ - Determine corresponding Spark type `df.schema[0]` and then determine matching HWM class (if `DBReader.AutoDetectHWM` is used).
+ - Get min/max values by querying the source: `SELECT MAX(hwm.expression) FROM table WHERE hwm.expression >= prev_hwm.value`.
+ - Use `max(hwm.expression)` as next HWM value, and save it to HWM Store.
+ - Create dataframe from query `SELECT ... table columns ... FROM table WHERE hwm.expression > prev_hwm.value AND hwm.expression <= current_hwm.value`, baking new HWM value into the query.
+ - Return dataframe to user.
+
+ Improvements:
+
+ - Allow source to calculate min/max instead of loading everything to Spark. This should be **faster** on large amounts of data (**up to x2**), because we do not transfer all the data from the source to Spark. This can be even faster if source has indexes for HWM column.
+ - Columns list is passed to source as-is, without any resolving on `DBReader` side. So you can pass `DBReader(columns=["*"])` to read tables with mixed columns naming.
+ - Restrict dataframe content to always match HWM values, which leads to never reading the same row twice.
+
+ **Breaking change**: HWM column is not being implicitly added to dataframe. It was a part of `SELECT` clause, but now it is mentioned only in `WHERE` clause.
+
+ So if you had code like this, you have to rewrite it:
+
+=== "onETL `0.9.x` and older"
+
+ ```python linenums="1" hl_lines="1-16"
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ ],
+ hwm_column="hwm_col",
+ )
+
+ df = reader.run()
+ # hwm_column value is in the dataframe
+ assert df.columns == ["col1", "col2", "hwm_col"]
+
+
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ ],
+ hwm_column=(
+ "hwm_col",
+ "cast(hwm_col as int)",
+ ),
+ )
+
+ df = reader.run()
+ # hwm_expression value is in the dataframe
+ assert df.columns == ["col1", "col2", "hwm_col"]
+ ```
+
+=== "onETL `0.10.x` and newer"
+
+ ```python linenums="1" hl_lines="1-16"
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ # add hwm_column explicitly
+ "hwm_col",
+ ],
+ hwm_column="hwm_col",
+ )
+
+ df = reader.run()
+ # if columns list is not updated,
+ # this will fail
+ assert df.columns == ["col1", "col2", "hwm_col"]
+
+ reader = DBReader(
+ connection=...,
+ source=...,
+ columns=[
+ "col1",
+ "col2",
+ # add hwm_expression explicitly
+ "cast(hwm_col as int) as hwm_col",
+ ],
+ hwm_column=(
+ "hwm_col",
+ "cast(hwm_col as int)",
+ ),
+ )
+ df = reader.run()
+ # if columns list is not updated,
+ # this will fail
+ assert df.columns == ["col1", "col2", "hwm_col"]
+ ```
+
+ But most users just use `columns=["*"]` anyway, they won't see any changes.
+
+- `FileDownloader.run()` now updates HWM in HWM Store not after each file is being successfully downloaded,
+ but after all files were handled.
+
+ This is because:
+
+ - FileDownloader can be used with `DownloadOptions(workers=N)`, which could lead to race condition - one thread can save to HWM store one HWM value when another thread will save different value.
+ - FileDownloader can download hundreds and thousands of files, and issuing a request to HWM Store for each file could potentially DDoS HWM Store.
([#189](https://github.com/MTSWebServices/onetl/pull/189))
+
+ There is an exception handler which tries to save HWM to HWM store if download process was interrupted. But if it was interrupted by force, like sending `SIGKILL` event,
+ HWM will not be saved to HWM store, so some already downloaded files may be downloaded again next time.
+
+ But unexpected process kill may produce other negative impact, like some file will be downloaded partially, so this is an expected behavior.
+
+## Features { #DBR-onetl-changelog-0-10-0-features }
+
+- Add Python 3.12 compatibility. ([#167](https://github.com/MTSWebServices/onetl/pull/167))
+- `Excel` file format now can be used with Spark 3.5.0. ([#187](https://github.com/MTSWebServices/onetl/pull/187))
+- `SnapshotBatchStrategy` and `IncrementalBatchStrategy` do not raise exceptions if source does not contain any data.
+ Instead they stop at first iteration and return empty dataframe. ([#188](https://github.com/MTSWebServices/onetl/pull/188))
+- Cache result of `connection.check()` in high-level classes like `DBReader`, `FileDownloader` and so on. This makes logs less verbose. ([#190](https://github.com/MTSWebServices/onetl/pull/190))
+
+## Bug Fixes { #DBR-onetl-changelog-0-10-0-bug-fixes }
+
+- Fix `@slot` and `@hook` decorators returning methods with missing arguments in signature (Pylance, VS Code). ([#183](https://github.com/MTSWebServices/onetl/pull/183))
+- Kafka connector documentation said that it does support reading topic data incrementally by passing `group.id` or `groupIdPrefix`.
+ Actually, this is not true, because Spark does not send information to Kafka which messages were consumed.
+ So currently users can only read the whole topic, no incremental reads are supported.
diff --git a/mddocs/docs/changelog/0.10.1.md b/mddocs/docs/changelog/0.10.1.md
new file mode 100644
index 000000000..58d2512f1
--- /dev/null
+++ b/mddocs/docs/changelog/0.10.1.md
@@ -0,0 +1,29 @@
+# 0.10.1 (2024-02-05) { #DBR-onetl-changelog-0-10-1 }
+
+## Features { #DBR-onetl-changelog-0-10-1-features }
+
+- Add support of `Incremental Strategies` for `Kafka` connection:
+
+ ```python
+ reader = DBReader(
+ connection=Kafka(...),
+ source="topic_name",
+ hwm=DBReader.AutoDetectHWM(name="some_hwm_name", expression="offset"),
+ )
+
+ with IncrementalStrategy():
+ df = reader.run()
+ ```
+
+ This lets you resume reading data from a Kafka topic starting at the last committed offset from your previous run. ([#202](https://github.com/MTSWebServices/onetl/pull/202))
+
+- Add `has_data`, `raise_if_no_data` methods to `DBReader` class. ([#203](https://github.com/MTSWebServices/onetl/pull/203))
+
+- Update VMware Greenplum connector from `2.1.4` to `2.3.0`. This implies:
+ - Greenplum 7.x support
+ - [Kubernetes support](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg)
+ - New read option [gpdb.matchDistributionPolicy](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#distpolmotion)
+ which allows to match each Spark executor with specific Greenplum segment, avoiding redundant data transfer between Greenplum segments
+ - Allows overriding [Greenplum optimizer parameters](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#greenplum-gucs) in read/write operations ([#208](https://github.com/MTSWebServices/onetl/pull/208))
+
+- `Greenplum.get_packages()` method now accepts optional arg `package_version` which allows to override version of Greenplum connector package.
([#208](https://github.com/MTSWebServices/onetl/pull/208)) diff --git a/mddocs/docs/changelog/0.10.2.md b/mddocs/docs/changelog/0.10.2.md new file mode 100644 index 000000000..930316a03 --- /dev/null +++ b/mddocs/docs/changelog/0.10.2.md @@ -0,0 +1,39 @@ +# 0.10.2 (2024-03-21) { #DBR-onetl-changelog-0-10-2 } + +## Features { #DBR-onetl-changelog-0-10-2-features } + +- Add support of Pydantic v2. ([#230](https://github.com/MTSWebServices/onetl/pull/230)) + +## Improvements { #DBR-onetl-changelog-0-10-2-improvements } + +- Improve database connections documentation: + - Add "Types" section describing mapping between Clickhouse and Spark types + - Add "Prerequisites" section describing different aspects of connecting to Clickhouse + - Separate documentation of `DBReader` and `.sql()` / `.pipeline(...)` + - Add examples for `.fetch()` and `.execute()` ([#211](https://github.com/MTSWebServices/onetl/pull/211), [#228](https://github.com/MTSWebServices/onetl/pull/228), [#229](https://github.com/MTSWebServices/onetl/pull/229), [#233](https://github.com/MTSWebServices/onetl/pull/233), [#234](https://github.com/MTSWebServices/onetl/pull/234), [#235](https://github.com/MTSWebServices/onetl/pull/235), [#236](https://github.com/MTSWebServices/onetl/pull/236), [#240](https://github.com/MTSWebServices/onetl/pull/240)) + +- Add notes to Greenplum documentation about issues with IP resolution and building `gpfdist` URL ([#228](https://github.com/MTSWebServices/onetl/pull/228)) + +- Allow calling `MongoDB.pipeline(...)` with passing just collection name, without explicit aggregation pipeline. ([#237](https://github.com/MTSWebServices/onetl/pull/237)) + +- Update default `Postgres(extra={...})` to include `{"stringtype": "unspecified"}` option. + This allows to write text data to non-text column (or vice versa), relying to Postgres cast capabilities. 
+
+ For example, now it is possible to read column of type `money` as Spark's `StringType()`, and write it back to the same column,
+ without using intermediate columns or tables. ([#229](https://github.com/MTSWebServices/onetl/pull/229))
+
+## Bug Fixes { #DBR-onetl-changelog-0-10-2-bug-fixes }
+
+- Return back handling of `DBReader(columns="string")`. This was a valid syntax up to v0.10 release, but it was removed because
+ most of users never used it. It looks that we were wrong, returning this behavior back, but with deprecation warning. ([#238](https://github.com/MTSWebServices/onetl/pull/238))
+
+- Downgrade Greenplum package version from `2.3.0` to `2.2.0`. ([#239](https://github.com/MTSWebServices/onetl/pull/239))
+
+ This is because version 2.3.0 introduced issues with writing data to Greenplum 6.x.
+ Connector can open transaction with `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks.
+
+ For using this connector with Greenplum 7.x, please pass package version explicitly:
+
+ ```python
+ maven_packages = Greenplum.get_packages(package_version="2.3.0", ...)
+ ```
diff --git a/mddocs/docs/changelog/0.11.0.md b/mddocs/docs/changelog/0.11.0.md
new file mode 100644
index 000000000..b093991ad
--- /dev/null
+++ b/mddocs/docs/changelog/0.11.0.md
@@ -0,0 +1,212 @@
+# 0.11.0 (2024-05-27) { #DBR-onetl-changelog-0-11-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-11-0-breaking-changes }
+
+There can be some changes in connection behavior, related to version upgrades. So we mark these changes as **breaking** although
+most of users will not see any differences.
+
+- Update Clickhouse JDBC driver to latest version ([#249](https://github.com/MTSWebServices/onetl/pull/249)):
+ - Package was renamed `ru.yandex.clickhouse:clickhouse-jdbc` → `com.clickhouse:clickhouse-jdbc`.
+ - Package version changed `0.3.2` → `0.6.0-patch5`.
+ - Driver name changed `ru.yandex.clickhouse.ClickHouseDriver` → `com.clickhouse.jdbc.ClickHouseDriver`.
+
+ This brings up several fixes for Spark <-> Clickhouse type compatibility, and also Clickhouse clusters support.
+
+- Update other JDBC drivers to latest versions:
+ - MSSQL `12.2.0` → `12.6.2` ([#254](https://github.com/MTSWebServices/onetl/pull/254)).
+ - MySQL `8.0.33` → `8.4.0` ([#253](https://github.com/MTSWebServices/onetl/pull/253), [#285](https://github.com/MTSWebServices/onetl/pull/285)).
+ - Oracle `23.2.0.0` → `23.4.0.24.05` ([#252](https://github.com/MTSWebServices/onetl/pull/252), [#284](https://github.com/MTSWebServices/onetl/pull/284)).
+ - Postgres `42.6.0` → `42.7.3` ([#251](https://github.com/MTSWebServices/onetl/pull/251)).
+
+- Update MongoDB connector to latest version: `10.1.1` → `10.3.0` ([#255](https://github.com/MTSWebServices/onetl/pull/255), [#283](https://github.com/MTSWebServices/onetl/pull/283)).
+
+ This brings up Spark 3.5 support.
+
+- Update `XML` package to latest version: `0.17.0` → `0.18.0` ([#259](https://github.com/MTSWebServices/onetl/pull/259)).
+
+ This brings few bugfixes with datetime format handling.
+
+- For JDBC connections add new `SQLOptions` class for `DB.sql(query, options=...)` method ([#272](https://github.com/MTSWebServices/onetl/pull/272)).
+
+ Firstly, to keep naming more consistent.
+
+ Secondly, some of options are not supported by `DB.sql(...)` method, but supported by `DBReader`.
+ For example, `SQLOptions` do not support `partitioning_mode` and require explicit definition of `lower_bound` and `upper_bound` when `num_partitions` is greater than 1.
+ `ReadOptions` does support `partitioning_mode` and allows skipping `lower_bound` and `upper_bound` values.
+
+ This requires some code changes. Before:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+
+ df = postgres.sql(
+ """
+ SELECT *
+ FROM some.mytable
+ WHERE key = 'something'
+ """,
+ options=Postgres.ReadOptions(
+ partitioning_mode="range",
+ partition_column="id",
+ num_partitions=10,
+ ),
+ )
+ ```
+
+ After:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+ df = postgres.sql(
+ """
+ SELECT *
+ FROM some.mytable
+ WHERE key = 'something'
+ """,
+ options=Postgres.SQLOptions(
+ # partitioning_mode is not supported!
+ partition_column="id",
+ num_partitions=10,
+ lower_bound=0, # <-- set explicitly
+ upper_bound=1000, # <-- set explicitly
+ ),
+ )
+ ```
+
+ For now, `DB.sql(query, options=...)` can accept `ReadOptions` to keep backward compatibility, but emits deprecation warning.
+ The support will be removed in `v1.0.0`.
+
+- Split up `JDBCOptions` class into `FetchOptions` and `ExecuteOptions` ([#274](https://github.com/MTSWebServices/onetl/pull/274)).
+
+ New classes are used by `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` methods respectively.
+ This is mostly to keep naming more consistent.
+
+ This requires some code changes. Before:
+
+ ```python
+ from onetl.connection import Postgres
+
+ postgres = Postgres(...)
+ df = postgres.fetch(
+ "SELECT * FROM some.mytable WHERE key = 'something'",
+ options=Postgres.JDBCOptions(
+ fetchsize=1000,
+ query_timeout=30,
+ ),
+ )
+
+ postgres.execute(
+ "UPDATE some.mytable SET value = 'new' WHERE key = 'something'",
+ options=Postgres.JDBCOptions(query_timeout=30),
+ )
+ ```
+
+ After:
+
+ ```python
+ from onetl.connection import Postgres
+
+ # Using FetchOptions for fetching data
+ postgres = Postgres(...)
+ df = postgres.fetch( + "SELECT * FROM some.mytable WHERE key = 'something'", + options=Postgres.FetchOptions( # <-- change class name + fetchsize=1000, + query_timeout=30, + ), + ) + + # Using ExecuteOptions for executing statements + postgres.execute( + "UPDATE some.mytable SET value = 'new' WHERE key = 'something'", + options=Postgres.ExecuteOptions(query_timeout=30), # <-- change class name + ) + ``` + + For now, `DB.fetch(query, options=...)` and `DB.execute(query, options=...)` can accept `JDBCOptions`, to keep backward compatibility, + but emit a deprecation warning. The old class will be removed in `v1.0.0`. + +- Serialize `ColumnDatetimeHWM` to Clickhouse's `DateTime64(6)` (precision up to microseconds) instead of `DateTime` (precision up to seconds) ([#267](https://github.com/MTSWebServices/onetl/pull/267)). + + In previous onETL versions, `ColumnDatetimeHWM` value was rounded to the second, and thus reading some rows that were read in previous runs, + producing duplicates. + + For Clickhouse versions below 21.1 comparing column of type `DateTime` with a value of type `DateTime64` is not supported, returning an empty dataframe. + To avoid this, replace: + + ```python + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="hwm_column", # <-- + ), + ) + ``` + + with: + + ```python + DBReader( + ..., + hwm=DBReader.AutoDetectHWM( + name="my_hwm", + expression="CAST(hwm_column AS DateTime64)", # <-- add explicit CAST + ), + ) + ``` + +- Pass JDBC connection extra params as `properties` dict instead of URL with query part ([#268](https://github.com/MTSWebServices/onetl/pull/268)). + + This allows passing custom connection parameters like `Clickhouse(extra={"custom_http_options": "option1=value1,option2=value2"})` + without need to apply urlencode to parameter value, like `option1%3Dvalue1%2Coption2%3Dvalue2`. 
+ +## Features { #DBR-onetl-changelog-0-11-0-features } + +Improve user experience with Kafka messages and Database tables with serialized columns, like JSON/XML. + +- Allow passing custom package version as argument for `DB.get_packages(...)` method of several DB connectors: + - `Clickhouse.get_packages(package_version=..., apache_http_client_version=...)` ([#249](https://github.com/MTSWebServices/onetl/pull/249)). + - `MongoDB.get_packages(scala_version=..., spark_version=..., package_version=...)` ([#255](https://github.com/MTSWebServices/onetl/pull/255)). + - `MySQL.get_packages(package_version=...)` ([#253](https://github.com/MTSWebServices/onetl/pull/253)). + - `MSSQL.get_packages(java_version=..., package_version=...)` ([#254](https://github.com/MTSWebServices/onetl/pull/254)). + - `Oracle.get_packages(java_version=..., package_version=...)` ([#252](https://github.com/MTSWebServices/onetl/pull/252)). + - `Postgres.get_packages(package_version=...)` ([#251](https://github.com/MTSWebServices/onetl/pull/251)). + - `Teradata.get_packages(package_version=...)` ([#256](https://github.com/MTSWebServices/onetl/pull/256)). + Now users can downgrade or upgrade connection without waiting for next onETL release. Previously only `Kafka` and `Greenplum` supported this feature. +- Add `FileFormat.parse_column(...)` method to several classes: + - `Avro.parse_column(col)` ([#265](https://github.com/MTSWebServices/onetl/pull/265)). + - `JSON.parse_column(col, schema=...)` ([#257](https://github.com/MTSWebServices/onetl/pull/257)). + - `CSV.parse_column(col, schema=...)` ([#258](https://github.com/MTSWebServices/onetl/pull/258)). + - `XML.parse_column(col, schema=...)` ([#269](https://github.com/MTSWebServices/onetl/pull/269)). + This allows parsing data in `value` field of Kafka message or string/binary column of some table as a nested Spark structure. 
+- Add `FileFormat.serialize_column(...)` method to several classes: + - `Avro.serialize_column(col)` ([#265](https://github.com/MTSWebServices/onetl/pull/265)). + - `JSON.serialize_column(col)` ([#257](https://github.com/MTSWebServices/onetl/pull/257)). + - `CSV.serialize_column(col)` ([#258](https://github.com/MTSWebServices/onetl/pull/258)). + This allows saving Spark nested structures or arrays to `value` field of Kafka message or string/binary column of some table. + +## Improvements { #DBR-onetl-changelog-0-11-0-improvements } + +Few documentation improvements. + +- Replace all `assert` in documentation with doctest syntax. This should make documentation more readable ([#273](https://github.com/MTSWebServices/onetl/pull/273)). +- Add generic `Troubleshooting` guide ([#275](https://github.com/MTSWebServices/onetl/pull/275)). +- Improve Kafka documentation: + - Add "Prerequisites" page describing different aspects of connecting to Kafka. + - Improve "Reading from" and "Writing to" page of Kafka documentation, add more examples and usage notes. + - Add "Troubleshooting" page ([#276](https://github.com/MTSWebServices/onetl/pull/276)). +- Improve Hive documentation: + - Add "Prerequisites" page describing different aspects of connecting to Hive. + - Improve "Reading from" and "Writing to" page of Hive documentation, add more examples and recommendations. + - Improve "Executing statements in Hive" page of Hive documentation. ([#278](https://github.com/MTSWebServices/onetl/pull/278)). +- Add "Prerequisites" page describing different aspects of using SparkHDFS and SparkS3 connectors. ([#279](https://github.com/MTSWebServices/onetl/pull/279)). +- Add note about connecting to Clickhouse cluster. ([#280](https://github.com/MTSWebServices/onetl/pull/280)). +- Add notes about versions when specific class/method/attribute/argument was added, renamed or changed behavior ([#282](https://github.com/MTSWebServices/onetl/pull/282)). 
+ +## Bug Fixes { #DBR-onetl-changelog-0-11-0-bug-fixes } + +- Fix missing `pysmb` package after installing `pip install onetl[files]` . diff --git a/mddocs/docs/changelog/0.11.1.md b/mddocs/docs/changelog/0.11.1.md new file mode 100644 index 000000000..59d72a7b0 --- /dev/null +++ b/mddocs/docs/changelog/0.11.1.md @@ -0,0 +1,9 @@ +# 0.11.1 (2024-05-29) { #DBR-onetl-changelog-0-11-1 } + +## Features { #DBR-onetl-changelog-0-11-1-features } + +- Change `MSSQL.port` default from `1433` to `None`, allowing use of `instanceName` to detect port number. ([#287](https://github.com/MTSWebServices/onetl/pull/287)) + +## Bug Fixes { #DBR-onetl-changelog-0-11-1-bug-fixes } + +- Remove `fetchsize` from `JDBC.WriteOptions`. ([#288](https://github.com/MTSWebServices/onetl/pull/288)) diff --git a/mddocs/docs/changelog/0.11.2.md b/mddocs/docs/changelog/0.11.2.md new file mode 100644 index 000000000..dcacef9f7 --- /dev/null +++ b/mddocs/docs/changelog/0.11.2.md @@ -0,0 +1,5 @@ +# 0.11.2 (2024-09-02) { #DBR-onetl-changelog-0-11-2 } + +## Bug Fixes { #DBR-onetl-changelog-0-11-2-bug-fixes } + +- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. 
([#308](https://github.com/MTSWebServices/onetl/pull/308))
diff --git a/mddocs/docs/changelog/0.12.0.md b/mddocs/docs/changelog/0.12.0.md
new file mode 100644
index 000000000..f26bc4505
--- /dev/null
+++ b/mddocs/docs/changelog/0.12.0.md
@@ -0,0 +1,54 @@
+# 0.12.0 (2024-09-03) { #DBR-onetl-changelog-0-12-0 }
+
+## Breaking Changes { #DBR-onetl-changelog-0-12-0-breaking-changes }
+
+- Change connection URL used for generating HWM names of S3 and Samba sources:
+ - `smb://host:port` -> `smb://host:port/share`
+ - `s3://host:port` -> `s3://host:port/bucket` ([#304](https://github.com/MTSWebServices/onetl/pull/304))
+- Update DB connectors/drivers to latest versions:
+ - Clickhouse `0.6.0-patch5` → `0.6.5`
+ - MongoDB `10.3.0` → `10.4.0`
+ - MSSQL `12.6.2` → `12.8.1`
+ - MySQL `8.4.0` → `9.0.0`
+ - Oracle `23.4.0.24.05` → `23.5.0.24.07`
+ - Postgres `42.7.3` → `42.7.4`
+- Update `Excel` package from `0.20.3` to `0.20.4`, to include Spark 3.5.1 support. ([#306](https://github.com/MTSWebServices/onetl/pull/306))
+
+## Features { #DBR-onetl-changelog-0-12-0-features }
+
+- Add support for specifying file formats (`ORC`, `Parquet`, `CSV`, etc.) in `HiveWriteOptions.format` ([#292](https://github.com/MTSWebServices/onetl/pull/292)):
+
+ ```python
+ Hive.WriteOptions(format=ORC(compression="snappy"))
+ ```
+
+- Collect Spark execution metrics in following methods, and log them in DEBUG mode:
+ - `DBWriter.run()`
+ - `FileDFWriter.run()`
+ - `Hive.sql()`
+ - `Hive.execute()`
+
+ This is implemented using custom `SparkListener` which wraps the entire method call, and
+ then reports collected metrics. But these metrics sometimes may be missing due to Spark architecture,
+ so they are not reliable source of information. That's why logs are printed only in DEBUG mode, and
+ are not returned as method call result. ([#303](https://github.com/MTSWebServices/onetl/pull/303))
+
+- Generate default `jobDescription` based on currently executed method.
Examples: + - `DBWriter.run(schema.table) -> Postgres[host:5432/database]` + - `MongoDB[localhost:27017/admin] -> DBReader.has_data(mycollection)` + - `Hive[cluster].execute()` + + If user already set custom `jobDescription`, it will left intact. ([#304](https://github.com/MTSWebServices/onetl/pull/304)) + +- Add log.info about JDBC dialect usage ([#305](https://github.com/MTSWebServices/onetl/pull/305)): + + ```text + |MySQL| Detected dialect: 'org.apache.spark.sql.jdbc.MySQLDialect' + ``` + +- Log estimated size of in-memory dataframe created by `JDBC.fetch` and `JDBC.execute` methods. ([#303](https://github.com/MTSWebServices/onetl/pull/303)) + +## Bug Fixes { #DBR-onetl-changelog-0-12-0-bug-fixes } + +- Fix passing `Greenplum(extra={"options": ...})` during read/write operations. ([#308](https://github.com/MTSWebServices/onetl/pull/308)) +- Do not raise exception if yield-based hook whas something past (and only one) `yield`. diff --git a/mddocs/docs/changelog/0.12.1.md b/mddocs/docs/changelog/0.12.1.md new file mode 100644 index 000000000..f3126477c --- /dev/null +++ b/mddocs/docs/changelog/0.12.1.md @@ -0,0 +1,23 @@ +# 0.12.1 (2024-10-28) { #DBR-onetl-changelog-0-12-1 } + +## Features { #DBR-onetl-changelog-0-12-1-features } + +- Log detected JDBC dialect while using `DBWriter`. + +## Bug Fixes { #DBR-onetl-changelog-0-12-1-bug-fixes } + +- Fix `SparkMetricsRecorder` failing when receiving + `SparkListenerTaskEnd` without `taskMetrics` (e.g. executor was + killed by OOM). ([#313](https://github.com/MTSWebServices/onetl/pull/313)) +- Call `kinit` before checking for HDFS active namenode. +- Wrap `kinit` with `threading.Lock` to avoid multithreading issues. +- Immediately show `kinit` errors to user, instead of hiding them. +- Use `AttributeError` instead of `ImportError` in module's + `__getattr__` method, to make code compliant with Python spec. 
+ +## Doc only Changes { #DBR-onetl-changelog-0-12-1-doc-only-changes } + +- Add note about + [spark-dialect-extension](https://github.com/MTSWebServices/spark-dialect-extension) + package to Clickhouse connector documentation. + ([#310](https://github.com/MTSWebServices/onetl/pull/310)) diff --git a/mddocs/docs/changelog/0.12.2.md b/mddocs/docs/changelog/0.12.2.md new file mode 100644 index 000000000..2391c32ad --- /dev/null +++ b/mddocs/docs/changelog/0.12.2.md @@ -0,0 +1,22 @@ +# 0.12.2 (2024-11-12) { #DBR-onetl-changelog-0-12-2 } + +## Improvements { #DBR-onetl-changelog-0-12-2-improvements } + +- Change Spark `jobDescription` for DBReader & FileDFReader from + `DBReader.run() -> Connection` to `Connection -> DBReader.run()`. + +## Bug Fixes { #DBR-onetl-changelog-0-12-2-bug-fixes } + +- Fix `log_hwm` result for `KeyValueIntHWM` (used by Kafka). + ([#316](https://github.com/MTSWebServices/onetl/pull/316)) +- Fix `log_collection` hiding values of `Kafka.addresses` in logs with + `INFO` level. ([#316](https://github.com/MTSWebServices/onetl/pull/316)) + +## Dependencies { #DBR-onetl-changelog-0-12-2-dependencies } + +- Allow using + [etl-entities==2.4.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.4.0). + +## Doc only Changes { #DBR-onetl-changelog-0-12-2-doc-only-changes } + +- Fix links to MSSQL date & time type documentation. diff --git a/mddocs/docs/changelog/0.12.3.md b/mddocs/docs/changelog/0.12.3.md new file mode 100644 index 000000000..02c5c07ab --- /dev/null +++ b/mddocs/docs/changelog/0.12.3.md @@ -0,0 +1,5 @@ +# 0.12.3 (2024-11-22) { #DBR-onetl-changelog-0-12-3 } + +## Bug Fixes { #DBR-onetl-changelog-0-12-3-bug-fixes } + +- Allow passing table names in format `schema."table.with.dots"` to `DBReader(source=...)` and `DBWriter(target=...)`. 
diff --git a/mddocs/docs/changelog/0.12.4.md b/mddocs/docs/changelog/0.12.4.md new file mode 100644 index 000000000..7489d87b9 --- /dev/null +++ b/mddocs/docs/changelog/0.12.4.md @@ -0,0 +1,5 @@ +# 0.12.4 (2024-11-27) { #DBR-onetl-changelog-0-12-4 } + +## Bug Fixes { #DBR-onetl-changelog-0-12-4-bug-fixes } + +- Fix `DBReader(conn=oracle, options={"partitioning_mode": "hash"})` lead to data skew in last partition due to wrong `ora_hash` usage. ([#319](https://github.com/MTSWebServices/onetl/pull/319)) diff --git a/mddocs/docs/changelog/0.12.5.md b/mddocs/docs/changelog/0.12.5.md new file mode 100644 index 000000000..62d50c5a1 --- /dev/null +++ b/mddocs/docs/changelog/0.12.5.md @@ -0,0 +1,13 @@ +# 0.12.5 (2024-12-03) { #DBR-onetl-changelog-0-12-5 } + +## Improvements { #DBR-onetl-changelog-0-12-5-improvements } + +- Use `sipHash64` instead of `md5` in Clickhouse for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster. +- Use `hashtext` instead of `md5` in Postgres for reading data with `{"partitioning_mode": "hash"}`, as it is 3-5 times faster. +- Use `BINARY_CHECKSUM` instead of `HASHBYTES` in MSSQL for reading data with `{"partitioning_mode": "hash"}`, as it is 5 times faster. + +## Bug Fixes { #DBR-onetl-changelog-0-12-5-bug-fixes } + +- In JDBC sources wrap `MOD(partitionColumn, numPartitions)` with `ABS(...)` to make all returned values positive. This prevents data skew. +- Fix reading table data from MSSQL using `{"partitioning_mode": "hash"}` with `partitionColumn` of integer type. +- Fix reading table data from Postgres using `{"partitioning_mode": "hash"}` lead to data skew (all the data was read into one Spark partition). 
diff --git a/mddocs/docs/changelog/0.13.0.md b/mddocs/docs/changelog/0.13.0.md new file mode 100644 index 000000000..9d217c8a7 --- /dev/null +++ b/mddocs/docs/changelog/0.13.0.md @@ -0,0 +1,273 @@ +# 0.13.0 (2025-02-24) { #DBR-onetl-changelog-0-13-0 } + +🎉 3 years since first release 0.1.0 🎉 + +## Breaking Changes { #DBR-onetl-changelog-0-13-0-breaking-changes } + +- Add Python 3.13. support. ([#298](https://github.com/MTSWebServices/onetl/pull/298)) + +- Change the logic of `FileConnection.walk` and + `FileConnection.list_dir`. ([#327](https://github.com/MTSWebServices/onetl/pull/327)) + + Previously `limits.stops_at(path) == True` considered as \"return + current file and stop\", and could lead to exceeding some limit. Not + it means \"stop immediately\". + +- Change default value for `FileDFWriter.Options(if_exists=...)` from + `error` to `append`, to make it consistent with other `.Options()` + classes within onETL. ([#343](https://github.com/MTSWebServices/onetl/pull/343)) + +## Features { #DBR-onetl-changelog-0-13-0-features } + +- Add support for `FileModifiedTimeHWM` HWM class (see [etl-entities + 2.5.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.5.0)): + + ```python + from etl_entitites.hwm import FileModifiedTimeHWM + from onetl.file import FileDownloader + from onetl.strategy import IncrementalStrategy + + downloader = FileDownloader( + ..., + hwm=FileModifiedTimeHWM(name="somename"), + ) + + with IncrementalStrategy(): + downloader.run() + ``` + +- Introduce `FileSizeRange(min=..., max=...)` filter class. + ([#325](https://github.com/MTSWebServices/onetl/pull/325)) + + Now users can set `FileDownloader` / `FileMover` to download/move + only files with specific file size range: + + ```python + from onetl.file import FileDownloader + from onetl.file.filter import FileSizeRange + + downloader = FileDownloader( + ..., + filters=[FileSizeRange(min="10KiB", max="1GiB")], + ) + ``` + +- Introduce `TotalFilesSize(...)` limit class. 
+ ([#326](https://github.com/MTSWebServices/onetl/pull/326)) + + Now users can set `FileDownloader` / `FileMover` to stop + downloading/moving files after reaching a certain amount of data: + + ```python + from datetime import datetime, timedelta + from onetl.file import FileDownloader + from onetl.file.limit import TotalFilesSize + + downloader = FileDownloader( + ..., + limits=[TotalFilesSize("1GiB")], + ) + ``` + +- Implement `FileModifiedTime(since=..., until=...)` file filter. + ([#330](https://github.com/MTSWebServices/onetl/pull/330)) + + Now users can set `FileDownloader` / `FileMover` to download/move + only files with specific file modification time: + + ```python + from datetime import datetime, timedelta + from onetl.file import FileDownloader + from onetl.file.filter import FileModifiedTime + + downloader = FileDownloader( + ..., + filters=[FileModifiedTime(before=datetime.now() - timedelta(hours=1))], + ) + ``` + +- Add `SparkS3.get_exclude_packages()` and + `Kafka.get_exclude_packages()` methods. ([#341](https://github.com/MTSWebServices/onetl/pull/341)) + + Using them allows to skip downloading dependencies not required by + this specific connector, or which are already a part of + Spark/PySpark: + + ```python + from onetl.connection import SparkS3, Kafka + + maven_packages = [ + *SparkS3.get_packages(spark_version="3.5.4"), + *Kafka.get_packages(spark_version="3.5.4"), + ] + exclude_packages = SparkS3.get_exclude_packages() + Kafka.get_exclude_packages() + spark = ( + SparkSession.builder.appName("spark_app_onetl_demo") + .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(exclude_packages)) + .getOrCreate() + ) + ``` + +## Improvements { #DBR-onetl-changelog-0-13-0-improvements } + +- All DB connections opened by `JDBC.fetch(...)`, `JDBC.execute(...)` + or `JDBC.check()` are immediately closed after the statements is + executed. 
([#334](https://github.com/MTSWebServices/onetl/pull/334)) + + Previously Spark session with `master=local[3]` actually opened up + to 5 connections to target DB - one for `JDBC.check()`, another for + Spark driver interaction with DB to create tables, and one for each + Spark executor. Now only max 4 connections are opened, as + `JDBC.check()` does not hold opened connection. + + This is important for RDBMS like Postgres or Greenplum where number + of connections is strictly limited and limit is usually quite low. + +- Set up `ApplicationName` (client info) for Clickhouse, MongoDB, + MSSQL, MySQL and Oracle. ([#339](https://github.com/MTSWebServices/onetl/pull/339), + [#248](https://github.com/MTSWebServices/onetl/pull/248)) + + Also update `ApplicationName` format for Greenplum, Postgres, Kafka + and SparkS3. Now all connectors have the same `ApplicationName` + format: + `${spark.applicationId} ${spark.appName} onETL/${onetl.version} Spark/${spark.version}` + + The only connections not sending `ApplicationName` are Teradata and + FileConnection implementations. + +- Now `DB.check()` will test connection availability not only on Spark + driver, but also from some Spark executor. ([#346](https://github.com/MTSWebServices/onetl/pull/346)) + + This allows to fail immediately if Spark driver host has network + access to target DB, but Spark executors have not. + + !!! note + + Now `Greenplum.check()` requires the same user grants as + `DBReader(connection=greenplum)`: + + ``` sql + -- yes, "writable" for reading data from GP, it's not a mistake + ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- for both reading and writing to GP + -- ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + ``` + + Please ask your Greenplum administrators to provide these grants. 
 + +## Bug Fixes { #DBR-onetl-changelog-0-13-0-bug-fixes } + +- Avoid suppressing Hive Metastore errors while using `DBWriter`. + ([#329](https://github.com/MTSWebServices/onetl/pull/329)) + + Previously this was implemented as: + + ```python + try: + spark.sql(f"SELECT * FROM {table}") + table_exists = True + except Exception: + table_exists = False + ``` + + If Hive Metastore was overloaded and responded with an exception, it + was considered as non-existing table, resulting in full table + override instead of append or override only partitions subset. + +- Fix using onETL to write data to PostgreSQL or Greenplum instances + behind *pgbouncer* with `pool_mode=transaction`. + ([#336](https://github.com/MTSWebServices/onetl/pull/336)) + + Previously `Postgres.check()` opened a read-only transaction, + pgbouncer changed the entire connection type from read-write to + read-only, and when `DBWriter.run(df)` executed in read-only + connection, producing errors like: + + ``` + org.postgresql.util.PSQLException: ERROR: cannot execute INSERT in a read-only transaction + org.postgresql.util.PSQLException: ERROR: cannot execute TRUNCATE TABLE in a read-only transaction + ``` + + Added a workaround by passing `readOnly=True` to JDBC params for + read-only connections, so pgbouncer may distinguish read-only and + read-write connections properly. + + After upgrading onETL 0.13.x or higher the same error still may + appear if pgbouncer still holds read-only connections and returns + them for DBWriter. To fix this, the user can manually convert read-only + connection to read-write: + + ```python + postgres.execute("BEGIN READ WRITE;") # <-- add this line + DBWriter(...).run() + ``` + + After all connections in pgbouncer pool were converted from + read-only to read-write, and error fixed, this additional line could + be removed. + + See [Postgres JDBC driver + documentation](https://jdbc.postgresql.org/documentation/use/). 
 + +- Fix `MSSQL.fetch(...)` and `MySQL.fetch(...)` opened a read-write + connection instead of read-only. ([#337](https://github.com/MTSWebServices/onetl/pull/337)) + + Now this is fixed: + + - `MSSQL.fetch(...)` establishes connection with `ApplicationIntent=ReadOnly`. + - `MySQL.fetch(...)` calls `SET SESSION TRANSACTION READ ONLY` statement. + +- Fixed passing multiple filters to `FileDownloader` and `FileMover`. + ([#338](https://github.com/MTSWebServices/onetl/pull/338)) It was caused by + sorting filters list in internal logging method, but `FileFilter` + subclasses are not sortable. + +- Fix a false warning about a lot of parallel connections to Greenplum. + ([#342](https://github.com/MTSWebServices/onetl/pull/342)) + + Creating Spark session with `.master("local[5]")` may open up to 6 + connections to Greenplum (=number of Spark executors + 1 for + driver), but onETL instead used number of *CPU cores* on the host as + a number of parallel connections. + + This led to showing a false warning that number of Greenplum + connections is too high, which actually should be the case only if + number of executors is higher than 30. + +- Fix MongoDB trying to use current database name as `authSource`. + ([#347](https://github.com/MTSWebServices/onetl/pull/347)) + + Use default connector value which is `admin` database. Previous + onETL versions could be fixed by: + + ```python + from onetl.connection import MongoDB + + mongodb = MongoDB( + ..., + database="mydb", + extra={ + "authSource": "admin", + }, + ) + ``` + +## Dependencies { #DBR-onetl-changelog-0-13-0-dependencies } + +- Minimal `etl-entities` version is now + [2.5.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.5.0). 
+ ([#331](https://github.com/MTSWebServices/onetl/pull/331)) + +- Update DB connectors/drivers to latest versions: ([#345](https://github.com/MTSWebServices/onetl/pull/345)) + + - Clickhouse `0.6.5` → `0.7.2` + - MongoDB `10.4.0` → `10.4.1` + - MySQL `9.0.0` → `9.2.0` + - Oracle `23.5.0.24.07` → `23.7.0.25.01` + - Postgres `42.7.4` → `42.7.5` + +## Doc only Changes { #DBR-onetl-changelog-0-13-0-doc-only-changes } + +- Split large code examples to tabs. ([#344](https://github.com/MTSWebServices/onetl/pull/344)) diff --git a/mddocs/docs/changelog/0.13.1.md b/mddocs/docs/changelog/0.13.1.md new file mode 100644 index 000000000..b025397f5 --- /dev/null +++ b/mddocs/docs/changelog/0.13.1.md @@ -0,0 +1,9 @@ +# 0.13.1 (2025-03-06) { #DBR-onetl-changelog-0-13-1 } + +## Bug Fixes { #DBR-onetl-changelog-0-13-1-bug-fixes } + +In 0.13.0, using `DBWriter(connection=hive, target="SOMEDB.SOMETABLE")` lead to executing `df.write.saveAsTable()` +instead of `df.write.insertInto()` if target table `somedb.sometable` already exist. + +This is caused by table name normalization (Hive uses lower-case names), which wasn't properly handled by method used for checking table existence. +([#350](https://github.com/MTSWebServices/onetl/pull/350)) diff --git a/mddocs/docs/changelog/0.13.3.md b/mddocs/docs/changelog/0.13.3.md new file mode 100644 index 000000000..7a17b10d8 --- /dev/null +++ b/mddocs/docs/changelog/0.13.3.md @@ -0,0 +1,6 @@ +# 0.13.3 (2025-03-11) { #DBR-onetl-changelog-0-13-3 } + +## Dependencies { #DBR-onetl-changelog-0-13-3-dependencies } + +Allow using [etl-entities +2.6.0](https://github.com/MTSWebServices/etl-entities/releases/tag/2.6.0). 
diff --git a/mddocs/docs/changelog/0.13.4.md b/mddocs/docs/changelog/0.13.4.md new file mode 100644 index 000000000..a1f0f4478 --- /dev/null +++ b/mddocs/docs/changelog/0.13.4.md @@ -0,0 +1,10 @@ +# 0.13.4 (2025-03-20) { #DBR-onetl-changelog-0-13-4 } + +## Doc only Changes { #DBR-onetl-changelog-0-13-4-doc-only-changes } + +- Prefer `ReadOptions(partitionColumn=..., numPartitions=..., queryTimeout=...)` + instead of `ReadOptions(partition_column=..., num_partitions=..., query_timeout=...)`, + to match Spark documentation. ([#352](https://github.com/MTSWebServices/onetl/pull/352)) +- Prefer `WriteOptions(if_exists=...)` instead of `WriteOptions(mode=...)` for IDE suggestions. ([#354](https://github.com/MTSWebServices/onetl/pull/354)) +- Document all options of supported file formats. + ([#355](https://github.com/MTSWebServices/onetl/pull/355), [#356](https://github.com/MTSWebServices/onetl/pull/356), [#357](https://github.com/MTSWebServices/onetl/pull/357), [#358](https://github.com/MTSWebServices/onetl/pull/358), [#359](https://github.com/MTSWebServices/onetl/pull/359), [#360](https://github.com/MTSWebServices/onetl/pull/360), [#361](https://github.com/MTSWebServices/onetl/pull/361), [#362](https://github.com/MTSWebServices/onetl/pull/362)) diff --git a/mddocs/docs/changelog/0.13.5.md b/mddocs/docs/changelog/0.13.5.md new file mode 100644 index 000000000..a464db4e1 --- /dev/null +++ b/mddocs/docs/changelog/0.13.5.md @@ -0,0 +1,11 @@ +# 0.13.5 (2025-04-14) { #DBR-onetl-changelog-0-13-5 } + +## Bug Fixes { #DBR-onetl-changelog-0-13-5-bug-fixes } + +0.13.0 changed the way `Greenplum.check()` is implemented - it began +checking DB availability from both Spark driver and executor. But due to a +misspelling, `SELECT` queries were emitted from all available executors. +This led to opening too many connections to Greenplum, which was +unexpected. + +Now only one Spark executor is used to run `Greenplum.check()`. 
diff --git a/mddocs/docs/changelog/0.14.0.md b/mddocs/docs/changelog/0.14.0.md new file mode 100644 index 000000000..31f0bbe84 --- /dev/null +++ b/mddocs/docs/changelog/0.14.0.md @@ -0,0 +1,43 @@ +# 0.14.0 (2025-09-08) { #DBR-onetl-changelog-0-14-0 } + +## Breaking Changes { #DBR-onetl-changelog-0-14-0-breaking-changes } + +- Drop Spark 2 support. Minimal supported Spark version is 3.2. + ([#383](https://github.com/MTSWebServices/onetl/pull/383)) + + Also dropped: + + - `Greenplum.package_spark_2_3` + - `Greenplum.package_spark_2_4` + +- Update DB connectors/drivers to latest versions: + + - MongoDB `10.4.1` → `10.5.0` + - MySQL `9.2.0` → `9.4.0` + - MSSQL `12.8.10` → `13.2.0` + - Oracle `23.7.0.25.01` → `23.9.0.25.07` + - Postgres `42.7.5` → `42.7.7` + +- Update Excel package name from `com.crealytics:spark-excel` to + `dev.mauch:spark-excel`. ([#382](https://github.com/MTSWebServices/onetl/pull/382)) + +- Now `Excel.get_packages(package_version=...)` parameter is + mandatory. ([#382](https://github.com/MTSWebServices/onetl/pull/382)) + +- Return full file/directory path from `FileConnection.list_dir` and + `FileConnection.walk`. ([#381](https://github.com/MTSWebServices/onetl/pull/381)) + Previously these methods returned only file names. + +## Features { #DBR-onetl-changelog-0-14-0-features } + +- Add Spark 4.0 support. ([#297](https://github.com/MTSWebServices/onetl/pull/297)) +- Add `Iceberg` connection support. For now this is alpha version, and + behavior may change in future. ([#378](https://github.com/MTSWebServices/onetl/pull/378), + [#386](https://github.com/MTSWebServices/onetl/pull/386)) +- Treat S3 objects with names ending with a `/` slash as directory + marker. ([#379](https://github.com/MTSWebServices/onetl/pull/379)) + +## Improvements { #DBR-onetl-changelog-0-14-0-improvements } + +- Speed up removing S3 and Samba directories with `recursive=True`. 
+ ([#380](https://github.com/MTSWebServices/onetl/pull/380)) diff --git a/mddocs/docs/changelog/0.14.1.md b/mddocs/docs/changelog/0.14.1.md new file mode 100644 index 000000000..c3012a94f --- /dev/null +++ b/mddocs/docs/changelog/0.14.1.md @@ -0,0 +1,17 @@ +# 0.14.1 (2025-11-25) { #DBR-onetl-changelog-0-14-1 } + +## Dependencies { #DBR-onetl-changelog-0-14-1-dependencies } + +Release [minio==7.2.19](https://github.com/minio/minio-py/issues/1536) +lead to broken `S3` connector with errors like these: + +``` +TypeError: Minio.fget_object() takes 1 positional argument but 3 were given +TypeError: Minio.fput_object() takes 1 positional argument but 3 were given +``` + +Fixed. + +Added limit `minio<8.0` to avoid [breaking +things](https://github.com/minio/minio-py/pull/1530) in next major +release. diff --git a/mddocs/docs/changelog/0.15.0.md b/mddocs/docs/changelog/0.15.0.md new file mode 100644 index 000000000..f70bd5686 --- /dev/null +++ b/mddocs/docs/changelog/0.15.0.md @@ -0,0 +1,173 @@ +# 0.15.0 (2025-12-08) { #DBR-onetl-changelog-0-15-0 } + +## Removals { #DBR-onetl-changelog-0-15-0-removals } + +Drop `Teradata` connector. It is not used in our company anymore, and +never had proper integration tests. + +## Breaking Changes { #DBR-onetl-changelog-0-15-0-breaking-changes } + +Add `Iceberg(catalog=..., warehouse=...)` mandatory options +([#391](https://github.com/MTSWebServices/onetl/pull/391), +[#393](https://github.com/MTSWebServices/onetl/pull/393), +[#394](https://github.com/MTSWebServices/onetl/pull/394), +[#397](https://github.com/MTSWebServices/onetl/pull/397), +[#399](https://github.com/MTSWebServices/onetl/pull/399), +[#413](https://github.com/MTSWebServices/onetl/pull/413)). 
+ +In 0.14.0 we've implemented very basic `Iceberg` connector configured +via dictionary: + +``` python +iceberg = Iceberg( + catalog_name="mycatalog", + extra={ + "type": "rest", + "uri": "https://catalog.company.com/rest", + "rest.auth.type": "oauth2", + "token": "jwt_token", + "warehouse": "s3a://mybucket/", + "io-impl": "org.apache.iceberg.aws.s3.S3FileIO", + "s3.endpoint": "http://localhost:9010", + "s3.access-key-id": "access_key", + "s3.secret-access-key": "secret_key", + "s3.path-style-access": "true", + "client.region": "us-east-1", + }, + spark=spark, +) +``` + +Now we've implemented wrapper classes allowing to configure various +Iceberg catalogs: + +```python title="REST Catalog with Bearer token auth" +iceberg = Iceberg( + catalog_name="mycatalog", + catalog=Iceberg.RESTCatalog( + url="https://catalog.company.com/rest", + auth=Iceberg.RESTCatalog.BearerAuth( + access_token="jwt_token", + ), + ), + warehouse=..., +) +``` + +```python title="REST Catalog with OAuth2 ClientCredentials auth" +iceberg = Iceberg( + catalog_name="mycatalog", + catalog=Iceberg.RESTCatalog( + url="https://catalog.company.com/rest", + auth=Iceberg.RESTCatalog.OAuth2ClientCredentials( + client_id="my_client", + client_secret="my_secret", + oauth2_token_endpoint="http://keycloak.company.com/realms/my-realm/protocol/openid-connect/token", + scopes=["catalog"], + ), + ), + warehouse=..., + spark=spark, +) +``` + +And also set of classes to configure for warehouses: + +```python title="S3 warehouse" +iceberg = Iceberg( + catalog_name="mycatalog", + catalog=..., + # using Iceberg AWS integration + warehouse=Iceberg.S3Warehouse( + path="/", + bucket="mybucket", + host="localhost", + port=9010, + protocol="http", + path_style_access=True, + access_key="access_key", + secret_key="secret_key", + region="us-east-1", + ), + spark=spark, +) +``` + +```python title="For Lakekeeper, Polaris, Gravitino" +iceberg = Iceberg( + catalog_name="mycatalog", + catalog=..., + # Delegate warehouse config 
to REST Catalog + warehouse=Iceberg.DelegatedWarehouse( + warehouse="some-warehouse", + access_delegation="vended-credentials", + ), + spark=spark, +) +``` + +```python title="HDFS warehouse" +iceberg = Iceberg( + catalog_name="mycatalog", + # store both data and metadata on HadoopFilesystem + catalog=Iceberg.FilesystemCatalog(), + warehouse=Iceberg.FilesystemWarehouse( + path="/some/warehouse", + connection=SparkHDFS(cluster="dwh"), + ), + spark=spark, +) +``` + +Having classes instead of dicts brings IDE autocompletion, and allows to +reuse the same catalog connection options for multiple warehouses. + +## Features { #DBR-onetl-changelog-0-15-0-features } + +- Added support for `Iceberg.WriteOptions(table_properties={})` + ([#401](https://github.com/MTSWebServices/onetl/pull/401)). + + In particular, table's `"location": "/some/warehouse/mytable"` can + be set now. + +- Added support for `Hive.WriteOptions(table_properties={})` + ([#412](https://github.com/MTSWebServices/onetl/pull/412)). + + In particular, table's `"auto.purge": "true"` can be set now. + +## Improvements { #DBR-onetl-changelog-0-15-0-improvements } + +- Allow to set `SparkS3(path_style_access=True)` instead of + `SparkS3(extra={"path.style.access": True)` ([#392](https://github.com/MTSWebServices/onetl/pull/392)). + + This change improves IDE autocompletion and made it more explicit + that the parameter is important for the connector's functionality. + +- Add a runtime warning about missing `S3(region=...)` and + `SparkS3(region=...)` params ([#418](https://github.com/MTSWebServices/onetl/pull/418)). + + It is recommended to explicitly pass this parameter to avoid + potential access errors. 
+ +Thanks to [@yabel](https://github.com/yabel) + +## Dependencies { #DBR-onetl-changelog-0-15-0-dependencies } + +- Update JDBC connectors: + + - MySQL `9.4.0` → `9.5.0` + - MSSQL `13.2.0` → `13.2.1` + - Oracle `23.9.0.25.07` → `23.26.0.0.0` + - Postgres `42.7.7` → `42.7.8` + +- Added support for `Clickhouse.get_packages(package_version="0.9.3")` + ([#407](https://github.com/MTSWebServices/onetl/pull/407)). + + Versions in range 0.8.0-0.9.2 are not supported due to [issue #2625](https://github.com/ClickHouse/clickhouse-java/issues/2625). + + Versions 0.9.3+ is still not default one because of various + compatibility and performance issues. Use it at your own risk. + +## Documentation { #DBR-onetl-changelog-0-15-0-documentation } + +- Document using Greenplum connector with Spark on `master=k8s` diff --git a/mddocs/docs/changelog/0.7.0.md b/mddocs/docs/changelog/0.7.0.md new file mode 100644 index 000000000..75ef943c9 --- /dev/null +++ b/mddocs/docs/changelog/0.7.0.md @@ -0,0 +1,239 @@ +# 0.7.0 (2023-05-15) { #DBR-onetl-changelog-0-7-0 } + +## 🎉 onETL is now open source 🎉 { #DBR-onetl-changelog-0-7-0-onetl-is-now-open-source } + +That was long road, but we finally did it! + +## Breaking Changes { #DBR-onetl-changelog-0-7-0-breaking-changes } + +- Changed installation method. + + **TL;DR What should I change to restore previous behavior** + + Simple way: + + | onETL < 0.7.0 | onETL >= 0.7.0 | + | ----------------- | --------------------------------- | + | pip install onetl | pip install onetl[files,kerberos] | + + Right way - enumerate connectors should be installed: + + ```bash + pip install onetl[hdfs,ftp,kerberos] # except DB connections + ``` + + **Details** + + In onetl<0.7 the package installation looks like: + + ```bash title="before" + + pip install onetl + ``` + + But this includes all dependencies for all connectors, even if user does not use them. 
+ This caused some issues, for example user had to install Kerberos libraries to be able to install onETL, even if user uses only S3 (without Kerberos support). + + Since 0.7.0 installation process was changed: + + ``` bash title="after" + + pip install onetl # minimal installation, only onETL core + # there is no extras for DB connections because they are using Java packages which are installed in runtime + + pip install onetl[ftp,ftps,hdfs,sftp,s3,webdav] # install dependencies for specified file connections + pip install onetl[files] # install dependencies for all file connections + + pip install onetl[kerberos] # Kerberos auth support + pip install onetl[spark] # install PySpark to use DB connections + + pip install onetl[spark,kerberos,files] # all file connections + Kerberos + PySpark + pip install onetl[all] # alias for previous case + ``` + + There are corresponding documentation items for each extras. + + Also onETL checks that some requirements are missing, and raises exception with recommendation how to install them: + + ``` text title="exception while import Clickhouse connection" + + Cannot import module "pyspark". + + Since onETL v0.7.0 you should install package as follows: + pip install onetl[spark] + + or inject PySpark to sys.path in some other way BEFORE creating MongoDB instance. + ``` + + ``` text title="exception while import FTP connection" + + Cannot import module "ftputil". + + Since onETL v0.7.0 you should install package as follows: + pip install onetl[ftp] + + or + pip install onetl[files] + ``` + +- Added new `cluster` argument to `Hive` and `HDFS` connections. + + `Hive` qualified name (used in HWM) contains cluster name. But in onETL<0.7.0 cluster name had hard coded value `rnd-dwh` which was not OK for some users. + + `HDFS` connection qualified name contains host (active namenode of Hadoop cluster), but its value can change over time, leading to creating of new HWM. 
+ + Since onETL 0.7.0 both `Hive` and `HDFS` connections have `cluster` attribute which can be set to a specific cluster name. + For `Hive` it is mandatory, for `HDFS` it can be omitted (using host as a fallback). + + But passing cluster name every time could lead to errors. + + Now `Hive` and `HDFS` have nested class named `slots` with methods: + + - `normalize_cluster_name` + - `get_known_clusters` + - `get_current_cluster` + - `normalize_namenode_host` (only `HDFS`) + - `get_cluster_namenodes` (only `HDFS`) + - `get_webhdfs_port` (only `HDFS`) + - `is_namenode_active` (only `HDFS`) + + And new method `HDFS.get_current` / `Hive.get_current`. + + Developers can implement hooks validating user input or substituting values for automatic cluster detection. + This should improve user experience while using these connectors. + + See slots documentation. + +- Update JDBC connection drivers. + + - Greenplum `2.1.3` → `2.1.4`. + - MSSQL `10.2.1.jre8` → `12.2.0.jre8`. Minimal supported version of MSSQL is now 2014 instead 2021. + - MySQL `8.0.30` → `8.0.33`: + - Package was renamed `mysql:mysql-connector-java` → `com.mysql:mysql-connector-j`. + - Driver class was renamed `com.mysql.jdbc.Driver` → `com.mysql.cj.jdbc.Driver`. + - Oracle `21.6.0.0.1` → `23.2.0.0`. + - Postgres `42.4.0` → `42.6.0`. + - Teradata `17.20.00.08` → `17.20.00.15`: + - Package was renamed `com.teradata.jdbc:terajdbc4` → `com.teradata.jdbc:terajdbc`. + - Teradata driver is now published to Maven. + + See [#31](https://github.com/MTSWebServices/onetl/pull/31). + +## Features { #DBR-onetl-changelog-0-7-0-features } + +- Added MongoDB connection. + + Using official [MongoDB connector for Spark v10](https://www.mongodb.com/docs/spark-connector/current/). Only Spark 3.2+ is supported. + + There are some differences between MongoDB and other database sources: + + - Instead of `mongodb.sql` method there is `mongodb.pipeline`. + - No methods `mongodb.fetch` and `mongodb.execute`. 
+ - `DBReader.hint` and `DBReader.where` have different types than in SQL databases: + + ```python + where = { + "col1": { + "$eq": 10, + }, + } + + hint = { + "col1": 1, + } + ``` + + - Because MongoDB does not have schemas of collections, but Spark cannot create dataframe with dynamic schema, new option `DBReader.df_schema` was introduced. + It is mandatory for MongoDB, but optional for other sources. + - Currently DBReader cannot be used with MongoDB and hwm expression, e.g. `hwm_column=("mycolumn", {"$cast": {"col1": "date"}})` + + Because there are no tables in MongoDB, some options were renamed in core classes: + + - `DBReader(table=...)` → `DBReader(source=...)` + - `DBWriter(table=...)` → `DBWriter(target=...)` + + Old names can be used too, they are not deprecated ([#30](https://github.com/MTSWebServices/onetl/pull/30)). + +- Added option for disabling some plugins during import. + + Previously if some plugin were failing during the import, the only way to import onETL would be to disable all plugins + using environment variable. + + Now there are several variables with different behavior: + + - `ONETL_PLUGINS_ENABLED=false` - disable all plugins autoimport. Previously it was named `ONETL_ENABLE_PLUGINS`. + - `ONETL_PLUGINS_BLACKLIST=plugin-name,another-plugin` - set list of plugins which should NOT be imported automatically. + - `ONETL_PLUGINS_WHITELIST=plugin-name,another-plugin` - set list of plugins which should ONLY be imported automatically. + + Also we improved exception message with recommendation how to disable a failing plugin: + + ``` text title="exception message example" + + Error while importing plugin 'mtspark' from package 'mtspark' v4.0.0. + + Statement: + import mtspark.onetl + + Check if plugin is compatible with current onETL version 0.7.0. 
+ + You can disable loading this plugin by setting environment variable: + ONETL_PLUGINS_BLACKLIST='mtspark,failing-plugin' + + You can also define a whitelist of packages which can be loaded by onETL: + ONETL_PLUGINS_WHITELIST='not-failing-plugin1,not-failing-plugin2' + + Please take into account that plugin name may differ from package or module name. + See package metadata for more details + ``` + +## Improvements { #DBR-onetl-changelog-0-7-0-improvements } + +- Added compatibility with Python 3.11 and PySpark 3.4.0. + + File connections were OK, but `jdbc.fetch` and `jdbc.execute` were failing. Fixed in [#28](https://github.com/MTSWebServices/onetl/pull/28). + +- Added check for missing Java packages. + + Previously if DB connection tried to use some Java class which were not loaded into Spark version, it raised an exception + with long Java stacktrace. Most users failed to interpret this trace. + + Now onETL shows the following error message: + + ``` text title="exception message example" + + |Spark| Cannot import Java class 'com.mongodb.spark.sql.connector.MongoTableProvider'. + + It looks like you've created Spark session without this option: + SparkSession.builder.config("spark.jars.packages", MongoDB.package_spark_3_2) + + Please call `spark.stop()`, restart the interpreter, + and then create new SparkSession with proper options. + ``` + +- Documentation improvements. + + - Changed documentation site theme - using [furo](https://github.com/pradyunsg/furo) + instead of default [ReadTheDocs](https://github.com/readthedocs/sphinx_rtd_theme). + + New theme supports wide screens and dark mode. + See [#10](https://github.com/MTSWebServices/onetl/pull/10). + + - Now each connection class have compatibility table for Spark + Java + Python. + + - Added global compatibility table for Spark + Java + Python + Scala. + +## Bug Fixes { #DBR-onetl-changelog-0-7-0-bug-fixes } + +- Fixed several SFTP issues. 
+ + - If SSH config file `~/.ssh/config` contains some options not recognized by Paramiko (unknown syntax, unknown option name), + previous versions were raising exception until fixing or removing this file. Since 0.7.0 exception is replaced with warning. + + - If user passed `host_key_check=False` but server changed SSH keys, previous versions raised exception until new key is accepted. + Since 0.7.0 exception is replaced with warning if option value is `False`. + + Fixed in [#19](https://github.com/MTSWebServices/onetl/pull/19). + +- Fixed several S3 issues. + + There was a bug in S3 connection which prevented handling files in the root of a bucket - they were invisible for the connector. Fixed in [#29](https://github.com/MTSWebServices/onetl/pull/29). diff --git a/mddocs/docs/changelog/0.7.1.md b/mddocs/docs/changelog/0.7.1.md new file mode 100644 index 000000000..c69428444 --- /dev/null +++ b/mddocs/docs/changelog/0.7.1.md @@ -0,0 +1,40 @@ +# 0.7.1 (2023-05-23) { #DBR-onetl-changelog-0-7-1 } + +## Bug Fixes { #DBR-onetl-changelog-0-7-1-bug-fixes } + +- Fixed `setup_logging` function. + + In onETL==0.7.0 calling `onetl.log.setup_logging()` broke the logging: + + ``` text title="exception message" + + Traceback (most recent call last): + File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 434, in format + return self._format(record) + File "/opt/anaconda/envs/py39/lib/python3.9/logging/__init__.py", line 430, in _format + return self._fmt % record.dict + KeyError: 'levelname:8s' + ``` + +- Fixed installation examples. + + In onETL==0.7.0 there are examples of installing onETL with extras: + + ``` bash title="before" + + pip install onetl[files, kerberos, spark] + ``` + + But pip fails to install such package: + + ``` text title="exception" + + ERROR: Invalid requirement: 'onet[files,' + ``` + + This is because of spaces in extras clause. 
Fixed: + + ``` bash title="after" + + pip install onetl[files,kerberos,spark] + ``` diff --git a/mddocs/docs/changelog/0.7.2.md b/mddocs/docs/changelog/0.7.2.md new file mode 100644 index 000000000..0796b5cce --- /dev/null +++ b/mddocs/docs/changelog/0.7.2.md @@ -0,0 +1,37 @@ +# 0.7.2 (2023-05-24) { #DBR-onetl-changelog-0-7-2 } + +## Dependencies { #DBR-onetl-changelog-0-7-2-dependencies } + +- Limited `typing-extensions` version. + + `typing-extensions==4.6.0` release contains some breaking changes causing errors like: + + ``` text title="typing-extensions 4.6.0" + + Traceback (most recent call last): + File "/Users/project/lib/python3.9/typing.py", line 852, in __subclasscheck__ + return issubclass(cls, self.__origin__) + TypeError: issubclass() arg 1 must be a class + ``` + + `typing-extensions==4.6.1` was causing another error: + + ``` text title="typing-extensions 4.6.1" + + Traceback (most recent call last): + File "/home/maxim/Repo/typing_extensions/1.py", line 33, in + isinstance(file, ContainsException) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 599, in __instancecheck__ + if super().__instancecheck__(instance): + File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 139, in __instancecheck__ + return _abc_instancecheck(cls, instance) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 583, in __subclasscheck__ + return super().__subclasscheck__(other) + File "/home/maxim/.pyenv/versions/3.7.8/lib/python3.7/abc.py", line 143, in __subclasscheck__ + return _abc_subclasscheck(cls, subclass) + File "/home/maxim/Repo/typing_extensions/src/typing_extensions.py", line 661, in _proto_hook + and other._is_protocol + AttributeError: type object 'PathWithFailure' has no attribute '_is_protocol' + ``` + + We updated requirements with `typing-extensions<4.6` until fixing compatibility issues. 
diff --git a/mddocs/docs/changelog/0.8.0.md b/mddocs/docs/changelog/0.8.0.md new file mode 100644 index 000000000..00f011aed --- /dev/null +++ b/mddocs/docs/changelog/0.8.0.md @@ -0,0 +1,162 @@ +# 0.8.0 (2023-05-31) { #DBR-onetl-changelog-0-8-0 } + +## Breaking Changes { #DBR-onetl-changelog-0-8-0-breaking-changes } + +- Rename methods of `FileConnection` classes: + + - `get_directory` → `resolve_dir` + - `get_file` → `resolve_file` + - `listdir` → `list_dir` + - `mkdir` → `create_dir` + - `rmdir` → `remove_dir` + + New naming should be more consistent. + + They were undocumented in previous versions, but someone could use these methods, so this is a breaking change. ([#36](https://github.com/MTSWebServices/onetl/pull/36)) + +- Deprecate `onetl.core.FileFilter` class, replace it with new classes: + + - `onetl.file.filter.Glob` + - `onetl.file.filter.Regexp` + - `onetl.file.filter.ExcludeDir` + + Old class will be removed in v1.0.0. ([#43](https://github.com/MTSWebServices/onetl/pull/43)) + +- Deprecate `onetl.core.FileLimit` class, replace it with new class `onetl.file.limit.MaxFilesCount`. + + Old class will be removed in v1.0.0. ([#44](https://github.com/MTSWebServices/onetl/pull/44)) + +- Change behavior of `BaseFileLimit.reset` method. + + This method should now return `self` instead of `None`. + Return value could be the same limit object or a copy, this is an implementation detail. ([#44](https://github.com/MTSWebServices/onetl/pull/44)) + +- Replaced `FileDownloader.filter` and `.limit` with new options `.filters` and `.limits`: + + ``` python title="onETL < 0.8.0" + FileDownloader( + ..., + filter=FileFilter(glob="*.txt", exclude_dir="/path"), + limit=FileLimit(count_limit=10), + ) + ``` + + ``` python title="onETL >= 0.8.0" + FileDownloader( + ..., + filters=[Glob("*.txt"), ExcludeDir("/path")], + limits=[MaxFilesCount(10)], + ) + ``` + + This allows to developers to implement their own filter and limit classes, and combine them with existing ones. 
+ + Old behavior still supported, but it will be removed in v1.0.0. ([#45](https://github.com/MTSWebServices/onetl/pull/45)) + +- Removed default value for `FileDownloader.limits`, user should pass limits list explicitly. ([#45](https://github.com/MTSWebServices/onetl/pull/45)) + +- Move classes from module `onetl.core`: + + ``` python title="before" + from onetl.core import DBReader + from onetl.core import DBWriter + from onetl.core import FileDownloader + from onetl.core import FileUploader + ``` + + with new modules `onetl.db` and `onetl.file`: + + ``` python title="after" + from onetl.db import DBReader + from onetl.db import DBWriter + + from onetl.file import FileDownloader + from onetl.file import FileUploader + ``` + + Imports from old module `onetl.core` still can be used, but marked as deprecated. Module will be removed in v1.0.0. ([#46](https://github.com/MTSWebServices/onetl/pull/46)) + +## Features { #DBR-onetl-changelog-0-8-0-features } + +- Add `rename_dir` method. + + Method was added to following connections: + + - `FTP` + - `FTPS` + - `HDFS` + - `SFTP` + - `WebDAV` + + It allows to rename/move directory to new path with all its content. + + `S3` does not have directories, so there is no such method in that class. ([#40](https://github.com/MTSWebServices/onetl/pull/40)) + +- Add `onetl.file.FileMover` class. + + It allows to move files between directories of remote file system. + Signature is almost the same as in `FileDownloader`, but without HWM support. 
([#42](https://github.com/MTSWebServices/onetl/pull/42)) + +## Improvements { #DBR-onetl-changelog-0-8-0-improvements } + +- Document all public methods in `FileConnection` classes: + + - `download_file` + - `resolve_dir` + - `resolve_file` + - `get_stat` + - `is_dir` + - `is_file` + - `list_dir` + - `create_dir` + - `path_exists` + - `remove_file` + - `rename_file` + - `remove_dir` + - `upload_file` + - `walk` ([#39](https://github.com/MTSWebServices/onetl/pull/39)) + +- Update documentation of `check` method of all connections - add usage example and document result type. ([#39](https://github.com/MTSWebServices/onetl/pull/39)) + +- Add new exception type `FileSizeMismatchError`. + + Methods `connection.download_file` and `connection.upload_file` now raise new exception type instead of `RuntimeError`, + if target file after download/upload has different size than source. ([#39](https://github.com/MTSWebServices/onetl/pull/39)) + +- Add new exception type `DirectoryExistsError` - it is raised if target directory already exists. ([#40](https://github.com/MTSWebServices/onetl/pull/40)) + +- Improved `FileDownloader` / `FileUploader` exception logging. + + If `DEBUG` logging is enabled, print exception with stacktrace instead of + printing only exception message. ([#42](https://github.com/MTSWebServices/onetl/pull/42)) + +- Updated documentation of `FileUploader`. + + - Class does not support read strategies, added note to documentation. + - Added examples of using `run` method with explicit files list passing, both absolute and relative paths. + - Fix outdated imports and class names in examples. ([#42](https://github.com/MTSWebServices/onetl/pull/42)) + +- Updated documentation of `DownloadResult` class - fix outdated imports and class names. ([#42](https://github.com/MTSWebServices/onetl/pull/42)) + +- Improved file filters documentation section. + + Document interface class `onetl.base.BaseFileFilter` and function `match_all_filters`. 
([#43](https://github.com/MTSWebServices/onetl/pull/43)) + +- Improved file limits documentation section. + + Document interface class `onetl.base.BaseFileLimit` and functions `limits_stop_at` / `limits_reached` / `reset_limits`. ([#44](https://github.com/MTSWebServices/onetl/pull/44)) + +- Added changelog. + + Changelog is generated from separated news files using [towncrier](https://pypi.org/project/towncrier/). ([#47](https://github.com/MTSWebServices/onetl/pull/47)) + +## Misc { #DBR-onetl-changelog-0-8-0-misc } + +- Improved CI workflow for tests. + + - If developer hasn't changed source code of a specific connector or its dependencies, + run tests only against maximum supported versions of Spark, Python, Java and db/file server. + - If developer made some changes in a specific connector, or in core classes, or in dependencies, + run tests for both minimal and maximum versions. + - Once a week run all tests against minimal and latest versions to detect breaking changes in dependencies + - Minimal tested Spark version is 2.3.1 instead of 2.4.8. ([#32](https://github.com/MTSWebServices/onetl/pull/32)) diff --git a/mddocs/docs/changelog/0.8.1.md b/mddocs/docs/changelog/0.8.1.md new file mode 100644 index 000000000..8d4ef4a9e --- /dev/null +++ b/mddocs/docs/changelog/0.8.1.md @@ -0,0 +1,42 @@ +# 0.8.1 (2023-07-10) { #DBR-onetl-changelog-0-8-1 } + +## Features { #DBR-onetl-changelog-0-8-1-features } + +- Add `@slot` decorator to public methods of: + + - `DBConnection` + - `FileConnection` + - `DBReader` + - `DBWriter` + - `FileDownloader` + - `FileUploader` + - `FileMover` ([#49](https://github.com/MTSWebServices/onetl/pull/49)) + +- Add `workers` field to `FileDownloader` / `FileUploader` / `FileMover` `Options` classes. + + This allows speeding up all file operations using parallel threads.
([#57](https://github.com/MTSWebServices/onetl/pull/57)) + +## Improvements { #DBR-onetl-changelog-0-8-1-improvements } + +- Add documentation for HWM store `.get` and `.save` methods. ([#49](https://github.com/MTSWebServices/onetl/pull/49)) + +- Improve Readme: + + - Move `Quick start` section from documentation + - Add `Non-goals` section + - Fix code blocks indentation ([#50](https://github.com/MTSWebServices/onetl/pull/50)) + +- Improve Contributing guide: + + - Move `Develop` section from Readme + - Move `docs/changelog/README.rst` content + - Add `Limitations` section + - Add instruction of creating a fork and building documentation ([#50](https://github.com/MTSWebServices/onetl/pull/50)) + +- Remove duplicated checks for source file existence in `FileDownloader` / `FileMover`. ([#57](https://github.com/MTSWebServices/onetl/pull/57)) + +- Update default logging format to include thread name. ([#57](https://github.com/MTSWebServices/onetl/pull/57)) + +## Bug Fixes { #DBR-onetl-changelog-0-8-1-bug-fixes } + +- Fix `S3.list_dir('/')` returns empty list on latest Minio version. 
([#58](https://github.com/MTSWebServices/onetl/pull/58)) diff --git a/mddocs/docs/changelog/0.9.0.md b/mddocs/docs/changelog/0.9.0.md new file mode 100644 index 000000000..b87049ec4 --- /dev/null +++ b/mddocs/docs/changelog/0.9.0.md @@ -0,0 +1,122 @@ +# 0.9.0 (2023-08-17) { #DBR-onetl-changelog-0-9-0 } + +## Breaking Changes { #DBR-onetl-changelog-0-9-0-breaking-changes } + +- Rename methods: + + - `DBConnection.read_df` → `DBConnection.read_source_as_df` + - `DBConnection.write_df` → `DBConnection.write_df_to_target` ([#66](https://github.com/MTSWebServices/onetl/pull/66)) + +- Rename classes: + + - `HDFS.slots` → `HDFS.Slots` + - `Hive.slots` → `Hive.Slots` + + Old names are left intact, but will be removed in v1.0.0 ([#103](https://github.com/MTSWebServices/onetl/pull/103)) + +- Rename options to make them self-explanatory: + + - `Hive.WriteOptions(mode="append")` → `Hive.WriteOptions(if_exists="append")` + - `Hive.WriteOptions(mode="overwrite_table")` → `Hive.WriteOptions(if_exists="replace_entire_table")` + - `Hive.WriteOptions(mode="overwrite_partitions")` → `Hive.WriteOptions(if_exists="replace_overlapping_partitions")` + - `JDBC.WriteOptions(mode="append")` → `JDBC.WriteOptions(if_exists="append")` + - `JDBC.WriteOptions(mode="overwrite")` → `JDBC.WriteOptions(if_exists="replace_entire_table")` + - `Greenplum.WriteOptions(mode="append")` → `Greenplum.WriteOptions(if_exists="append")` + - `Greenplum.WriteOptions(mode="overwrite")` → `Greenplum.WriteOptions(if_exists="replace_entire_table")` + - `MongoDB.WriteOptions(mode="append")` → `MongoDB.WriteOptions(if_exists="append")` + - `MongoDB.WriteOptions(mode="overwrite")` → `MongoDB.WriteOptions(if_exists="replace_entire_collection")` + - `FileDownloader.Options(mode="error")` → `FileDownloader.Options(if_exists="error")` + - `FileDownloader.Options(mode="ignore")` → `FileDownloader.Options(if_exists="ignore")` + - `FileDownloader.Options(mode="overwrite")` → 
`FileDownloader.Options(if_exists="replace_file")` + - `FileDownloader.Options(mode="delete_all")` → `FileDownloader.Options(if_exists="replace_entire_directory")` + - `FileUploader.Options(mode="error")` → `FileUploader.Options(if_exists="error")` + - `FileUploader.Options(mode="ignore")` → `FileUploader.Options(if_exists="ignore")` + - `FileUploader.Options(mode="overwrite")` → `FileUploader.Options(if_exists="replace_file")` + - `FileUploader.Options(mode="delete_all")` → `FileUploader.Options(if_exists="replace_entire_directory")` + - `FileMover.Options(mode="error")` → `FileMover.Options(if_exists="error")` + - `FileMover.Options(mode="ignore")` → `FileMover.Options(if_exists="ignore")` + - `FileMover.Options(mode="overwrite")` → `FileMover.Options(if_exists="replace_file")` + - `FileMover.Options(mode="delete_all")` → `FileMover.Options(if_exists="replace_entire_directory")` + + Old names are left intact, but will be removed in v1.0.0 ([#108](https://github.com/MTSWebServices/onetl/pull/108)) + +- Rename `onetl.log.disable_clients_logging()` to `onetl.log.setup_clients_logging()`. 
([#120](https://github.com/MTSWebServices/onetl/pull/120)) + +## Features { #DBR-onetl-changelog-0-9-0-features } + +- Add new methods returning Maven packages for specific connection class: + + - `Clickhouse.get_packages()` + - `MySQL.get_packages()` + - `Postgres.get_packages()` + - `Teradata.get_packages()` + - `MSSQL.get_packages(java_version="8")` + - `Oracle.get_packages(java_version="8")` + - `Greenplum.get_packages(scala_version="2.12")` + - `MongoDB.get_packages(scala_version="2.12")` + - `Kafka.get_packages(spark_version="3.4.1", scala_version="2.12")` + + Deprecate old syntax: + + - `Clickhouse.package` + - `MySQL.package` + - `Postgres.package` + - `Teradata.package` + - `MSSQL.package` + - `Oracle.package` + - `Greenplum.package_spark_2_3` + - `Greenplum.package_spark_2_4` + - `Greenplum.package_spark_3_2` + - `MongoDB.package_spark_3_2` + - `MongoDB.package_spark_3_3` + - `MongoDB.package_spark_3_4` ([#87](https://github.com/MTSWebServices/onetl/pull/87)) + +- Allow to set client modules log level in `onetl.log.setup_clients_logging()`. + + Allow to enable underlying client modules logging in `onetl.log.setup_logging()` by providing additional argument `enable_clients=True`. + This is useful for debug. ([#120](https://github.com/MTSWebServices/onetl/pull/120)) + +- Added support for reading and writing data to Kafka topics. + + For these operations, new classes were added. 
+ + - `Kafka` ([#54](https://github.com/MTSWebServices/onetl/pull/54), [#60](https://github.com/MTSWebServices/onetl/pull/60), [#72](https://github.com/MTSWebServices/onetl/pull/72), [#84](https://github.com/MTSWebServices/onetl/pull/84), [#87](https://github.com/MTSWebServices/onetl/pull/87), [#89](https://github.com/MTSWebServices/onetl/pull/89), [#93](https://github.com/MTSWebServices/onetl/pull/93), [#96](https://github.com/MTSWebServices/onetl/pull/96), [#102](https://github.com/MTSWebServices/onetl/pull/102), [#104](https://github.com/MTSWebServices/onetl/pull/104)) + - `Kafka.PlaintextProtocol` ([#79](https://github.com/MTSWebServices/onetl/pull/79)) + - `Kafka.SSLProtocol` ([#118](https://github.com/MTSWebServices/onetl/pull/118)) + - `Kafka.BasicAuth` ([#63](https://github.com/MTSWebServices/onetl/pull/63), [#77](https://github.com/MTSWebServices/onetl/pull/77)) + - `Kafka.KerberosAuth` ([#63](https://github.com/MTSWebServices/onetl/pull/63), [#77](https://github.com/MTSWebServices/onetl/pull/77), [#110](https://github.com/MTSWebServices/onetl/pull/110)) + - `Kafka.ScramAuth` ([#115](https://github.com/MTSWebServices/onetl/pull/115)) + - `Kafka.Slots` ([#109](https://github.com/MTSWebServices/onetl/pull/109)) + - `Kafka.ReadOptions` ([#68](https://github.com/MTSWebServices/onetl/pull/68)) + - `Kafka.WriteOptions` ([#68](https://github.com/MTSWebServices/onetl/pull/68)) + + Currently, Kafka does not support incremental read strategies, this will be implemented in future releases. + +- Added support for reading files as Spark DataFrame and saving DataFrame as Files. + + For these operations, new classes were added. 
+ + FileDFConnections: + + - `SparkHDFS` ([#98](https://github.com/MTSWebServices/onetl/pull/98)) + - `SparkS3` ([#94](https://github.com/MTSWebServices/onetl/pull/94), [#100](https://github.com/MTSWebServices/onetl/pull/100), [#124](https://github.com/MTSWebServices/onetl/pull/124)) + - `SparkLocalFS` ([#67](https://github.com/MTSWebServices/onetl/pull/67)) + + High-level classes: + + - `FileDFReader` ([#73](https://github.com/MTSWebServices/onetl/pull/73)) + - `FileDFWriter` ([#81](https://github.com/MTSWebServices/onetl/pull/81)) + + File formats: + + - `Avro` ([#69](https://github.com/MTSWebServices/onetl/pull/69)) + - `CSV` ([#92](https://github.com/MTSWebServices/onetl/pull/92)) + - `JSON` ([#83](https://github.com/MTSWebServices/onetl/pull/83)) + - `JSONLine` ([#83](https://github.com/MTSWebServices/onetl/pull/83)) + - `ORC` ([#86](https://github.com/MTSWebServices/onetl/pull/86)) + - `Parquet` ([#88](https://github.com/MTSWebServices/onetl/pull/88)) + +## Improvements { #DBR-onetl-changelog-0-9-0-improvements } + +- Remove redundant checks for driver availability in Greenplum and MongoDB connections. ([#67](https://github.com/MTSWebServices/onetl/pull/67)) +- Check of Java class availability moved from `.check()` method to connection constructor. ([#97](https://github.com/MTSWebServices/onetl/pull/97)) diff --git a/mddocs/docs/changelog/0.9.1.md b/mddocs/docs/changelog/0.9.1.md new file mode 100644 index 000000000..40cb722da --- /dev/null +++ b/mddocs/docs/changelog/0.9.1.md @@ -0,0 +1,7 @@ +# 0.9.1 (2023-08-17) { #DBR-onetl-changelog-0-9-1 } + +## Bug Fixes { #DBR-onetl-changelog-0-9-1-bug-fixes } + +- Fixed bug when the number of threads created by `FileDownloader` / `FileUploader` / `FileMover` was + not `min(workers, len(files))`, but `max(workers, len(files))`, leading to creating too many workers + on a large files list.
diff --git a/mddocs/docs/changelog/0.9.2.md b/mddocs/docs/changelog/0.9.2.md new file mode 100644 index 000000000..349e907e1 --- /dev/null +++ b/mddocs/docs/changelog/0.9.2.md @@ -0,0 +1,23 @@ +# 0.9.2 (2023-09-06) { #DBR-onetl-changelog-0-9-2 } + +## Features { #DBR-onetl-changelog-0-9-2-features } + +- Add `if_exists="ignore"` and `error` to `Greenplum.WriteOptions` ([#142](https://github.com/MTSWebServices/onetl/pull/142)) + +## Improvements { #DBR-onetl-changelog-0-9-2-improvements } + +- Improve validation messages while writing dataframe to Kafka. ([#131](https://github.com/MTSWebServices/onetl/pull/131)) + +- Improve documentation: + + - Add notes about reading and writing to database connections documentation + - Add notes about executing statements in JDBC and Greenplum connections + +## Bug Fixes { #DBR-onetl-changelog-0-9-2-bug-fixes } + +- Fixed validation of `headers` column is written to Kafka with default `Kafka.WriteOptions()` - default value was `False`, + but instead of raising an exception, column value was just ignored. ([#131](https://github.com/MTSWebServices/onetl/pull/131)) +- Fix reading data from Oracle with `partitioningMode="range"` without explicitly set `lowerBound` / `upperBound`. ([#133](https://github.com/MTSWebServices/onetl/pull/133)) +- Update Kafka documentation with SSLProtocol usage. ([#136](https://github.com/MTSWebServices/onetl/pull/136)) +- Raise exception if someone tries to read data from Kafka topic which does not exist. ([#138](https://github.com/MTSWebServices/onetl/pull/138)) +- Allow to pass Kafka topics with name like `some.topic.name` to DBReader. Same for MongoDB collections. 
([#139](https://github.com/MTSWebServices/onetl/pull/139)) diff --git a/mddocs/docs/changelog/0.9.3.md b/mddocs/docs/changelog/0.9.3.md new file mode 100644 index 000000000..c8f24f4ba --- /dev/null +++ b/mddocs/docs/changelog/0.9.3.md @@ -0,0 +1,5 @@ +# 0.9.3 (2023-09-06) { #DBR-onetl-changelog-0-9-3 } + +## Bug Fixes { #DBR-onetl-changelog-0-9-3-bug-fixes } + +- Fix documentation build diff --git a/mddocs/docs/changelog/0.9.4.md b/mddocs/docs/changelog/0.9.4.md new file mode 100644 index 000000000..d74ea2564 --- /dev/null +++ b/mddocs/docs/changelog/0.9.4.md @@ -0,0 +1,24 @@ +# 0.9.4 (2023-09-26) { #DBR-onetl-changelog-0-9-4 } + +## Features { #DBR-onetl-changelog-0-9-4-features } + +- Add `Excel` file format support. ([#148](https://github.com/MTSWebServices/onetl/pull/148)) +- Add `Samba` file connection. + It is now possible to download and upload files to Samba shared folders using `FileDownloader`/`FileUploader`. ([#150](https://github.com/MTSWebServices/onetl/pull/150)) +- Add `if_exists="ignore"` and `error` to `Hive.WriteOptions` ([#143](https://github.com/MTSWebServices/onetl/pull/143)) +- Add `if_exists="ignore"` and `error` to `JDBC.WriteOptions` ([#144](https://github.com/MTSWebServices/onetl/pull/144)) +- Add `if_exists="ignore"` and `error` to `MongoDB.WriteOptions` ([#145](https://github.com/MTSWebServices/onetl/pull/145)) + +## Improvements { #DBR-onetl-changelog-0-9-4-improvements } + +- Add documentation about different ways of passing packages to Spark session. ([#151](https://github.com/MTSWebServices/onetl/pull/151)) +- Drastically improve `Greenplum` documentation: + - Added information about network ports, grants, `pg_hba.conf` and so on. + - Added interaction schemas for reading, writing and executing statements in Greenplum. + - Added recommendations about reading data from views and `JOIN` results from Greenplum. 
([#154](https://github.com/MTSWebServices/onetl/pull/154)) +- Make `.fetch` and `.execute` methods of DB connections thread-safe. Each thread works with its own connection. ([#156](https://github.com/MTSWebServices/onetl/pull/156)) +- Call `.close()` on `FileConnection` when it is removed by the garbage collector. ([#156](https://github.com/MTSWebServices/onetl/pull/156)) + +## Bug Fixes { #DBR-onetl-changelog-0-9-4-bug-fixes } + +- Fix issue when stopping Python interpreter calls `JDBCMixin.close()`, but it fails with exceptions. ([#156](https://github.com/MTSWebServices/onetl/pull/156)) diff --git a/mddocs/docs/changelog/0.9.5.md b/mddocs/docs/changelog/0.9.5.md new file mode 100644 index 000000000..b86961d3c --- /dev/null +++ b/mddocs/docs/changelog/0.9.5.md @@ -0,0 +1,14 @@ +# 0.9.5 (2023-10-10) { #DBR-onetl-changelog-0-9-5 } + +## Features { #DBR-onetl-changelog-0-9-5-features } + +- Add `XML` file format support. ([#163](https://github.com/MTSWebServices/onetl/pull/163)) +- Tested compatibility with Spark 3.5.0. `MongoDB` and `Excel` are not supported yet, but other packages are. ([#159](https://github.com/MTSWebServices/onetl/pull/159)) + +## Improvements { #DBR-onetl-changelog-0-9-5-improvements } + +- Add check to all DB and FileDF connections that Spark session is alive. ([#164](https://github.com/MTSWebServices/onetl/pull/164)) + +## Bug Fixes { #DBR-onetl-changelog-0-9-5-bug-fixes } + +- Fix `Hive.check()` behavior when Hive Metastore is not available. ([#164](https://github.com/MTSWebServices/onetl/pull/164)) diff --git a/mddocs/docs/changelog/DRAFT.md b/mddocs/docs/changelog/DRAFT.md new file mode 100644 index 000000000..912b7d7f7 --- /dev/null +++ b/mddocs/docs/changelog/DRAFT.md @@ -0,0 +1,3 @@ +```{eval-rst} +..
towncrier-draft-entries:: |release| [UNRELEASED] +``` diff --git a/mddocs/docs/changelog/NEXT_RELEASE.md b/mddocs/docs/changelog/NEXT_RELEASE.md new file mode 100644 index 000000000..a9831f9d1 --- /dev/null +++ b/mddocs/docs/changelog/NEXT_RELEASE.md @@ -0,0 +1 @@ +% towncrier release notes start diff --git a/mddocs/docs/changelog/index.md b/mddocs/docs/changelog/index.md new file mode 100644 index 000000000..62a6c6eb2 --- /dev/null +++ b/mddocs/docs/changelog/index.md @@ -0,0 +1,29 @@ +# Changelog { #DBR-onetl-changelog } + +- [0.13.4 (2025-03-20)][DBR-onetl-changelog-0-13-4] +- [0.13.3 (2025-03-11)][DBR-onetl-changelog-0-13-3] +- [0.13.1 (2025-03-06)][DBR-onetl-changelog-0-13-1] +- [0.13.0 (2025-02-24)][DBR-onetl-changelog-0-13-0] +- [0.12.5 (2024-12-03)][DBR-onetl-changelog-0-12-5] +- [0.12.4 (2024-11-27)][DBR-onetl-changelog-0-12-4] +- [0.12.3 (2024-11-22)][DBR-onetl-changelog-0-12-3] +- [0.12.2 (2024-11-12)][DBR-onetl-changelog-0-12-2] +- [0.12.1 (2024-10-28)][DBR-onetl-changelog-0-12-1] +- [0.12.0 (2024-09-03)][DBR-onetl-changelog-0-12-0] +- [0.11.2 (2024-09-02)][DBR-onetl-changelog-0-11-2] +- [0.11.1 (2024-05-29)][DBR-onetl-changelog-0-11-1] +- [0.11.0 (2024-05-27)][DBR-onetl-changelog-0-11-0] +- [0.10.2 (2024-03-21)][DBR-onetl-changelog-0-10-2] +- [0.10.1 (2024-02-05)][DBR-onetl-changelog-0-10-1] +- [0.10.0 (2023-12-18)][DBR-onetl-changelog-0-10-0] +- [0.9.5 (2023-10-10)][DBR-onetl-changelog-0-9-5] +- [0.9.4 (2023-09-26)][DBR-onetl-changelog-0-9-4] +- [0.9.3 (2023-09-06)][DBR-onetl-changelog-0-9-3] +- [0.9.2 (2023-09-06)][DBR-onetl-changelog-0-9-2] +- [0.9.1 (2023-08-17)][DBR-onetl-changelog-0-9-1] +- [0.9.0 (2023-08-17)][DBR-onetl-changelog-0-9-0] +- [0.8.1 (2023-07-10)][DBR-onetl-changelog-0-8-1] +- [0.8.0 (2023-05-31)][DBR-onetl-changelog-0-8-0] +- [0.7.2 (2023-05-24)][DBR-onetl-changelog-0-7-2] +- [0.7.1 (2023-05-23)][DBR-onetl-changelog-0-7-1] +- [0.7.0 (2023-05-15)][DBR-onetl-changelog-0-7-0] diff --git a/mddocs/docs/changelog/next_release/.keep 
b/mddocs/docs/changelog/next_release/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/mddocs/docs/concepts.md b/mddocs/docs/concepts.md new file mode 100644 index 000000000..372ced02b --- /dev/null +++ b/mddocs/docs/concepts.md @@ -0,0 +1,369 @@ +# Concepts { #DBR-onetl-concepts } + +Here you can find detailed documentation about each one of the onETL concepts and how to use them. + +## Connection { #DBR-onetl-concepts-connection } + +### Connection basics { #DBR-onetl-concepts-connection-basics } + +onETL is used to pull and push data into other systems, and so it has a first-class `Connection` concept for storing credentials that are used to communicate with external systems. + +A `Connection` is essentially a set of parameters, such as username, password, hostname. + +To create a connection to a specific storage type, you must use a class that matches the storage type. The class name is the same as the storage type name (`Oracle`, `MSSQL`, `SFTP`, etc): + +```python +from onetl.connection import SFTP + +sftp = SFTP( + host="sftp.test.com", + user="onetl", + password="onetl", +) +``` + +All connection types are inherited from the parent class `BaseConnection`. 
+ +### Connection class diagram { #DBR-onetl-concepts-connection-class-diagram } + +```mermaid +classDiagram + BaseConnection <|-- DBConnection + DBConnection <|-- Hive + DBConnection <|-- Greenplum + DBConnection <|-- MongoDB + DBConnection <|-- Kafka + DBConnection <|-- JDBCConnection + JDBCConnection <|-- Clickhouse + JDBCConnection <|-- MSSQL + JDBCConnection <|-- MySQL + JDBCConnection <|-- Postgres + JDBCConnection <|-- Oracle + JDBCConnection <|-- Teradata + BaseConnection <|-- FileConnection + FileConnection <|-- FTP + FileConnection <|-- FTPS + FileConnection <|-- HDFS + FileConnection <|-- WebDAV + FileConnection <|-- Samba + FileConnection <|-- SFTP + FileConnection <|-- S3 + BaseConnection <|-- FileDFConnection + FileDFConnection <|-- SparkHDFS + FileDFConnection <|-- SparkLocalFS + FileDFConnection <|-- SparkS3 +``` + +### DBConnection { #DBR-onetl-concepts-dbconnection } + +Classes inherited from `DBConnection` could be used for accessing databases. + +A `DBConnection` could be instantiated as follows: + +```python +from onetl.connection import MSSQL + +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, +) +``` + +where **spark** is the current SparkSession. +`onETL` uses `Spark` and specific Java connectors under the hood to work with databases. + +For a description of other parameters, see the documentation for the [available DBConnections][DBR-onetl-connection-db-connection-db-connections]. 
+ +### FileConnection { #DBR-onetl-concepts-fileconnection } + +Classes inherited from `FileConnection` could be used to access files stored on the different file systems/file servers + +A `FileConnection` could be instantiated as follows: + +```python +from onetl.connection import SFTP + +sftp = SFTP( + host="sftp.test.com", + user="onetl", + password="onetl", +) +``` + +For a description of other parameters, see the documentation for the [available FileConnections][DBR-onetl-connection-file-connection-file-connections]. + +### FileDFConnection { #DBR-onetl-concepts-filedfconnection } + +Classes inherited from `FileDFConnection` could be used for accessing files as Spark DataFrames. + +A `FileDFConnection` could be instantiated as follows: + +```python +from onetl.connection import SparkHDFS + +spark_hdfs = SparkHDFS( + host="namenode1.domain.com", + cluster="mycluster", + spark=spark, +) +``` + +where **spark** is the current SparkSession. +`onETL` uses `Spark` and specific Java connectors under the hood to work with DataFrames. + +For a description of other parameters, see the documentation for the [available FileDFConnections][DBR-onetl-connection-file-df-connection-file-dataframe-connections]. + +### Checking connection availability { #DBR-onetl-concepts-checking-connection-availability } + +Once you have created a connection, you can check the database/filesystem availability using the method `check()`: + +```python +mssql.check() +sftp.check() +spark_hdfs.check() +``` + +It will raise an exception if database/filesystem cannot be accessed. 
+ +This method returns connection itself, so you can create connection and immediately check its availability: + +```python +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, +).check() # <-- +``` + +## Extract/Load data { #DBR-onetl-concepts-extractload-data } + +### Basics { #DBR-onetl-concepts-basics } + +As we said above, onETL is used to extract data from and load data into remote systems. + +onETL provides several classes for this: + +* [DBReader][DBR-onetl-db-reader] +* [DBWriter][DBR-onetl-db-writer] +* [FileDFReader][DBR-onetl-file-df-reader-filedf-reader-0] +* [FileDFWriter][DBR-onetl-file-df-writer-filedf-writer-0] +* [FileDownloader][DBR-onetl-file-downloader-0] +* [FileUploader][DBR-onetl-file-uploader-0] +* [FileMover][DBR-onetl-file-mover-0] + +All of these classes have a method `run()` that starts extracting/loading the data: + +```python +from onetl.db import DBReader, DBWriter + +reader = DBReader( + connection=mssql, + source="dbo.demo_table", + columns=["column_1", "column_2"], +) + +# Read data as Spark DataFrame +df = reader.run() + +db_writer = DBWriter( + connection=hive, + target="dl_sb.demo_table", +) + +# Save Spark DataFrame to Hive table +db_writer.run(df) +``` + +### Extract data { #DBR-onetl-concepts-extract-data } + +To extract data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| -- | - | - | - | --- | +| [`DBReader`][DBR-onetl-db-reader] | Reading data from a database | Any [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] | - | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | +| [`FileDFReader`][DBR-onetl-file-df-reader-filedf-reader-0] | Read data from a file or set of files | Any [`FileDFConnection`][DBR-onetl-connection-file-df-connection-file-dataframe-connections] | No input, or List[File path on FileSystem] | [Spark
DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | +| [`FileDownloader`][DBR-onetl-file-downloader-0] | Download files from remote FS to local FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | No input, or List[File path on remote FileSystem] | [`DownloadResult`][DBR-onetl-file-downloader-result] | + +### Load data { #DBR-onetl-concepts-load-data } + +To load data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| - | -- | - | --- | -- | +| [`DBWriter`][DBR-onetl-db-writer] | Writing data from a DataFrame to a database | Any [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None | +| [`FileDFWriter`][DBR-onetl-file-df-writer-filedf-writer-0] | Writing data from a DataFrame to a folder | Any [`FileDFConnection`][DBR-onetl-connection-file-df-connection-file-dataframe-connections] | [Spark DataFrame](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/dataframe.html#dataframe) | None | +| [`FileUploader`][DBR-onetl-file-uploader-0] | Uploading files from a local FS to remote FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | List[File path on local FileSystem] | [`UploadResult`][DBR-onetl-file-uploader-result] | + +### Manipulate data { #DBR-onetl-concepts-manipulate-data } + +To manipulate data you can use classes: + +| | Use case | Connection | `run()` gets | `run()` returns | +| - | - | -- | -- | - | +| [`FileMover`][DBR-onetl-file-mover-0] | Move files between directories in remote FS | Any [`FileConnection`][DBR-onetl-connection-file-connection-file-connections] | List[File path on remote FileSystem] | [`MoveResult`][DBR-onetl-file-mover-result] | + +### Options { #DBR-onetl-concepts-options } + +Extract and load classes have a `options` parameter, 
which has a special meaning: + +* all other parameters - *WHAT* we extract / *WHERE* we load to +* `options` parameter - *HOW* we extract/load data + +```python +db_reader = DBReader( + # WHAT do we read: + connection=mssql, + source="dbo.demo_table", # some table from MSSQL + columns=["column_1", "column_2"], # but only specific set of columns + where="column_2 > 1000", # only rows matching the clause + # HOW do we read: + options=MSSQL.ReadOptions( + numPartitions=10, # read in 10 parallel jobs + partitionColumn="id", # balance data read by assigning each job a part of data using `hash(id) mod N` expression + partitioningMode="hash", + fetchsize=1000, # each job will fetch block of 1000 rows each on every read attempt + ), +) + +db_writer = DBWriter( + # WHERE do we write to - to some table in Hive + connection=hive, + target="dl_sb.demo_table", + # HOW do we write - overwrite all the data in the existing table + options=Hive.WriteOptions(if_exists="replace_entire_table"), +) + +file_downloader = FileDownloader( + # WHAT do we download - files from some dir in SFTP + connection=sftp, + source_path="/source", + filters=[Glob("*.csv")], # only CSV files + limits=[MaxFilesCount(1000)], # 1000 files max + # WHERE do we download to - a specific dir on local FS + local_path="/some", + # HOW do we download: + options=FileDownloader.Options( + delete_source=True, # after downloading each file remove it from source_path + if_exists="replace_file", # replace existing files in the local_path + ), +) + +file_uploader = FileUploader( + # WHAT do we upload - files from some local dir + local_path="/source", + # WHERE do we upload to- specific remote dir in HDFS + connection=hdfs, + target_path="/some", + # HOW do we upload: + options=FileUploader.Options( + delete_local=True, # after uploading each file remove it from local_path + if_exists="replace_file", # replace existing files in the target_path + ), +) + +file_mover = FileMover( + # WHAT do we move - files in some remote 
dir in HDFS + source_path="/source", + connection=hdfs, + # WHERE do we move files to + target_path="/some", # a specific remote dir within the same HDFS connection + # HOW do we load - replace existing files in the target_path + options=FileMover.Options(if_exists="replace_file"), +) + +file_df_reader = FileDFReader( + # WHAT do we read - *.csv files from some dir in S3 + connection=s3, + source_path="/source", + file_format=CSV(), + # HOW do we read - load files from /source/*.csv, not from /source/nested/*.csv + options=FileDFReader.Options(recursive=False), +) + +file_df_writer = FileDFWriter( + # WHERE do we write to - as .csv files in some dir in S3 + connection=s3, + target_path="/target", + file_format=CSV(), + # HOW do we write - replace all existing files in /target, if exists + options=FileDFWriter.Options(if_exists="replace_entire_directory"), +) +``` + +More information about `options` could be found on [`DBConnection`][DBR-onetl-connection-db-connection-db-connections] and [`FileDownloader`][DBR-onetl-file-downloader-0] / [`FileUploader`][DBR-onetl-file-uploader-0] / [`FileMover`][DBR-onetl-file-mover-0] / [`FileDFReader`][DBR-onetl-file-df-reader-filedf-reader-0] / [`FileDFWriter`][DBR-onetl-file-df-writer-filedf-writer-0] documentation. + +### Read Strategies { #DBR-onetl-concepts-read-strategies } + +onETL have several builtin strategies for reading data: + +1. [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] (default strategy) +2. [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] +3. [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] +4. 
[Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] + +For example, an incremental strategy allows you to get only new data from the table: + +```python +from onetl.strategy import IncrementalStrategy + +reader = DBReader( + connection=mssql, + source="dbo.demo_table", + hwm_column="id", # detect new data based on value of "id" column +) + +# first run +with IncrementalStrategy(): + df = reader.run() + +sleep(3600) + +# second run +with IncrementalStrategy(): + # only rows, that appeared in the source since previous run + df = reader.run() +``` + +or get only files which were not downloaded before: + +```python +from onetl.strategy import IncrementalStrategy + +file_downloader = FileDownloader( + connection=sftp, + source_path="/remote", + local_path="/local", + hwm_type="file_list", # save all downloaded files to a list, and exclude files already present in this list +) + +# first run +with IncrementalStrategy(): + files = file_downloader.run() + +sleep(3600) + +# second run +with IncrementalStrategy(): + # only files, that appeared in the source since previous run + files = file_downloader.run() +``` + +Most of strategies are based on [`HWM`][DBR-onetl-hwm-store-hwm], Please check each strategy documentation for more details + +### Why just not use Connection class for extract/load? { #DBR-onetl-concepts-why-just-not-use-connection-class-for-extractload } + +Connections are very simple, they have only a set of some basic operations, +like `mkdir`, `remove_file`, `get_table_schema`, and so on. + +High-level operations, like + +* [`strategy`][DBR-onetl-strategy-read-strategies] support +* Handling metadata push/pull +* Handling different options, like `if_exists="replace_file"` in case of file download/upload + +is moved to a separate class which calls the connection object methods to perform some complex logic. 
diff --git a/mddocs/docs/connection/db_connection/clickhouse/connection.md b/mddocs/docs/connection/db_connection/clickhouse/connection.md new file mode 100644 index 000000000..a328bb10e --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/connection.md @@ -0,0 +1,18 @@ +# Clickhouse connection { #DBR-onetl-connection-db-connection-clickhouse-connection-0 } + + + +::: onetl.connection.db_connection.clickhouse.connection.Clickhouse + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/clickhouse/execute.md b/mddocs/docs/connection/db_connection/clickhouse/execute.md new file mode 100644 index 000000000..f252f257c --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/execute.md @@ -0,0 +1,133 @@ +# Executing statements in Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse } + +!!! warning + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader] or [Clickhouse.sql][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql] instead. + +## How to { #DBR-onetl-connection-db-connection-clickhouse-execute-how-to } + +There are 2 ways to execute some statement in Clickhouse + +### Use `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-use-clickhouse-fetch } + +Use this method to perform some `SELECT` query which returns **small number of rows**, like reading +Clickhouse config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts [Clickhouse.FetchOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions].
+ +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +!!! warning + + Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping]. + +#### Syntax support in `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-syntax-support-in-clickhouse-fetch } + +This method supports **any** query syntax supported by Clickhouse, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` - call function +- ✅︎ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Clickhouse.fetch` { #DBR-onetl-connection-db-connection-clickhouse-execute-examples-for-clickhouse-fetch } + +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) + +df = clickhouse.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Clickhouse.FetchOptions(queryTimeout=10), +) +clickhouse.close() +value = df.collect()[0][0] # get value from first row and first column +``` + +### Use `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-use-clickhouse-execute } + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts [Clickhouse.ExecuteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseExecuteOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-syntax-support-in-clickhouse-execute } + +This method supports **any** query syntax supported by Clickhouse, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... 
SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-examples-for-clickhouse-execute } + + ```python + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + + clickhouse.execute("DROP TABLE schema.table") + clickhouse.execute( + """ + CREATE TABLE schema.table ( + id UInt8, + key String, + value Float32 + ) + ENGINE = MergeTree() + ORDER BY id + """, + options=Clickhouse.ExecuteOptions(queryTimeout=10), + ) + ``` + +## Notes { #DBR-onetl-connection-db-connection-clickhouse-execute-notes } + +These methods **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + +So it should **NOT** be used to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader] or [Clickhouse.sql][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql] instead. 
+ +## Options { #DBR-onetl-connection-db-connection-clickhouse-execute-options } + + + +::: onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.clickhouse.options.ClickhouseExecuteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true \ No newline at end of file diff --git a/mddocs/docs/connection/db_connection/clickhouse/index.md b/mddocs/docs/connection/db_connection/clickhouse/index.md new file mode 100644 index 000000000..25e55cf75 --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/index.md @@ -0,0 +1,17 @@ +# Clickhouse { #DBR-onetl-connection-db-connection-clickhouse } + +## Connection { #DBR-onetl-connection-db-connection-clickhouse-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-clickhouse-prerequisites] +* [Clickhouse connection][DBR-onetl-connection-db-connection-clickhouse-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-clickhouse-operations } + +* [Reading from Clickhouse using `DBReader`][DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader] +* [Reading from Clickhouse using `Clickhouse.sql`][DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql] +* [Writing to Clickhouse using `DBWriter`][DBR-onetl-connection-db-connection-clickhouse-write-writing-to-clickhouse-using-dbwriter] +* [Executing statements in Clickhouse][DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse] + +## Troubleshooting { #DBR-onetl-connection-db-connection-clickhouse-troubleshooting } + +* [Clickhouse <-> Spark type mapping][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md 
b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md new file mode 100644 index 000000000..9613cfb9e --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md @@ -0,0 +1,71 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-clickhouse-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-version-compatibility } + +- Clickhouse server versions: + - Officially declared: 22.8 or higher + - Actually tested: 21.1, 25.1 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://clickhouse.com/docs/en/integrations/java#jdbc-driver). + +## Installing PySpark { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-installing-pyspark } + +To use Clickhouse connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connecting-to-clickhouse } + +### Connection port { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connection-port } + +Connector can only use **HTTP** (usually `8123` port) or **HTTPS** (usually `8443` port) protocol. + +TCP and GRPC protocols are NOT supported. + +### Connecting to cluster { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-connecting-to-cluster } + +It is possible to connect to Clickhouse cluster, and use its load balancing capabilities to read or write data in parallel. +Each Spark executor can connect to random Clickhouse nodes, instead of sending all the data to a node specified in connection params. + +This requires all Clickhouse servers to run on different hosts, and **listen on the same HTTP port**.
+Set `auto_discovery=True` to enable this feature (disabled by default): + +```python +Clickhouse( + host="node1.of.cluster", + port=8123, + extra={ + "auto_discovery": True, + "load_balancing_policy": "roundRobin", + }, +) +``` + +See [official documentation](https://clickhouse.com/docs/en/integrations/java#configuring-node-discovery-load-balancing-and-failover). + +### Required grants { #DBR-onetl-connection-db-connection-clickhouse-prerequisites-required-grants } + +Ask your Clickhouse cluster administrator to set following grants for a user, +used for creating a connection: + +=== "Read + Write" + + ```sql + -- allow creating tables in the target schema + GRANT CREATE TABLE ON myschema.* TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + ``` + +=== "Read only" + + ```sql + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + ``` + +More details can be found in [official documentation](https://clickhouse.com/docs/en/sql-reference/statements/grant). diff --git a/mddocs/docs/connection/db_connection/clickhouse/read.md b/mddocs/docs/connection/db_connection/clickhouse/read.md new file mode 100644 index 000000000..bddb1d30a --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/read.md @@ -0,0 +1,98 @@ +# Reading from Clickhouse using `DBReader` { #DBR-onetl-connection-db-connection-clickhouse-read-reading-from-clickhouse-using-dbreader } + +[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, +but does not support custom queries, like `JOIN`. + +!!! 
warning + + Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping] + +## Supported DBReader features { #DBR-onetl-connection-db-connection-clickhouse-read-supported-dbreader-features } + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: + - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] + - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] + - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] +- ❌ `hint` (is not supported by Clickhouse) +- ❌ `df_schema` +- ✅︎ `options` (see [Clickhouse.ReadOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseReadOptions]) + +## Examples { #DBR-onetl-connection-db-connection-clickhouse-read-examples } + +### Snapshot strategy { #DBR-onetl-connection-db-connection-clickhouse-read-snapshot-strategy } + + ```python + from onetl.connection import Clickhouse + from onetl.db import DBReader + + clickhouse = Clickhouse(...) + + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), + ) + df = reader.run() + + ``` + +### Incremental strategy { #DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy } + + ```python + from onetl.connection import Clickhouse + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + clickhouse = Clickhouse(...)
+ + reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"), + options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-clickhouse-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Clickhouse to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-clickhouse-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options { #DBR-onetl-connection-db-connection-clickhouse-read-options } + + + +::: onetl.connection.db_connection.clickhouse.options.ClickhouseReadOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/clickhouse/sql.md b/mddocs/docs/connection/db_connection/clickhouse/sql.md new file mode 100644 index 000000000..ddcf1ffde --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/sql.md @@ -0,0 +1,82 @@ +# Reading from Clickhouse using `Clickhouse.sql` { #DBR-onetl-connection-db-connection-clickhouse-sql-reading-from-clickhouse-using-clickhouse-sql } + +`Clickhouse.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! 
warning + + Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping] + +!!! warning + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +## Syntax support { #DBR-onetl-connection-db-connection-clickhouse-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-clickhouse-sql-examples } + + ```python + from onetl.connection import Clickhouse + + clickhouse = Clickhouse(...) + df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS String) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), + ) + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-clickhouse-sql-select-only-required-columns } + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Clickhouse to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-clickhouse-sql-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Clickhouse to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+ +## Options { #DBR-onetl-connection-db-connection-clickhouse-sql-options } + + + +::: onetl.connection.db_connection.clickhouse.options.ClickhouseSQLOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/clickhouse/types.md b/mddocs/docs/connection/db_connection/clickhouse/types.md new file mode 100644 index 000000000..38795caf4 --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/types.md @@ -0,0 +1,350 @@ +# Clickhouse <-> Spark type mapping { #DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + +!!! note + + It is recommended to use [spark-dialect-extension](https://github.com/MTSWebServices/spark-dialect-extension) package, + which implements writing Arrays from Spark to Clickhouse, fixes dropping fractions of seconds in `TimestampType`, + and fixes other type conversion issues. + +## Type detection & casting { #DBR-onetl-connection-db-connection-clickhouse-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from Clickhouse { #DBR-onetl-connection-db-connection-clickhouse-types-reading-from-clickhouse } + +This is how Clickhouse connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Clickhouse type. +- Find corresponding `Clickhouse type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. 
+ +### Writing to some existing Clickhouse table { #DBR-onetl-connection-db-connection-clickhouse-types-writing-to-some-existing-clickhouse-table } + +This is how Clickhouse connector performs this: + +- Get names of columns in DataFrame. [^1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). For each found column get Clickhouse type. +- **Find corresponding** `Clickhouse type (read)` → `Spark type` **combination** (see below) for each DataFrame column. If no combination is found, raise exception. [^2] +- Find corresponding `Spark type` → `Clickhouse type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `Clickhouse type (write)` match `Clickhouse type (read)`, no additional casts will be performed, DataFrame column will be written to Clickhouse as is. +- If `Clickhouse type (write)` does not match `Clickhouse type (read)`, DataFrame column will be casted to target column type **on Clickhouse side**. For example, you can write column with text data to `Int32` column, if column contains valid integer values within supported value range and precision. + +[^1]: This allows to write data to tables with `DEFAULT` columns - if DataFrame has no such column, it will be populated by Clickhouse. + +[^2]: Yes, this is weird. + +### Create new table using Spark { #DBR-onetl-connection-db-connection-clickhouse-types-create-new-table-using-spark } + +!!! warning + + ABSOLUTELY NOT RECOMMENDED! + +This is how Clickhouse connector performs this: + +- Find corresponding `Spark type` → `Clickhouse type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Clickhouse, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +But Spark does not have specific dialect for Clickhouse, so Generic JDBC dialect is used. 
+Generic dialect is using SQL ANSI type names while creating tables in target database, not database-specific types. + +If some cases this may lead to using wrong column type. For example, Spark creates column of type `TIMESTAMP` +which corresponds to Clickhouse type `DateTime32` (precision up to seconds) +instead of more precise `DateTime64` (precision up to nanoseconds). +This may lead to incidental precision loss, or sometimes data cannot be written to created table at all. + +So instead of relying on Spark to create tables: + +??? "See example" + + ```python + writer = DBWriter( + connection=clickhouse, + target="default.target_tbl", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), + ) + writer.run(df) + ``` + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +??? "See example" + + ```python + clickhouse.execute( + """ + CREATE TABLE default.target_tbl ( + id UInt8, + value DateTime64(6) -- specific type and precision + ) + ENGINE = MergeTree() + ORDER BY id + """, + ) + + writer = DBWriter( + connection=clickhouse, + target="default.target_tbl", + options=Clickhouse.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +### References { #DBR-onetl-connection-db-connection-clickhouse-types-references } + +Here you can find source code with type conversions: + +- [Clickhouse -> JDBC](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L39-L176) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164) +- [JDBC -> 
Clickhouse](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L185-L311) + +## Supported types { #DBR-onetl-connection-db-connection-clickhouse-types-supported-types } + +See [official documentation](https://clickhouse.com/docs/en/sql-reference/data-types) + +### Generic types { #DBR-onetl-connection-db-connection-clickhouse-types-generic-types } + +- `LowCardinality(T)` is same as `T` +- `Nullable(T)` is same as `T`, but Spark column is inferred as `nullable=True` + +### Numeric types { #DBR-onetl-connection-db-connection-clickhouse-types-numeric-types } + +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +|--------------------------------|-----------------------------------|-------------------------------|------------------------------| +| `Bool` | `BooleanType()` | `Bool` | `UInt64` | +| `Decimal` | `DecimalType(P=10, S=0)` | `Decimal(P=10, S=0)` | `Decimal(P=10, S=0)` | +| `Decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` | `Decimal(P=0..38, S=0)` | +| `Decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` | `Decimal(P=0..38, S=0..38)` | +| `Decimal(P=39..76, S=0..76)` | unsupported [^3] | | | +| `Decimal32(P=0..9)` | `DecimalType(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` | `Decimal(P=9, S=0..9)` | +| `Decimal64(S=0..18)` | `DecimalType(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` | `Decimal(P=18, S=0..18)` | +| `Decimal128(S=0..38)` | `DecimalType(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` | `Decimal(P=38, S=0..38)` | +| `Decimal256(S=0..76)` | unsupported [^3] | | | +| `Float32` | `FloatType()` | `Float32` | `Float32` | +| `Float64` | `DoubleType()` | `Float64` | `Float64` | +| `Int8`
`Int16`
`Int32` |
`IntegerType()` |
`Int32` |
`Int32` | +| `Int64` | `LongType()` | `Int64` | `Int64` | +| `Int128`
`Int256` | unsupported [^3] | | | +| `-` | `ByteType()` | `Int8` | `Int8` | +| `-` | `ShortType()` | `Int32` | `Int32` | +| `UInt8` | `IntegerType()` | `Int32` | `Int32` | +| `UInt16` | `LongType()` | `Int64` | `Int64` | +| `UInt32`
`UInt64` | `DecimalType(20,0)` | `Decimal(20,0)` | `Decimal(20,0)` | +| `UInt128`
`UInt256` | unsupported [^3] | | | + +[^3]: Clickhouse support numeric types up to 256 bit - `Int256`, `UInt256`, `Decimal256(S)`, `Decimal(P=39..76, S=0..76)`. + + But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +### Temporal types { #DBR-onetl-connection-db-connection-clickhouse-types-temporal-types } + +Notes: + +- Datetime with timezone has the same precision as without timezone +- `DateTime` is alias for `DateTime32` +- `TIMESTAMP` is alias for `DateTime32`, but `TIMESTAMP(N)` is alias for `DateTime64(N)` + +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +|-----------------------------------|--------------------------------------|----------------------------------|-------------------------------| +| `Date` | `DateType()` | `Date` | `Date` | +| `Date32` | `DateType()` | `Date` | `Date`, **cannot insert data** [^4] | +| `DateTime32`, seconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds | +| `DateTime64(3)`, milliseconds | `TimestampType()`, microseconds | `DateTime64(6)`, microseconds | `DateTime32`, seconds, **precision loss** [^5] | +| `DateTime64(6)`, microseconds | `TimestampType()`, microseconds | | `DateTime32`, seconds, **precision loss** [^7] | +| `DateTime64(7..9)`, nanoseconds | `TimestampType()`, microseconds, **precision loss** [^6] | | | +| `-` | `TimestampNTZType()`, microseconds | | | +| `DateTime32(TZ)`
`DateTime64(P, TZ)` | unsupported [^7] | | | +| `IntervalNanosecond`
`IntervalMicrosecond`
`IntervalMillisecond`
`IntervalSecond`
`IntervalMinute`
`IntervalHour`
`IntervalDay`
`IntervalMonth`
`IntervalQuarter`
`IntervalWeek`
`IntervalYear` |





`LongType()` |





`Int64` |





`Int64` | + +!!! warning + + Note that types in Clickhouse and Spark have different value ranges: + + | Clickhouse type | Min value | Max value | Spark type | Min value | Max value | + |------------------------|-----------------------------------|-----------------------------------|---------------------|--------------------------------|--------------------------------| + | `Date` | `1970-01-01` | `2149-06-06` |

`DateType()` {: rowspan=3} |

`0001-01-01 00:00:00.000000` {: rowspan=3} |

`9999-12-31 23:59:59.999999` {: rowspan=3} | + | `DateTime64(P=0..8)` | `1900-01-01 00:00:00.00000000` | `2299-12-31 23:59:59.99999999` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | + | `DateTime64(P=9)` | `1900-01-01 00:00:00.000000000` | `2262-04-11 23:47:16.999999999` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | + + So not all of values in Spark DataFrame can be written to Clickhouse. + + References: + + * [Clickhouse Date documentation](https://clickhouse.com/docs/en/sql-reference/data-types/date) + * [Clickhouse Datetime32 documentation](https://clickhouse.com/docs/en/sql-reference/data-types/datetime) + * [Clickhouse Datetime64 documentation](https://clickhouse.com/docs/en/sql-reference/data-types/datetime64) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^4]: `Date32` has different bytes representation than `Date`, and inserting value of type `Date32` to `Date` column + leads to errors on Clickhouse side, e.g. `Date(106617) should be between 0 and 65535 inclusive of both values`. + Although Spark does properly read the `Date32` column as `DateType()`, and there should be no difference at all. + Probably this is some bug in Clickhouse driver. + +[^5]: Generic JDBC dialect generates DDL with Clickhouse type `TIMESTAMP` which is alias for `DateTime32` with precision up to seconds (`23:59:59`). + Inserting data with milliseconds precision (`23:59:59.999`) will lead to **throwing away milliseconds**. + Solution: create table manually, with proper column type. + +[^6]: Clickhouse support datetime up to nanoseconds precision (`23:59:59.999999999`), + but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`). 
+ Nanoseconds will be lost during read or write operations. + Solution: create table manually, with proper column type. + +[^7]: Clickhouse will raise an exception that data in format `2001-01-01 23:59:59.999999` has data `.999999` which does not match format `YYYY-MM-DD hh:mm:ss` + of `DateTime32` column type (see [^5]). + So Spark can create Clickhouse table, but cannot write data to column of this type. + Solution: create table manually, with proper column type. + +### String types { #DBR-onetl-connection-db-connection-clickhouse-types-string-types } + +| Clickhouse type (read) | Spark type | Clickhouse type (write) | Clickhouse type (create) | +|--------------------------------------|------------------|------------------------|--------------------------| +| `FixedString(N)`
`String`
`Enum8`
`Enum16`
`IPv4`
`IPv6`
`UUID` |


`StringType()` |


`String` |


`String` | +| `-` | `BinaryType()` | | | + +## Unsupported types { #DBR-onetl-connection-db-connection-clickhouse-types-unsupported-types } + +Columns of these Clickhouse types cannot be read by Spark: + +- `AggregateFunction(func, T)` +- `Array(T)` +- `JSON` +- `Map(K, V)` +- `MultiPolygon` +- `Nested(field1 T1, ...)` +- `Nothing` +- `Point` +- `Polygon` +- `Ring` +- `SimpleAggregateFunction(func, T)` +- `Tuple(T1, T2, ...)` + +Dataframe with these Spark types cannot be written to Clickhouse: + +- `ArrayType(T)` +- `BinaryType()` +- `CharType(N)` +- `DayTimeIntervalType(P, S)` +- `MapType(K, V)` +- `NullType()` +- `StructType([...])` +- `TimestampNTZType()` +- `VarcharType(N)` + +This is because Spark does not have dedicated Clickhouse dialect, and uses Generic JDBC dialect instead. +This dialect does not have type conversion between some types, like Clickhouse `Array` -> Spark `ArrayType()`, and vice versa. + +There is a way to avoid this - just cast everything to `String`. + +## Explicit type cast { #DBR-onetl-connection-db-connection-clickhouse-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-clickhouse-types-dbreader } + +Use `CAST` or `toJSONString` to get column data as string in JSON format. + +For parsing JSON columns in ClickHouse, use the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. 
+ +```python +from pyspark.sql.types import ArrayType, IntegerType + +from onetl.file.format import JSON +from onetl.connection import ClickHouse +from onetl.db import DBReader + +reader = DBReader( + connection=clickhouse, + source="default.source_tbl", + columns=[ + "id", + "toJSONString(array_column) array_column", + ], +) +df = reader.run() + +# Spark requires all columns to have some specific type, describe it +column_type = ArrayType(IntegerType()) + +json = JSON() +df = df.select( + df.id, + json.parse_column("array_column", column_type), +) +``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-clickhouse-types-dbwriter } + +For writing JSON data to ClickHouse, use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method to convert a DataFrame column to JSON format efficiently and write it as a `String` column in Clickhouse. + +```python +from onetl.file.format import JSON +from onetl.connection import ClickHouse +from onetl.db import DBWriter + +clickhouse = ClickHouse(...) 
+ +clickhouse.execute( + """ + CREATE TABLE default.target_tbl ( + id Int32, + array_column_json String + ) + ENGINE = MergeTree() + ORDER BY id + """, +) + +json = JSON() +df = df.select( + df.id, + json.serialize_column(df.array_column).alias("array_column_json"), +) + +writer = DBWriter( + connection=clickhouse, + target="default.target_tbl", + options=ClickHouse.WriteOptions(if_exists="append"), +) + +writer.run(df) +``` + +Then you can parse this column on Clickhouse side - for example, by creating a view: + +```sql +SELECT + id, + JSONExtract(array_column_json, 'Array(String)') AS array_column +FROM target_tbl +``` + +You can also use [ALIAS](https://clickhouse.com/docs/en/sql-reference/statements/create/table#alias) +or [MATERIALIZED](https://clickhouse.com/docs/en/sql-reference/statements/create/table#materialized) columns +to avoid writing such expression in every `SELECT` clause all the time: + +```sql +CREATE TABLE default.target_tbl ( + id Int32, + array_column_json String, + -- computed column + array_column Array(String) ALIAS JSONExtract(array_column_json, 'Array(String)') + -- or materialized column + -- array_column Array(String) MATERIALIZED JSONExtract(array_column_json, 'Array(String)') +) +ENGINE = MergeTree() +ORDER BY id +``` + +Downsides: + +- Using `SELECT JSONExtract(...)` or `ALIAS` column can be expensive, because value is calculated on every row access. This can be especially harmful if such column is used in `WHERE` clause. +- `ALIAS` and `MATERIALIZED` columns are not included in `SELECT *` clause, they should be added explicitly: `SELECT *, calculated_column FROM table`. + +!!! warning + + [EPHEMERAL](https://clickhouse.com/docs/en/sql-reference/statements/create/table#ephemeral) columns are not supported by Spark + because they cannot be selected to determine target column type. 
diff --git a/mddocs/docs/connection/db_connection/clickhouse/write.md b/mddocs/docs/connection/db_connection/clickhouse/write.md new file mode 100644 index 000000000..1a7fa2f23 --- /dev/null +++ b/mddocs/docs/connection/db_connection/clickhouse/write.md @@ -0,0 +1,63 @@ +# Writing to Clickhouse using `DBWriter` { #DBR-onetl-connection-db-connection-clickhouse-write-writing-to-clickhouse-using-dbwriter } + +For writing data to Clickhouse, use [DBWriter][DBR-onetl-db-writer]. + +!!! warning + + Please take into account [Clickhouse types][DBR-onetl-connection-db-connection-clickhouse-types-clickhouse-spark-type-mapping] + + +!!! warning + + It is always recommended to create table explicitly using [Clickhouse.execute][DBR-onetl-connection-db-connection-clickhouse-execute-executing-statements-in-clickhouse] + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +## Examples { #DBR-onetl-connection-db-connection-clickhouse-write-examples } + +```python +from onetl.connection import Clickhouse +from onetl.db import DBWriter + +clickhouse = Clickhouse(...) + +df = ... 
# data is here + +writer = DBWriter( + connection=clickhouse, + target="schema.table", + options=Clickhouse.WriteOptions( + if_exists="append", + # ENGINE is required by Clickhouse + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), +) + +writer.run(df) +``` + +## Options { #DBR-onetl-connection-db-connection-clickhouse-write-options } + +Method above accepts [Clickhouse.WriteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions] + + + +::: onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/greenplum/connection.md b/mddocs/docs/connection/db_connection/greenplum/connection.md new file mode 100644 index 000000000..823855c9d --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/connection.md @@ -0,0 +1,18 @@ +# Greenplum connection { #DBR-onetl-connection-db-connection-greenplum-connection-0 } + + + +::: onetl.connection.db_connection.greenplum.connection.Greenplum + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md new file mode 100644 index 000000000..fa1e5cb05 --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/execute.md @@ -0,0 +1,191 @@ +# Executing statements in Greenplum { #DBR-onetl-connection-db-connection-greenplum-execute-executing-statements-in-greenplum } + +!!! warning + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader] instead. 
+ +## How to { #DBR-onetl-connection-db-connection-greenplum-execute-how-to } + +There are 2 ways to execute some statement in Greenplum. + +### Use `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-use-greenplum-fetch } + +Use this method to perform some `SELECT` query which returns **small number of rows**, like reading +Greenplum config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts [Greenplum.FetchOptions][onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +!!! warning + + `Greenplum.fetch` is implemented using Postgres JDBC connection, so types are handled a bit differently than in `DBReader`. See [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping]. + +#### Syntax support in `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-syntax-support-in-greenplum-fetch } + +This method supports **any** query syntax supported by Greenplum, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-fetch } + + ```python + from onetl.connection import Greenplum + + greenplum = Greenplum(...) + + df = greenplum.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Greenplum.FetchOptions(queryTimeout=10), + ) + greenplum.close() + value = df.collect()[0][0] # get value from first row and first column + + ``` + +### Use `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-use-greenplum-execute } + +Use this method to execute DDL and DML operations. 
Each method call runs operation in a separated transaction, and then commits it. + +Method accepts [Greenplum.ExecuteOptions][onetl.connection.db_connection.greenplum.options.GreenplumExecuteOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-syntax-support-in-greenplum-execute } + +This method supports **any** query syntax supported by Greenplum, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-execute } + + ```python + from onetl.connection import Greenplum + + greenplum = Greenplum(...) + + greenplum.execute("DROP TABLE schema.table") + greenplum.execute( + """ + CREATE TABLE schema.table ( + id int, + key text, + value real + ) + DISTRIBUTED BY id + """, + options=Greenplum.ExecuteOptions(queryTimeout=10), + ) + ``` + +## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-execute-interaction-schema } + +Unlike reading & writing, executing statements in Greenplum is done **only** through Greenplum master node, +without any interaction between Greenplum segments and Spark executors. More than that, Spark executors are not used in this case. + +The only port used while interacting with Greenplum in this case is `5432` (Greenplum master port). + +??? 
note "Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch()" + + ```plantuml + @startuml + title Greenplum master <-> Spark driver + box Spark + participant "Spark driver" + end box + + box "Greenplum" + participant "Greenplum master" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + == Greenplum.execute(statement) == + "Spark driver" --> "Greenplum master" : EXECUTE statement + "Greenplum master" -> "Spark driver" : RETURN result + + == Greenplum.close() == + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + ``` + + ```mermaid + --- + title: Greenplum master <—> Spark driver + --- + + sequenceDiagram + box Spark + participant A as Spark driver + end + box Greenplum + participant B as Greenplum master + end + + Note over A,B: == Greenplum.check() == + + A->>B: CONNECT + + Note over A,B: == Greenplum.execute(statement) == + + A-->>B: EXECUTE statement + B-->> A: RETURN result + + Note over A,B: == Greenplum.close() == + + A ->> B: CLOSE CONNECTION + ``` + +## Options { #DBR-onetl-connection-db-connection-greenplum-execute-options } + + + +::: onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.greenplum.options.GreenplumExecuteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/greenplum/index.md b/mddocs/docs/connection/db_connection/greenplum/index.md new file mode 100644 index 000000000..741751517 --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/index.md @@ -0,0 +1,16 @@ +# Greenplum { #DBR-onetl-connection-db-connection-greenplum } + +## Connection { #DBR-onetl-connection-db-connection-greenplum-connection-1 } + +* 
[Prerequisites][DBR-onetl-connection-db-connection-greenplum-prerequisites] +* [Greenplum connection][DBR-onetl-connection-db-connection-greenplum-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-greenplum-operations } + +* [Reading from Greenplum using `DBReader`][DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader] +* [Writing to Greenplum using `DBWriter`][DBR-onetl-connection-db-connection-greenplum-write-writing-to-greenplum-using-dbwriter] +* [Executing statements in Greenplum][DBR-onetl-connection-db-connection-greenplum-execute-executing-statements-in-greenplum] + +## Troubleshooting { #DBR-onetl-connection-db-connection-greenplum-troubleshooting } + +* [Greenplum <-> Spark type mapping][DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md new file mode 100644 index 000000000..be9a58af3 --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md @@ -0,0 +1,373 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-greenplum-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-greenplum-prerequisites-version-compatibility } + +- Greenplum server versions: + - Officially declared: 5.x, 6.x, and 7.x (which requires `Greenplum.get_packages(package_version="2.3.0")` or higher) + - Actually tested: 6.23, 7.0 +- Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) +- Java versions: 8 - 11 + +See [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.2/greenplum-connector-spark/release_notes.html). + +## Installing PySpark { #DBR-onetl-connection-db-connection-greenplum-prerequisites-installing-pyspark } + +To use Greenplum connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. 
+ +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Downloading VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-downloading-vmware-package } + +To use Greenplum connector you should download connector `.jar` file from +[VMware website](https://network.tanzu.vmware.com/products/vmware-greenplum#/releases/1413479/file_groups/16966) +and then pass it to Spark session. + +!!! warning + + Please pay attention to [Spark & Scala version compatibility][DBR-onetl-install-spark-compatibility-matrix]. + +!!! warning + + There are issues with using package of version 2.3.0/2.3.1 with Greenplum 6.x - connector can + open transaction with `SELECT * FROM table LIMIT 0` query, but does not close it, which leads to deadlocks + during write. + +There are several ways to do that. See [install Java packages][DBR-onetl-install-spark-injecting-java-packages] for details. + +!!! note + + If you're uploading package to private package repo, use `groupId=io.pivotal` and `artifactId=greenplum-spark_2.12` + (`2.12` is Scala version) to give uploaded package a proper name. + +## Connecting to Greenplum { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connecting-to-greenplum } + +### Interaction schema { #DBR-onetl-connection-db-connection-greenplum-prerequisites-interaction-schema } + +Spark executors open ports to listen for incoming requests. +Greenplum segments are initiating connections to Spark executors using [EXTERNAL TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html) +functionality, and send/read data using [gpfdist protocol](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-external-g-using-the-greenplum-parallel-file-server--gpfdist-.html#about-gpfdist-setup-and-performance-1). + +Data is **not** sent through Greenplum master. 
+Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on). + +More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/overview.html). + +### Set number of connections { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-number-of-connections } + +!!! warning + + This is very important!!! + + If you don't limit number of connections, you can exceed the [max_connections](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#limiting-concurrent-connections-2) + limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, + depending on your Greenplum instance settings and using connection balancers like `pgbouncer`. + + Consuming all available connections means **nobody** (even admin users) can connect to Greenplum. + +Each job on the Spark executor makes its own connection to Greenplum master node, +so you need to limit number of connections to avoid opening too many of them. + +- Reading about `5-10Gb` of data requires about `3-5` parallel connections. +- Reading about `20-30Gb` of data requires about `5-10` parallel connections. +- Reading about `50Gb` of data requires ~ `10-20` parallel connections. +- Reading about `100+Gb` of data requires `20-30` parallel connections. +- Opening more than `30-50` connections is not recommended. + +Number of connections can be limited by 2 ways: + +- By limiting number of Spark executors and number of cores per-executor. Max number of parallel jobs is `executors * cores`. 
+ +=== "Spark with master=local" + + ```python + spark = ( + SparkSession.builder + # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks + .config("spark.master", "local[5]") + .config("spark.executor.cores", 1) + ).getOrCreate() + ``` + +=== "Spark with master=yarn or master=k8s, dynamic allocation" + + ```python + spark = ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 + .config("spark.dynamicAllocation.maxExecutors", 10) + .config("spark.executor.cores", 1) + ).getOrCreate() + ``` + +=== "Spark with master=yarn or master=k8s, static allocation" + + ```python + spark = ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.executor.instances", 10) + .config("spark.executor.cores", 1) + ).getOrCreate() + ``` + +- By limiting connection pool size user by Spark (**only** for Spark with `master=local`): + + ```python + spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate() + + # No matter how many executors are started and how many cores they have, + # number of connections cannot exceed pool size: + Greenplum( + ..., + extra={ + "pool.maxSize": 10, + }, + ) + ``` + +See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool) +documentation. + +- By setting [num_partitions][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions.num_partitions] + and [partition_column][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions.partition_column] (not recommended). 
+ +### Allowing connection to Greenplum master { #DBR-onetl-connection-db-connection-greenplum-prerequisites-allowing-connection-to-greenplum-master } + +Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node, +e.g. by updating `pg_hba.conf` file. + +More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#allowing-connections-to-greenplum-database-0). + +### Set connection port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-connection-port } + +#### Connection port for Spark with `master=k8s` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-port-for-spark-with-masterk8s } + +Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg) + +#### Connection port for Spark with `master=yarn` or `master=local` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-port-for-spark-with-masteryarn-or-masterlocal } + +To read data from Greenplum using Spark, following ports should be opened in firewall between Spark and Greenplum: + +- Spark driver and all Spark executors -> port `5432` on Greenplum master node. + + This port number should be set while connecting to Greenplum: + + ```python + greenplum = Greenplum(host="master.host", port=5432, ...) + ``` + +- Greenplum segments -> some port range (e.g. `41000-42000`) **listened by Spark executors**. + + This range should be set in `extra` option: + + ```python + greenplum = Greenplum( + ..., + extra={ + "server.port": "41000-42000", + }, + ) + ``` + + Number of ports in this range is `number of parallel running Spark sessions` * `number of parallel connections per session`. + + Number of connections per session (see below) is usually less than `30` (see above). 
+ + Number of sessions depends on your environment: + + - For `master=local` only a few to tens of sessions can be started on the same host, depending on available RAM and CPU. + - For `master=yarn` hundreds or thousands of sessions can be started simultaneously, but they are executing on different cluster nodes, so one port can be opened on different nodes at the same time. + +More details can be found in official documentation: + +- [port requirements](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/sys_reqs.html#network-port-requirements) +- [format of server.port value](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.port) +- [port troubleshooting](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#port-errors) + +### Set connection host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-connection-host } + +#### Connection host for Spark with `master=k8s` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masterk8s } + +Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg) + +#### Connection host for Spark with `master=local` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masterlocal } + +By default, Greenplum connector tries to resolve IP of current host, and then pass it as `gpfdist` URL to Greenplum segment. +This may fail in some cases. 
+ +For example, IP can be resolved using `/etc/hosts` content like this: + +```text +127.0.0.1 localhost real-host-name +``` + +```bash +$ hostname -f +localhost + +$ hostname -i +127.0.0.1 +``` + +Reading/writing data to Greenplum will fail with following exception: + +```text +org.postgresql.util.PSQLException: ERROR: connection with gpfdist failed for +"gpfdist://127.0.0.1:49152/local-1709739764667/exec/driver", +effective url: "http://127.0.0.1:49152/local-1709739764667/exec/driver": +error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=123456) +``` + +There are 2 ways to fix that: + +- Explicitly pass your host IP address to connector, like this + + ```python + import os + + # pass here real host IP (accessible from GP segments) + os.environ["HOST_IP"] = "192.168.1.1" + + greenplum = Greenplum( + ..., + extra={ + # connector will read IP from this environment variable + "server.hostEnv": "env.HOST_IP", + }, + spark=spark, + ) + ``` + + More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.hostenv). + +- Update `/etc/hosts` file to include real host IP: + + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 driver-host-name + ``` + + So Greenplum connector will properly resolve host IP. + +#### Connection host for Spark with `master=yarn` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masteryarn } + +The same issue with resolving IP address can occur on Hadoop cluster node, but it's tricky to fix, because each node has a different IP. + +There are 3 ways to fix that: + +- Pass node hostname to `gpfdist` URL. 
So IP will be resolved on segment side: + + ```python + greenplum = Greenplum( + ..., + extra={ + "server.useHostname": "true", + }, + ) + ``` + + But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. + + More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.usehostname). + +- Set specific network interface to get IP address from: + + ```python + greenplum = Greenplum( + ..., + extra={ + "server.nic": "eth0", + }, + ) + ``` + + You can get list of network interfaces using this command. + + !!! note + + This command should be executed on Hadoop cluster node, **not** Spark driver host! + + ```bash + $ ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 + valid_lft 83457sec preferred_lft 83457sec + ``` + + Note that in this case **each** Hadoop cluster node node should have network interface with name `eth0`. + + More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.nic). + +- Update `/etc/hosts` on each Hadoop cluster node to include real node IP: + + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 cluster-node-name + ``` + + So Greenplum connector will properly resolve node IP. 
+ +### Set required grants { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-required-grants } + +Ask your Greenplum cluster administrator to set following grants for a user, +used for creating a connection: + +=== "Read + Write" + + ```sql + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source/target table + GRANT USAGE ON SCHEMA myschema TO username; + GRANT CREATE ON SCHEMA myschema TO username; + ALTER USER username CREATEEXTTABLE(type = 'readable', protocol = 'gpfdist') CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- allow read access to specific table (to get column types) + -- allow write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + ``` + +=== "Read only" + + ```sql + -- get access to get tables metadata & cluster information + GRANT SELECT ON information_schema.tables TO username; + GRANT SELECT ON pg_attribute TO username; + GRANT SELECT ON pg_class TO username; + GRANT SELECT ON pg_namespace TO username; + GRANT SELECT ON pg_settings TO username; + GRANT SELECT ON pg_stats TO username; + GRANT SELECT ON gp_distributed_xacts TO username; + GRANT SELECT ON gp_segment_configuration TO username; + -- Greenplum 5.x only + GRANT SELECT ON gp_distribution_policy TO username; + + -- allow creating external tables in the same schema as source table + GRANT USAGE ON SCHEMA schema_to_read TO username; + GRANT CREATE ON SCHEMA schema_to_read TO username; + -- yes, `writable` for 
reading from GP, because data is written from Greenplum to Spark executor. + ALTER USER username CREATEEXTTABLE(type = 'writable', protocol = 'gpfdist'); + + -- allow read access to specific table + GRANT SELECT ON schema_to_read.table_to_read TO username; + ``` + +More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/install_cfg.html#role-privileges). diff --git a/mddocs/docs/connection/db_connection/greenplum/read.md b/mddocs/docs/connection/db_connection/greenplum/read.md new file mode 100644 index 000000000..0bb904052 --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/read.md @@ -0,0 +1,441 @@ +# Reading from Greenplum using `DBReader` { #DBR-onetl-connection-db-connection-greenplum-read-reading-from-greenplum-using-dbreader } + +Data can be read from Greenplum to Spark using [DBReader][DBR-onetl-db-reader]. It also supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading. + +!!! warning + + Please take into account [Greenplum types][DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping]. + +!!! note + + Unlike JDBC connectors, *Greenplum connector for Spark* does not support + executing **custom** SQL queries using `.sql` method. Connector can be used to only read data from a table or view. 
+ +## Supported DBReader features { #DBR-onetl-connection-db-connection-greenplum-read-supported-dbreader-features } + +- ✅︎ `columns` (see note below) +- ✅︎ `where` (see note below) +- ✅︎ `hwm` (see note below), supported strategies: + - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [Incremental strategy][DBR-onetl-strategy-incremental-strategy] + - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] + - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] +- ❌ `hint` (is not supported by Greenplum) +- ❌ `df_schema` +- ✅︎ `options` (see [Greenplum.ReadOptions][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions]) + +!!! warning + + In case of Greenplum connector, `DBReader` does not generate raw `SELECT` query. Instead it relies on Spark SQL syntax + which in some cases (using column projection and predicate pushdown) can be converted to Greenplum SQL. + + So `columns`, `where` and `hwm.expression` should be specified in [Spark SQL](https://spark.apache.org/docs/latest/sql-ref-syntax.html) syntax, + not Greenplum SQL. + + This is OK: + + ```python + DBReader( + columns=[ + "some_column", + # this cast is executed on Spark side + "CAST(another_column AS STRING)", + ], + # this predicate is parsed by Spark, and can be pushed down to Greenplum + where="some_column LIKE 'val1%'", + ) + ``` + + This will fail: + + ```python + DBReader( + columns=[ + "some_column", + # Spark does not have `text` type + "CAST(another_column AS text)", + ], + # Spark does not support ~ syntax for regexp matching + where="some_column ~ 'val1.*'", + ) + ``` + +## Examples { #DBR-onetl-connection-db-connection-greenplum-read-examples } + +Snapshot strategy: + +```python +from onetl.connection import Greenplum +from onetl.db import DBReader + +greenplum = Greenplum(...)
+ +reader = DBReader( + connection=greenplum, + source="schema.table", + columns=["id", "key", "CAST(value AS string) value", "updated_dt"], + where="key = 'something'", +) +df = reader.run() +``` + +Incremental strategy: + +```python +from onetl.connection import Greenplum +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +greenplum = Greenplum(...) + +reader = DBReader( + connection=greenplum, + source="schema.table", + columns=["id", "key", "CAST(value AS string) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="greenplum_hwm", expression="updated_dt"), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-read-interaction-schema } + +High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection-db-connection-greenplum-prerequisites]. You can find detailed interaction schema below. + +??? note "Spark <-> Greenplum interaction during DBReader.run()" + + ```plantuml + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + + "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE EXISTS + "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table + "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) + + == DBReader.run() == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) 
PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark driver.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) 
USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN + + "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + + "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2 + "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN + + == Spark.stop() == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + ``` + + ```mermaid + --- + title: Greenplum master <-> Spark driver + --- + + sequenceDiagram + box "Spark" + participant A as "Spark driver" + participant B as "Spark executor1" + participant C as "Spark executor2" + participant D as "Spark executorN" + end + + box "Greenplum" + participant E as "Greenplum master" + participant F as "Greenplum segment1" + participant G as "Greenplum segment2" + participant H as "Greenplum segmentN" + end + + note over A,H: == Greenplum.check() == + + activate A + activate E + A ->> E: CONNECT + + A -->> E : CHECK IF TABLE EXISTS gp_table + E -->> A : TABLE EXISTS + A ->> E : SHOW SCHEMA FOR gp_table + E -->> A : (id bigint, col1 int, col2 
text, ...) + + note over A,H: == DBReader.run() == + + A ->> B: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + A ->> C: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + A ->> D: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of A : This is done in parallel,
executors are independent
|
|
|
V + B ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...)
USING address=executor1_host:executor1_port
INSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 + note right of E : Each white vertical line here is an open connection to master.
Usually, **N+1** connections are created from Spark to Greenplum master + activate E + E -->> F: SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + note right of F : No direct requests between Greenplum segments & Spark driver.
Data transfer is always initiated by Greenplum segments. + + + C ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port
INSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + activate E + E -->> G: SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + + D ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...)
USING address=executorN_host:executorN_port
INSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N + activate E + E -->> H: SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN + + F -xB: INITIALIZE CONNECTION TO Spark executor1
PUSH DATA TO Spark executor1 + note left of B : Circle is an open GPFDIST port,
listened by executor + + G -xC: INITIALIZE CONNECTION TO Spark executor2
PUSH DATA TO Spark executor2 + H -xD: INITIALIZE CONNECTION TO Spark executorN
PUSH DATA TO Spark executorN + + note over A,H: == Spark.stop() == + + B -->> E : DROP TABLE spark_executor1 + deactivate E + C -->> E : DROP TABLE spark_executor2 + deactivate E + D -->> E : DROP TABLE spark_executorN + deactivate E + + B -->> A: DONE + C -->> A: DONE + D -->> A: DONE + + A -->> E : CLOSE CONNECTION + deactivate E + deactivate A + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-greenplum-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-greenplum-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Greenplum to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-greenplum-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. This both reduces the amount of data send from Greenplum to Spark, and may also improve performance of the query. Especially if there are indexes or partitions for columns used in `where` clause. + +### Read data in parallel { #DBR-onetl-connection-db-connection-greenplum-read-data-in-parallel } + +`DBReader` in case of Greenplum connector requires view or table to have a column which is used by Spark for parallel reads. + +Choosing proper column allows each Spark executor to read only part of data stored in the specified segment, avoiding moving large amounts of data between segments, which improves reading performance. + +#### Using `gp_segment_id` { #DBR-onetl-connection-db-connection-greenplum-read-using-gp-segment-id } + +By default, `DBReader` will use [gp_segment_id](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#reading-from-a-view) column for parallel data reading. 
Each DataFrame partition will contain data of a specific Greenplum segment. + +This allows each Spark executor to read only data from a specific Greenplum segment, avoiding moving large amounts of data between segments. + +If view is used, it is recommended to include `gp_segment_id` column in this view: + +??? note "Reading from view with gp_segment_id column" + + ```python + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE VIEW schema.view_with_gp_segment_id AS + SELECT + id, + some_column, + another_column, + gp_segment_id -- IMPORTANT + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_gp_segment_id", + ) + df = reader.run() + ``` + +#### Using custom `partition_column` { #DBR-onetl-connection-db-connection-greenplum-read-using-custom-partition-column } + +Sometimes a table or view lacks a `gp_segment_id` column, but there is some column +with value range correlated with Greenplum segment distribution. + +In this case, custom column can be used instead: + +??? note "Reading from view with custom partition_column" + + ```python + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...)
+ + greenplum.execute( + """ + CREATE VIEW schema.view_with_partition_column AS + SELECT + id, + some_column, + part_column -- correlated to greenplum segment ID + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_partition_column", + options=Greenplum.ReadOptions( + # parallelize data using specified column + partitionColumn="part_column", + # create 10 Spark tasks, each will read only part of table data + partitions=10, + ), + ) + df = reader.run() + ``` + +#### Reading `DISTRIBUTED REPLICATED` tables { #DBR-onetl-connection-db-connection-greenplum-read-reading-distributed-replicated-tables } + +Replicated tables do not have `gp_segment_id` column at all, so you need to set `partition_column` to some column name of type integer/bigint/smallint. + +### Parallel `JOIN` execution { #DBR-onetl-connection-db-connection-greenplum-read-parallel-join-execution } + +In case of using views which require some data motion between Greenplum segments, like `JOIN` queries, another approach should be used. + +Each Spark executor N will run the same query, so each of N query will start its own JOIN process, leading to really heavy load on Greenplum segments. + +**This should be avoided**. + +Instead is recommended to run `JOIN` query on Greenplum side, save the result to an intermediate table, and then read this table using `DBReader`: + +??? note "Reading from view using intermediate table" + + ```python + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE UNLOGGED TABLE schema.intermediate_table AS + SELECT + id, + tbl1.col1, + tbl1.data, + tbl2.another_data + FROM + schema.table1 as tbl1 + JOIN + schema.table2 as tbl2 + ON + tbl1.col1 = tbl2.col2 + WHERE ... 
+ """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.intermediate_table", + ) + df = reader.run() + + # write dataframe somethere + + greenplum.execute( + """ + DROP TABLE schema.intermediate_table + """, + ) + ``` + +!!! warning + + **NEVER** do that: + + ```python + df1 = DBReader(connection=greenplum, target="public.table1", ...).run() + df2 = DBReader(connection=greenplum, target="public.table2", ...).run() + + joined_df = df1.join(df2, on="col") + ``` + + This will lead to sending all the data from both `table1` and `table2` to Spark executor memory, and then `JOIN` + will be performed on Spark side, not inside Greenplum. This is **VERY** inefficient. + +#### `TEMPORARY` tables notice { #DBR-onetl-connection-db-connection-greenplum-read-temporary-tables-notice } + +Someone could think that writing data from view or result of `JOIN` to `TEMPORARY` table, and then passing it to `DBReader`, is an efficient way to read data from Greenplum. This is because temp tables are not generating WAL files, and are automatically deleted after finishing the transaction. + +That will **NOT** work. Each Spark executor establishes its own connection to Greenplum. And each connection starts its own transaction which means that every executor will read empty temporary table. + +You should use [UNLOGGED](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) tables to write data to intermediate table without generating WAL logs. 
+ +## Options { #DBR-onetl-connection-db-connection-greenplum-read-options } + + + +::: onetl.connection.db_connection.greenplum.options.GreenplumReadOptions + options: + show_root_heading: true + heading_level: 3 diff --git a/mddocs/docs/connection/db_connection/greenplum/types.md b/mddocs/docs/connection/db_connection/greenplum/types.md new file mode 100644 index 000000000..7736f32f9 --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/types.md @@ -0,0 +1,303 @@ +# Greenplum <-> Spark type mapping { #DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.2.4, and may differ on other Spark versions. + +## Type detection & casting { #DBR-onetl-connection-db-connection-greenplum-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from Greenplum { #DBR-onetl-connection-db-connection-greenplum-types-reading-from-greenplum } + +This is how Greenplum connector performs this: + +- Execute query `SELECT * FROM table LIMIT 0` [^1]. +- For each column in query result get column name and Greenplum type. +- Find corresponding `Greenplum type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Use Spark column projection and predicate pushdown features to build a final query. +- Create DataFrame from generated query with inferred schema. + +[^1]: Yes, **all columns of a table**, not just selected ones. + This means that if source table **contains** columns with unsupported type, the entire table cannot be read. + +### Writing to some existing Greenplum table { #DBR-onetl-connection-db-connection-greenplum-types-writing-to-some-existing-greenplum-table } + +This is how Greenplum connector performs this: + +- Get names of columns in DataFrame. 
+ +- Perform `SELECT * FROM table LIMIT 0` query. +- For each column in query result get column name and Greenplum type. +- Match table columns with DataFrame columns (by name, case insensitive). + If some column is present only in target table, but not in DataFrame (like `DEFAULT` or `SERIAL` column), and vice versa, raise an exception. + See [Explicit type cast][DBR-onetl-connection-db-connection-greenplum-types-explicit-type-cast]. +- Find corresponding `Spark type` → `Greenplumtype (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `Greenplumtype (write)` matches `Greenplum type (read)`, no additional casts will be performed, DataFrame column will be written to Greenplum as is. +- If `Greenplumtype (write)` does not match `Greenplum type (read)`, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to `json` column which Greenplum connector currently does not support. + +### Create new table using Spark { #DBR-onetl-connection-db-connection-greenplum-types-create-new-table-using-spark } + +!!! warning + + ABSOLUTELY NOT RECOMMENDED! + +This is how Greenplum connector performs this: + +- Find corresponding `Spark type` → `Greenplum type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Greenplum, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +More details [can be found here](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/write_to_gpdb.html). + +But Greenplum connector supports only a limited number of types and almost no custom clauses (like `PARTITION BY`). +So instead of relying on Spark to create tables: + +???
note "See example" + + ```python + writer = DBWriter( + connection=greenplum, + target="public.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + writer.run(df) + ``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +??? note "See example" + + ```python + greenplum.execute( + """ + CREATE TABLE public.table ( + id int32, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (business_dt) + DISTRIBUTED BY id + """, + ) + + writer = DBWriter( + connection=greenplum, + target="public.table", + options=Greenplum.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +See Greenplum [CREATE TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_TABLE.html) documentation. + +## Supported types { #DBR-onetl-connection-db-connection-greenplum-types-supported-types } + +See: + +- [official connector documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/reference-datatype_mapping.html) +- [list of Greenplum types](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-data_types.html) + +### Numeric types { #DBR-onetl-connection-db-connection-greenplum-types-numeric-types } + +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | +|---------------------------------- |----------------------------------- |------------------------------- |------------------------- | +| `decimal`
`decimal(P=0..38)`
`decimal(P=0..38, S=0..38)` | `DecimalType(P=38, S=18)`
`DecimalType(P=0..38, S=0)`
`DecimalType(P=0..38, S=0..38)` | `decimal(P=38, S=18)`
`decimal(P=0..38, S=0)`
`decimal(P=0..38, S=0..38)` |
`decimal` (unbounded) | +| `decimal(P=39.., S=0..)` | unsupported [^2] | | | +| `real` | `FloatType()` | `real` | `real` | +| `double precision` | `DoubleType()` | `double precision` | `double precision` | +| `-` | `ByteType()` | unsupported | unsupported | +| `smallint` | `ShortType()` | `smallint` | `smallint` | +| `integer` | `IntegerType()` | `integer` | `integer` | +| `bigint` | `LongType()` | `bigint` | `bigint` | +| `money`
`int4range`
`int8range`
`numrange`
`int2vector` |


unsupported | | | + +[^2]: Greenplum support decimal types with unlimited precision. + + But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +### Temporal types { #DBR-onetl-connection-db-connection-greenplum-types-temporal-types } + +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | +|------------------------------------ |------------------------- |----------------------- |------------------------- | +| `date` | `DateType()` | `date` | `date` | +| `time`
`time(0..6)`
`time with time zone`
`time(0..6) with time zone` |

`TimestampType()`, time format quirks [^3] |

`timestamp` |

`timestamp` | +| `timestamp`
`timestamp(0..6)`
`timestamp with time zone`
`timestamp(0..6) with time zone` |

`TimestampType()` |

`timestamp` |

`timestamp` | +| `interval` or any precision
`daterange`
`tsrange`
`tstzrange` |

unsupported | | | + +!!! warning + + Note that types in Greenplum and Spark have different value ranges: + + + | Greenplum type | Min value | Max value | Spark type | Min value | Max value | + |----------------|---------------------------------|----------------------------------|---------------------|--------------------------------|--------------------------------| + | `date` | `-4713-01-01` | `5874897-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + | `timestamp`
`time` | `-4713-01-01 00:00:00.000000`
`00:00:00.000000` | `294276-12-31 23:59:59.999999`
`24:00:00.000000` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + + So not all of values can be read from Greenplum to Spark. + + References: + + * [Greenplum types documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-data_types.html) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^3]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from Postgres like `23:59:59` + it is actually read `1970-01-01 23:59:59`, and vice versa. + +### String types { #DBR-onetl-connection-db-connection-greenplum-types-string-types } + +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | +|----------------------------- |------------------ |----------------------- |------------------------- | +| `character`
`character(N)`
`character varying`
`character varying(N)`
`text`
`xml`
`CREATE TYPE ... AS ENUM` |



`StringType()` |



`text` |



`text` | +| `json`
`jsonb` |
unsupported | | | + +### Binary types { #DBR-onetl-connection-db-connection-greenplum-types-binary-types } + +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | + |-------------------------- |------------------- |----------------------- |------------------------- | +| `boolean` | `BooleanType()` | `boolean` | `boolean` | +| `bit`
`bit(N)`
`bit varying`
`bit varying(N)` |

unsupported | | | +| `bytea` | unsupported [^4] | | | +| `-` | `BinaryType()` | `bytea` | `bytea` | + +[^4]: Yes, that's weird. + +### Struct types { #DBR-onetl-connection-db-connection-greenplum-types-struct-types } + +| Greenplum type (read) | Spark type | Greenplumtype (write) | Greenplum type (create) | +|--------------------------------|------------------|-----------------------|-------------------------| +| `T[]` | unsupported | | | +| `-` | `ArrayType()` | unsupported | | +| `CREATE TYPE sometype (...)` | `StringType()` | `text` | `text` | +| `-` | `StructType()`
 `MapType()` | unsupported | | + +## Unsupported types { #DBR-onetl-connection-db-connection-greenplum-types-unsupported-types } + +Columns of these types cannot be read/written by Spark: + +- `cidr` +- `inet` +- `macaddr` +- `macaddr8` +- `circle` +- `box` +- `line` +- `lseg` +- `path` +- `point` +- `polygon` +- `tsvector` +- `tsquery` +- `uuid` + +There is a way to avoid this - just cast unsupported types to `text`. But the way this can be done is not straightforward. + +## Explicit type cast { #DBR-onetl-connection-db-connection-greenplum-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-greenplum-types-dbreader } + +Direct casting of Greenplum types is not supported by DBReader due to the connector’s implementation specifics. + + ```python + reader = DBReader( + connection=greenplum, + # will fail + columns=["CAST(unsupported_column AS text)"], + ) + ``` + +But there is a workaround - create a view with casting unsupported column to text (or any other supported type). +For example, you can use [to_json](https://www.postgresql.org/docs/current/functions-json.html) Postgres function to convert column of any type to string representation and then parse this column on Spark side using [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. + + ```python + from pyspark.sql.types import ArrayType, IntegerType + + from onetl.connection import Greenplum + from onetl.db import DBReader + from onetl.file.format import JSON + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE VIEW schema.view_with_json_column AS + SELECT + id, + supported_column, + to_json(array_column) array_column_as_json, + gp_segment_id -- ! important !
+ FROM + schema.table_with_unsupported_columns + """, + ) + + # create dataframe using this view + reader = DBReader( + connection=greenplum, + source="schema.view_with_json_column", + ) + df = reader.run() + + # Define the schema for the JSON data + json_scheme = ArrayType(IntegerType()) + + df = df.select( + df.id, + df.supported_column, + JSON().parse_column(df.array_column_as_json, json_scheme).alias("array_column"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-greenplum-types-dbwriter } + +To write data to a `text` or `json` column in a Greenplum table, use [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. + + ```python + from onetl.connection import Greenplum + from onetl.db import DBWriter + from onetl.file.format import JSON + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE TABLE schema.target_table ( + id int, + supported_column timestamp, + array_column_as_json jsonb, -- or text + ) + DISTRIBUTED BY id + """, + ) + + write_df = df.select( + df.id, + df.supported_column, + JSON().serialize_column(df.array_column).alias("array_column_json"), + ) + + writer = DBWriter( + connection=greenplum, + target="schema.target_table", + ) + writer.run(write_df) + ``` + +Then you can parse this column on Greenplum side: + + ```sql + SELECT + id, + supported_column, + -- access first item of an array + array_column_as_json->0 + FROM + schema.target_table + ``` diff --git a/mddocs/docs/connection/db_connection/greenplum/write.md b/mddocs/docs/connection/db_connection/greenplum/write.md new file mode 100644 index 000000000..70525ed5c --- /dev/null +++ b/mddocs/docs/connection/db_connection/greenplum/write.md @@ -0,0 +1,229 @@ +# Writing to Greenplum using `DBWriter` { #DBR-onetl-connection-db-connection-greenplum-write-writing-to-greenplum-using-dbwriter } + +For writing data to Greenplum, use [DBWriter][DBR-onetl-db-writer] with 
[GreenplumWriteOptions][onetl.connection.db_connection.greenplum.options.GreenplumWriteOptions]. + +!!! warning + + Please take into account [Greenplum types][DBR-onetl-connection-db-connection-greenplum-types-greenplum-spark-type-mapping]. + +!!! warning + + It is always recommended to create table explicitly using [Greenplum.execute][DBR-onetl-connection-db-connection-greenplum-execute-executing-statements-in-greenplum] + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different types than it is expected. + +## Examples { #DBR-onetl-connection-db-connection-greenplum-write-examples } + + ```python + from onetl.connection import Greenplum + from onetl.db import DBWriter + + greenplum = Greenplum(...) + + df = ... # data is here + + writer = DBWriter( + connection=greenplum, + target="schema.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), + ) + + writer.run(df) + ``` + +## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-write-interaction-schema } + +High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection-db-connection-greenplum-prerequisites]. You can find detailed interaction schema below. + +??? 
note "Spark <-> Greenplum interaction during DBWriter.run()" + + ```plantuml + @startuml + title Greenplum master <-> Spark driver + box "Spark" + participant "Spark driver" + participant "Spark executor1" + participant "Spark executor2" + participant "Spark executorN" + end box + + box "Greenplum" + participant "Greenplum master" + participant "Greenplum segment1" + participant "Greenplum segment2" + participant "Greenplum segmentN" + end box + + == Greenplum.check() == + + activate "Spark driver" + "Spark driver" -> "Greenplum master" ++ : CONNECT + "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table + "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS + + == DBWriter.run(df) == + + "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 + "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 + "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N + + note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV + "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 + note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master + "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 + note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. + + "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) 
USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 + "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 + + "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN + "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN + + "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 + "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 + note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor + deactivate "Greenplum segment1" + + "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 + "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 + deactivate "Greenplum segment2" + + "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN + "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN + deactivate "Greenplum segmentN" + + == Finished == + + "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 + deactivate "Greenplum master" + "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 + deactivate "Greenplum master" + "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN + deactivate "Greenplum master" + + "Spark executor1" --> "Spark driver" -- : DONE + "Spark executor2" --> "Spark driver" -- : DONE + "Spark executorN" --> "Spark driver" -- : DONE + + "Spark driver" --> "Greenplum master" : CLOSE CONNECTION + deactivate "Greenplum master" + deactivate "Spark driver" + @enduml + ``` + + ```mermaid + --- + title: Greenplum master <-> Spark driver + --- + + sequenceDiagram + box Spark + participant A as Spark driver + participant B 
as Spark executor1
+ participant C as Spark executor2
+ participant D as Spark executorN
+ end
+
+ box Greenplum
+ participant E as Greenplum master
+ participant F as Greenplum segment1
+ participant G as Greenplum segment2
+ participant H as Greenplum segmentN
+ end
+
+ note over A,H: == Greenplum.check() ==
+ A ->> E: CONNECT
+ activate E
+ activate A
+
+ A -->> E : CHECK IF TABLE EXISTS gp_table
+ E -->> A : TABLE EXISTS
+ A ->> E : SHOW SCHEMA FOR gp_table
+ E -->> A : (id bigint, col1 int, col2 text, ...)
+
+ note over A,H: == DBWriter.run(df) ==
+
+ A ->> B: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1
+ activate B
+ A ->> C: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2
+ activate C
+ A ->> D: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N
+ activate D
+
+ note right of A : This is done in parallel,
executors are independent
|
|
|
V + + B ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...)
USING address=executor1_host:executor1_port
INSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1
+ activate E
+ note right of E : Each white vertical line here is an open connection to master.
Usually, **N+1** connections are created from Spark to Greenplum master + E -->> F: SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 + activate F + + note right of F : No direct requests between Greenplum segments & Spark.
Data transfer is always initiated by Greenplum segments. + + C ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port
INSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 + activate E + E -->> G: SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 + activate G + + D ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...)
USING address=executorN_host:executorN_port
INSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N + activate E + E -->> H: SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN + activate H + + + F -xB: INITIALIZE CONNECTION TO Spark executor1
PUSH DATA TO Spark executor1 + deactivate F + note left of B : Circle is an open GPFDIST port,
listened by executor + + G -xC: INITIALIZE CONNECTION TO Spark executor2
PUSH DATA TO Spark executor2 + deactivate G + H -xD: INITIALIZE CONNECTION TO Spark executorN
PUSH DATA TO Spark executorN + deactivate H + + note over A,H: == Spark.stop() == + + B -->> E : DROP TABLE spark_executor1 + deactivate E + C -->> E : DROP TABLE spark_executor2 + deactivate E + D -->> E : DROP TABLE spark_executorN + deactivate E + + B -->> A: DONE + deactivate B + C -->> A: DONE + deactivate C + D -->> A: DONE + deactivate D + + + A -->> E: CLOSE CONNECTION + deactivate E + deactivate A + ``` + +## Options { #DBR-onetl-connection-db-connection-greenplum-write-options } + + + +::: onetl.connection.db_connection.greenplum.options.GreenplumWriteOptions + options: + show_root_heading: true + heading_level: 3 diff --git a/mddocs/docs/connection/db_connection/hive/connection.md b/mddocs/docs/connection/db_connection/hive/connection.md new file mode 100644 index 000000000..679caef8a --- /dev/null +++ b/mddocs/docs/connection/db_connection/hive/connection.md @@ -0,0 +1,19 @@ +# Hive Connection { #DBR-onetl-connection-db-connection-hive-connection-0 } + + + +::: onetl.connection.db_connection.hive.connection.Hive + options: + members: + - get_current + - check diff --git a/mddocs/docs/connection/db_connection/hive/execute.md b/mddocs/docs/connection/db_connection/hive/execute.md new file mode 100644 index 000000000..94dfa31bc --- /dev/null +++ b/mddocs/docs/connection/db_connection/hive/execute.md @@ -0,0 +1,58 @@ +# Executing statements in Hive { #DBR-onetl-connection-db-connection-hive-execute-executing-statements-in-hive } + +Use `Hive.execute(...)` to execute DDL and DML operations. + +## Syntax support { #DBR-onetl-connection-db-connection-hive-execute-syntax-support } + +This method supports **any** query syntax supported by Hive, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `LOAD DATA ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... 
SELECT ...`, and so on
+- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, and so on
+- ✅︎ `MSCK REPAIR TABLE ...`, and so on
+- ✅︎ other statements not mentioned here
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+!!! warning
+
+ Actually, query should be written using [SparkSQL](https://spark.apache.org/docs/latest/sql-ref-syntax.html#ddl-statements) syntax, not HiveQL.
+
+## Examples { #DBR-onetl-connection-db-connection-hive-execute-examples }
+
+ ```python
+ from onetl.connection import Hive
+
+ hive = Hive(...)
+
+ hive.execute("DROP TABLE schema.table")
+ hive.execute(
+ """
+ CREATE TABLE schema.table (
+ id NUMBER,
+ key VARCHAR,
+ value DOUBLE
+ )
+ PARTITIONED BY (business_date DATE)
+ STORED AS orc
+ """
+ )
+ ```
+
+### Details { #DBR-onetl-connection-db-connection-hive-execute-details }
+
+
+
+::: onetl.connection.db_connection.hive.connection.Hive.execute
+ options:
+ members:
+ - execute
diff --git a/mddocs/docs/connection/db_connection/hive/index.md b/mddocs/docs/connection/db_connection/hive/index.md
new file mode 100644
index 000000000..3a71c65d9
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/hive/index.md
@@ -0,0 +1,17 @@
+# Hive { #DBR-onetl-connection-db-connection-hive }
+
+## Connection { #DBR-onetl-connection-db-connection-hive-connection-1 }
+
+* [Prerequisites][DBR-onetl-connection-db-connection-hive-prerequisites]
+* [Hive Connection][DBR-onetl-connection-db-connection-hive-connection-0]
+
+## Operations { #DBR-onetl-connection-db-connection-hive-operations }
+
+* [Reading from Hive using `DBReader`][DBR-onetl-connection-db-connection-hive-read-reading-from-hive-using-dbreader]
+* [Reading from Hive using `Hive.sql`][DBR-onetl-connection-db-connection-hive-sql-reading-from-hive-using-hive-sql]
+* [Writing to Hive using `DBWriter`][DBR-onetl-connection-db-connection-hive-write-writing-to-hive-using-dbwriter]
+* [Executing statements in Hive][DBR-onetl-connection-db-connection-hive-execute-executing-statements-in-hive]
+
+## For developers { #DBR-onetl-connection-db-connection-hive-for-developers }
+
+* [Hive Slots][DBR-onetl-connection-db-connection-hive-slots]
diff --git a/mddocs/docs/connection/db_connection/hive/prerequisites.md b/mddocs/docs/connection/db_connection/hive/prerequisites.md
new file mode 100644
index 000000000..bd4ac2e4a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/hive/prerequisites.md
@@ -0,0 +1,124 @@
+# Prerequisites { #DBR-onetl-connection-db-connection-hive-prerequisites }
+
+!!! note
+
+ onETL's Hive connection is actually SparkSession with access to [Hive Thrift Metastore](https://docs.cloudera.com/cdw-runtime/1.5.0/hive-hms-overview/topics/hive-hms-introduction.html) and HDFS/S3.
+ All data motion is made using Spark. Hive Metastore is used only to store tables and partitions metadata.
+
+ This connector does **NOT** require Hive server. It also does **NOT** use Hive JDBC connector.
+
+## Version Compatibility { #DBR-onetl-connection-db-connection-hive-prerequisites-version-compatibility }
+
+- Hive Metastore version:
+ - Officially declared: 0.12 - 3.1.3 (may require to add proper .jar file explicitly)
+ - Actually tested: 1.2.100, 2.3.10, 3.1.3
+- Spark versions: 2.3.x - 3.5.x
+- Java versions: 8 - 20
+
+See [official documentation](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html).
+
+## Installing PySpark { #DBR-onetl-connection-db-connection-hive-prerequisites-installing-pyspark }
+
+To use Hive connector you should have PySpark installed (or injected to `sys.path`) BEFORE creating the connector instance.
+
+See [installation instruction][DBR-onetl-install-spark] for more details.
+
+## Connecting to Hive Metastore { #DBR-onetl-connection-db-connection-hive-prerequisites-connecting-to-hive-metastore }
+
+!!! note
+
+ If you're using a managed Hadoop cluster, skip this step, as all Spark configs should already be present on the host.
+ +Create `$SPARK_CONF_DIR/hive-site.xml` with Hive Metastore URL: + + ```xml + + + + + hive.metastore.uris + thrift://metastore.host.name:9083 + + + ``` + +Create `$SPARK_CONF_DIR/core-site.xml` with warehouse location ,e.g. HDFS IPC port of Hadoop namenode, or S3 bucket address & credentials: + +=== "HDFS" + + ```xml + + + + + fs.defaultFS + hdfs://myhadoopcluster:9820 + + + ``` + +=== "S3" + + ```xml + + + + + +::: onetl.connection.db_connection.hive.slots.HiveSlots + options: + members: + - normalize_cluster_name + - get_known_clusters + - get_current_cluster diff --git a/mddocs/docs/connection/db_connection/hive/sql.md b/mddocs/docs/connection/db_connection/hive/sql.md new file mode 100644 index 000000000..7ec1fad42 --- /dev/null +++ b/mddocs/docs/connection/db_connection/hive/sql.md @@ -0,0 +1,81 @@ +# Reading from Hive using `Hive.sql` { #DBR-onetl-connection-db-connection-hive-sql-reading-from-hive-using-hive-sql } + +`Hive.sql` allows passing custom SQL query, but does not support incremental strategies. + +## Syntax support { #DBR-onetl-connection-db-connection-hive-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +!!! warning + + Actually, query should be written using [SparkSQL](https://spark.apache.org/docs/latest/sql-ref-syntax.html#data-retrieval-statements) syntax, not HiveQL. + +## Examples { #DBR-onetl-connection-db-connection-hive-sql-examples } + + ```python + from onetl.connection import Hive + + hive = Hive(...) 
+ df = hive.sql(
+ """
+ SELECT
+ id,
+ key,
+ CAST(value AS text) value,
+ updated_at
+ FROM
+ some.mytable
+ WHERE
+ key = 'something'
+ """
+ )
+ ```
+
+## Recommendations { #DBR-onetl-connection-db-connection-hive-sql-recommendations }
+
+### Use column-based write formats { #DBR-onetl-connection-db-connection-hive-sql-use-column-based-write-formats }
+
+Prefer these write formats:
+
+- [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html)
+- [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html)
+- [Iceberg](https://iceberg.apache.org/spark-quickstart/)
+- [Hudi](https://hudi.apache.org/docs/quick-start-guide/)
+- [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake)
+
+For column-based write formats, each file contains separate sections where column data is stored. The file footer contains location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, to drastically speed up the query.
+
+Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV.
+
+### Select only required columns { #DBR-onetl-connection-db-connection-hive-sql-select-only-required-columns }
+
+Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`.
+This drastically reduces the amount of data read by Spark, **if column-based file formats are used**.
+
+### Use partition columns in `where` clause { #DBR-onetl-connection-db-connection-hive-sql-use-partition-columns-in-where-clause }
+
+Queries should include `WHERE` clause with filters on Hive partitioning columns.
+This allows Spark to read only small set of files (*partition pruning*) instead of scanning the entire table, so this drastically increases performance.
+
+Supported operators are: `=`, `>`, `<` and `BETWEEN`, and only against some **static** value.
+ +## Details { #DBR-onetl-connection-db-connection-hive-sql-details } + + + +::: onetl.connection.db_connection.hive.connection.Hive.sql + options: + members: + - sql diff --git a/mddocs/docs/connection/db_connection/hive/write.md b/mddocs/docs/connection/db_connection/hive/write.md new file mode 100644 index 000000000..e4d9639e9 --- /dev/null +++ b/mddocs/docs/connection/db_connection/hive/write.md @@ -0,0 +1,186 @@ +# Writing to Hive using `DBWriter` { #DBR-onetl-connection-db-connection-hive-write-writing-to-hive-using-dbwriter } + +For writing data to Hive, use [DBWriter][DBR-onetl-db-writer]. + +## Examples { #DBR-onetl-connection-db-connection-hive-write-examples } + +```python +from onetl.connection import Hive +from onetl.db import DBWriter + +hive = Hive(...) + +df = ... # data is here + +# Create dataframe with specific number of Spark partitions. +# Use the Hive partitioning columns to group the data. Create max 20 files per Hive partition. +# Also sort the data by column which most data is correlated with (e.g. user_id), reducing files size. + +num_files_per_partition = 20 +partition_columns = ["country", "business_date"] +sort_columns = ["user_id"] +write_df = df.repartition( + num_files_per_partition, + *partition_columns, + *sort_columns, +).sortWithinPartitions(*partition_columns, *sort_columns) + +writer = DBWriter( + connection=hive, + target="schema.table", + options=Hive.WriteOptions( + if_exists="append", + # Hive partitioning columns. 
+ partitionBy=partition_columns,
+ ),
+)
+
+writer.run(write_df)
+```
+
+## Recommendations { #DBR-onetl-connection-db-connection-hive-write-recommendations }
+
+### Use column-based write formats { #DBR-onetl-connection-db-connection-hive-write-use-column-based-write-formats }
+
+Prefer these write formats:
+
+- [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) (**default**)
+- [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html)
+- [Iceberg](https://iceberg.apache.org/spark-quickstart/)
+- [Hudi](https://hudi.apache.org/docs/quick-start-guide/)
+- [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake)
+
+!!! warning
+ When using `DBWriter`, the default spark data format configured in `spark.sql.sources.default` is ignored, as `Hive.WriteOptions(format=...)` default value is explicitly set to `orc`.
+
+For column-based write formats, each file contains separate sections where column data is stored. The file footer contains
+location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns,
+to drastically speed up the query.
+
+Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV.
+
+### Use partitioning { #DBR-onetl-connection-db-connection-hive-write-use-partitioning }
+
+#### How does it work { #DBR-onetl-connection-db-connection-hive-write-how-does-it-work }
+
+Hive supports splitting data to partitions, which are different directories in the filesystem with names like `some_col=value1/another_col=value2`.
+
+For example, dataframe with content like this:
+
+| country: string | business_date: date | user_id: int | bytes: long |
+| --------------- | ------------------- | ------------ | ----------- |
+| RU | 2024-01-01 | 1234 | 25325253525 |
+| RU | 2024-01-01 | 2345 | 23234535243 |
+| RU | 2024-01-02 | 1234 | 62346634564 |
+| US | 2024-01-01 | 5678 | 4252345354 |
+| US | 2024-01-02 | 5678 | 5474575745 |
+| US | 2024-01-03 | 5678 | 3464574567 |
+
+With `partitionBy=["country", "business_date"]` data will be stored as files in the following subfolders:
+
+- `/country=RU/business_date=2024-01-01/`
+- `/country=RU/business_date=2024-01-02/`
+- `/country=US/business_date=2024-01-01/`
+- `/country=US/business_date=2024-01-02/`
+- `/country=US/business_date=2024-01-03/`
+
+A separate subdirectory is created for each distinct combination of column values in the dataframe.
+
+Please do not confuse Spark dataframe partitions (a.k.a. batches of data handled by Spark executors, usually in parallel)
+and Hive partitioning (store data in different subdirectories).
+Number of Spark dataframe partitions is correlated with the number of files created in **each** Hive partition.
+For example, Spark dataframe with 10 partitions and 5 distinct values of Hive partition columns will be saved as 5 subfolders with 10 files each = 50 files in total.
+Without Hive partitioning, all the files are placed into one flat directory.
+
+#### But why? { #DBR-onetl-connection-db-connection-hive-write-but-why }
+
+Queries which have `WHERE` clause with filters on Hive partitioning columns, like `WHERE country = 'RU' AND business_date='2024-01-01'`, will
+read only files from these exact partitions, like `/country=RU/business_date=2024-01-01/`, and skip files from other partitions.
+
+This drastically increases performance and reduces the amount of memory used by Spark.
+Consider using Hive partitioning in all tables.
+
+#### Which columns should I use?
{ #DBR-onetl-connection-db-connection-hive-write-which-columns-should-i-use }
+
+Usually Hive partitioning columns are based on event date or location, like `country: string`, `business_date: date`, `run_date: date` and so on.
+
+**Partition columns should contain data with low cardinality.**
+Dates, small integers, strings with low number of possible values are OK.
+But timestamp, float, decimals, longs (like user id), strings with lots of unique values (like user name or email) should **NOT** be used as Hive partitioning columns.
+Unlike some other databases, range and hash-based partitions are not supported.
+
+Partition column should be a part of a dataframe. If you want to partition values by date component of `business_dt: timestamp` column,
+add a new column to dataframe like this: `df.withColumn("business_date", date(df.business_dt))`.
+
+### Use compression { #DBR-onetl-connection-db-connection-hive-write-use-compression }
+
+Using compression algorithms like `snappy`, `lz4` or `zstd` can reduce the size of files (up to 10x).
+
+### Prefer creating large files { #DBR-onetl-connection-db-connection-hive-write-prefer-creating-large-files }
+
+Storing millions of small files is not what HDFS and S3 are designed for. Minimal file size should be at least 10Mb, but usually it is like 128Mb+ or 256Mb+ (HDFS block size).
+**NEVER** create files with few Kbytes in size.
+
+Number of files can be different in different cases.
+On one hand, Spark Adaptive Query Execution (AQE) can merge small Spark dataframe partitions into one larger.
+On the other hand, dataframes with skewed data can produce a larger number of files than expected.
+
+To create a small amount of large files, you can reduce the number of Spark dataframe partitions.
+Use [df.repartition(N, columns...)](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.repartition.html) function,
+like this: `df.repartition(20, "col1", "col2")`.
+This creates a new Spark dataframe with partitions using `hash(df.col1 + df.col2) mod 20` expression, avoiding data skew.
+
+Note: larger dataframe partitions require more resources (CPU, RAM) on Spark executor. The exact number of partitions
+should be determined empirically, as it depends on the amount of data and available resources.
+
+### Sort data before writing { #DBR-onetl-connection-db-connection-hive-write-sort-data-before-writing }
+
+Dataframe with sorted content:
+
+| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long |
+| --------------- | ------------------- | ------------ | ----------------------- | ----------- |
+| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 |
+| RU | 2024-01-01 | 1234 | 2024-01-01T12:23:44.567 | 25325253525 |
+| RU | 2024-01-02 | 1234 | 2024-01-01T13:25:56.789 | 34335645635 |
+| US | 2024-01-01 | 2345 | 2024-01-01T10:00:00.000 | 12341 |
+| US | 2024-01-02 | 2345 | 2024-01-01T15:11:22.345 | 13435 |
+| US | 2024-01-03 | 2345 | 2024-01-01T20:22:33.567 | 14564 |
+
+Has a much better compression rate than unsorted one, e.g. 2x or even higher:
+
+| country: string | business_date: date | user_id: int | business_dt: timestamp | bytes: long |
+| --------------- | ------------------- | ------------ | ----------------------- | ----------- |
+| RU | 2024-01-01 | 1234 | 2024-01-01T11:22:33.456 | 25325253525 |
+| RU | 2024-01-01 | 6345 | 2024-12-01T23:03:44.567 | 25365 |
+| RU | 2024-01-02 | 5234 | 2024-07-01T06:10:56.789 | 45643456747 |
+| US | 2024-01-01 | 4582 | 2024-04-01T17:59:00.000 | 362546475 |
+| US | 2024-01-02 | 2345 | 2024-09-01T04:24:22.345 | 3235 |
+| US | 2024-01-03 | 3575 | 2024-03-01T21:37:33.567 | 346345764 |
+
+Choosing columns to sort data by really depends on the data.
If data is correlated with some specific +column, like in example above the amount of traffic is correlated with both `user_id` and `timestamp`, +use `df.sortWithinPartitions("user_id", "timestamp")` before writing the data. + +If `df.repartition(N, repartition_columns...)` is used in combination with `df.sortWithinPartitions(sort_columns...)`, +then `sort_columns` should start with `repartition_columns` or be equal to it. + +## Options { #DBR-onetl-connection-db-connection-hive-write-options } + + + +::: onetl.connection.db_connection.hive.options.HiveWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/iceberg/auth_basic.md b/mddocs/docs/connection/db_connection/iceberg/auth_basic.md new file mode 100644 index 000000000..22866af08 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/auth_basic.md @@ -0,0 +1,3 @@ +# Basic Authentication { #DBR-onetl-connection-db-connection-iceberg-auth-basic-basic-authentication } + +::: onetl.connection.db_connection.iceberg.catalog.auth.basic.IcebergRESTCatalogBasicAuth diff --git a/mddocs/docs/connection/db_connection/iceberg/auth_bearer.md b/mddocs/docs/connection/db_connection/iceberg/auth_bearer.md new file mode 100644 index 000000000..289eb396d --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/auth_bearer.md @@ -0,0 +1,3 @@ +# Bearer Token Authentication { #DBR-onetl-connection-db-connection-iceberg-auth-bearer-bearer-token-authentication } + +::: onetl.connection.db_connection.iceberg.catalog.auth.bearer.IcebergRESTCatalogBearerAuth diff --git a/mddocs/docs/connection/db_connection/iceberg/auth_oauth2_client_credentials.md b/mddocs/docs/connection/db_connection/iceberg/auth_oauth2_client_credentials.md new file mode 100644 index 000000000..30982366d --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/auth_oauth2_client_credentials.md @@ -0,0 +1,3 @@ +# OAuth2 Client Credentials Flow { 
#DBR-onetl-connection-db-connection-iceberg-auth-oauth2-client-credentials-oauth2-client-credentials-flow } + +::: onetl.connection.db_connection.iceberg.catalog.auth.oauth2_client_credentials.IcebergRESTCatalogOAuth2ClientCredentials diff --git a/mddocs/docs/connection/db_connection/iceberg/catalog_filesystem.md b/mddocs/docs/connection/db_connection/iceberg/catalog_filesystem.md new file mode 100644 index 000000000..da6e2b592 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/catalog_filesystem.md @@ -0,0 +1,3 @@ +# Filesystem Catalog { #DBR-onetl-connection-db-connection-iceberg-catalog-filesystem-filesystem-catalog } + +::: onetl.connection.db_connection.iceberg.catalog.filesystem.IcebergFilesystemCatalog diff --git a/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md b/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md new file mode 100644 index 000000000..c410962d0 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md @@ -0,0 +1,17 @@ +# REST Catalog { #DBR-onetl-connection-db-connection-iceberg-catalog-rest-rest-catalog } + +::: onetl.connection.db_connection.iceberg.catalog.rest.IcebergRESTCatalog + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} + + +## Authentication { #DBR-onetl-connection-db-connection-iceberg-catalog-rest-authentication } + + diff --git a/mddocs/docs/connection/db_connection/iceberg/connection.md b/mddocs/docs/connection/db_connection/iceberg/connection.md new file mode 100644 index 000000000..d3046a74f --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/connection.md @@ -0,0 +1,3 @@ +# Iceberg Connection { #DBR-onetl-connection-db-connection-iceberg-connection-0 } + +::: onetl.connection.db_connection.iceberg.connection.Iceberg diff --git a/mddocs/docs/connection/db_connection/iceberg/execute.md b/mddocs/docs/connection/db_connection/iceberg/execute.md new file 
mode 100644 index 000000000..4f7b8eaae --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/execute.md @@ -0,0 +1,44 @@ +# Executing statements in Iceberg { #DBR-onetl-connection-db-connection-iceberg-execute-executing-statements-in-iceberg } + +Use `Iceberg.execute(...)` to execute DDL and DML operations. + +!!! warning + + In DML/DDL queries **table names must include catalog prefix**. + +## Syntax support { #DBR-onetl-connection-db-connection-iceberg-execute-syntax-support } + +This method supports **any** query syntax supported by Iceberg (Spark +SQL), like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `MERGE INTO ...` +- ✅︎ `ALTER TABLE ... ADD COLUMN`, `ALTER TABLE ... DROP COLUMN` +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...` +- ✅︎ `REPLACE TABLE ...` +- ✅︎ other statements supported by Iceberg +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-iceberg-execute-examples } + +``` python +from onetl.connection import Iceberg + +iceberg = Iceberg(catalog_name="my_catalog", ...) 
+ +iceberg.execute("DROP TABLE my_catalog.my_schema.my_table") +iceberg.execute( + """ + CREATE TABLE my_catalog.my_schema.my_table ( + id BIGINT, + key STRING, + value DOUBLE + ) + USING iceberg + """, +) +``` + +### Details { #DBR-onetl-connection-db-connection-iceberg-execute-details } + +::: onetl.connection.db_connection.iceberg.connection.Iceberg.execute diff --git a/mddocs/docs/connection/db_connection/iceberg/index.md b/mddocs/docs/connection/db_connection/iceberg/index.md new file mode 100644 index 000000000..a9b3422aa --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/index.md @@ -0,0 +1,24 @@ +# Iceberg { #DBR-onetl-connection-db-connection-iceberg } + +## Connection { #DBR-onetl-connection-db-connection-iceberg-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-iceberg-prerequisites] +* [Iceberg Connection][DBR-onetl-connection-db-connection-iceberg-connection-0] + +## Warehouse { #DBR-onetl-connection-db-connection-iceberg-warehouse } + +* [Filesystem Warehouse][DBR-onetl-connection-db-connection-iceberg-warehouse-filesystem-filesystem-warehouse] +* [S3 Warehouse][DBR-onetl-connection-db-connection-iceberg-warehouse-s3-s3-warehouse] +* [Delegated Warehouse][DBR-onetl-connection-db-connection-iceberg-warehouse-delegated-delegated-warehouse] + +## Catalog { #DBR-onetl-connection-db-connection-iceberg-catalog } + +* [Filesystem Catalog][DBR-onetl-connection-db-connection-iceberg-catalog-filesystem-filesystem-catalog] +* [REST Catalog][DBR-onetl-connection-db-connection-iceberg-catalog-rest-rest-catalog] + +## Operations { #DBR-onetl-connection-db-connection-iceberg-operations } + +* [Reading from Iceberg using `DBReader`][DBR-onetl-connection-db-connection-iceberg-read-reading-from-iceberg-using-dbreader] +* [Reading from Iceberg using `Iceberg.sql`][DBR-onetl-connection-db-connection-iceberg-sql-reading-from-iceberg-using-iceberg-sql] +* [Writing to Iceberg using 
`DBWriter`][DBR-onetl-connection-db-connection-iceberg-write-writing-to-iceberg-using-dbwriter] +* [Executing statements in Iceberg][DBR-onetl-connection-db-connection-iceberg-execute-executing-statements-in-iceberg] diff --git a/mddocs/docs/connection/db_connection/iceberg/prerequisites.md b/mddocs/docs/connection/db_connection/iceberg/prerequisites.md new file mode 100644 index 000000000..0d11a323c --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/prerequisites.md @@ -0,0 +1,29 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-iceberg-prerequisites } + +!!! note + + onETL's Iceberg connection is actually a `SparkSession` configured to work with [Apache Iceberg](https://iceberg.apache.org/docs/latest/) tables. All data motion is made using Spark. Iceberg catalog (REST, Hadoop, etc.) is used only to store tables metadata, while data itself is stored in a warehouse location (HDFS, S3, or another supported filesystem). + +## Version Compatibility { #DBR-onetl-connection-db-connection-iceberg-prerequisites-version-compatibility } + +- Iceberg catalog: depends on chosen implementation (e.g. REST, Hadoop) +- Spark versions: 3.2.x -- 4.0.x +- Java versions: 8 -- 22 + +See [official documentation](https://iceberg.apache.org/docs/latest/spark-getting-started/) for details on catalog and warehouse configuration. + +## Installing PySpark { #DBR-onetl-connection-db-connection-iceberg-prerequisites-installing-pyspark } + +To use Iceberg connector you should have PySpark installed (or injected to `sys.path`) BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Popular Metastore Implementations { #DBR-onetl-connection-db-connection-iceberg-prerequisites-popular-metastore-implementations } + +Iceberg supports multiple catalog implementations. 
Here are some popular options: + +- [Apache Iceberg Hadoop Catalog](https://iceberg.apache.org/docs/latest/spark-configuration/) +- [Lakekeeper](https://docs.lakekeeper.io/getting-started/) +- [Polaris](https://polaris.apache.org/in-dev/unreleased/getting-started/) +- [Apache Gravitino](https://gravitino.apache.org/docs/) +- [Databricks Unity Catalog](https://docs.databricks.com/aws/en/external-access/iceberg/) diff --git a/mddocs/docs/connection/db_connection/iceberg/read.md b/mddocs/docs/connection/db_connection/iceberg/read.md new file mode 100644 index 000000000..7f84863c1 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/read.md @@ -0,0 +1,66 @@ +# Reading from Iceberg using `DBReader` { #DBR-onetl-connection-db-connection-iceberg-read-reading-from-iceberg-using-dbreader } + +[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but does not support custom queries, like `JOIN`. + +## Supported DBReader features { #DBR-onetl-connection-db-connection-iceberg-read-supported-dbreader-features } + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: + - ✅︎ [`snapshot-strategy`][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [`incremental-strategy`][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] + - ✅︎ [`snapshot-batch-strategy`][DBR-onetl-strategy-snapshot-batch-strategy] + - ✅︎ [`incremental-batch-strategy`][DBR-onetl-strategy-incremental-batch-strategy] +- ✅︎ `hint` +- ❌ `df_schema` +- ❌ `options` (only Spark config params are used) + +!!! warning + + `columns`, `where` and `hwm.expression` should be written using [SparkSQL](https://spark.apache.org/docs/latest/sql-ref-syntax.html#data-retrieval-statements) syntax. + +## Examples { #DBR-onetl-connection-db-connection-iceberg-read-examples } + +Snapshot strategy: + +``` python +from onetl.connection import Iceberg +from onetl.db import DBReader + +iceberg = Iceberg(catalog_name="my_catalog", ...) 
+ +reader = DBReader( + connection=iceberg, + source="my_schema.table", # catalog is already defined in connection + columns=["id", "key", "value", "updated_dt"], + where="key = 'something'", +) +df = reader.run() +``` + +Incremental strategy: + +``` python +from onetl.connection import Iceberg +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +iceberg = Iceberg(catalog_name="my_catalog", ...) + +reader = DBReader( + connection=iceberg, + source="my_schema.table", # catalog is already defined in connection + columns=["id", "key", "value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="iceberg_hwm", expression="updated_dt"), +) + +with IncrementalStrategy(): + df = reader.run() +``` + +## Recommendations { #DBR-onetl-connection-db-connection-iceberg-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-iceberg-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This drastically reduces the amount of data read by Spark. diff --git a/mddocs/docs/connection/db_connection/iceberg/sql.md b/mddocs/docs/connection/db_connection/iceberg/sql.md new file mode 100644 index 000000000..c2e522d88 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/sql.md @@ -0,0 +1,46 @@ +# Reading from Iceberg using `Iceberg.sql` { #DBR-onetl-connection-db-connection-iceberg-sql-reading-from-iceberg-using-iceberg-sql } + +`Iceberg.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! warning + + Unlike DBReader, in SQL queries **table names must include catalog prefix**. + +## Syntax support { #DBR-onetl-connection-db-connection-iceberg-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) 
SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-iceberg-sql-examples } + +``` python +from onetl.connection import Iceberg + +iceberg = Iceberg(catalog_name="my_catalog", ...) +df = iceberg.sql( + """ + SELECT + id, + key, + CAST(value AS string) value, + updated_at + FROM + my_catalog.my_schema.my_table + WHERE + key = 'something' + """, +) +``` + +## Recommendations { #DBR-onetl-connection-db-connection-iceberg-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-iceberg-sql-select-only-required-columns } + +Avoid `SELECT *`. List only required columns to minimize I/O and improve query performance. + +### Use filters { #DBR-onetl-connection-db-connection-iceberg-sql-use-filters } + +Include `WHERE` clauses on columns to allow Spark to prune unnecessary data, e.g. operators `=`, `>`, `<`, `BETWEEN`. diff --git a/mddocs/docs/connection/db_connection/iceberg/warehouse_delegated.md b/mddocs/docs/connection/db_connection/iceberg/warehouse_delegated.md new file mode 100644 index 000000000..e6c933b2c --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/warehouse_delegated.md @@ -0,0 +1,3 @@ +# Delegated Warehouse { #DBR-onetl-connection-db-connection-iceberg-warehouse-delegated-delegated-warehouse } + +::: onetl.connection.db_connection.iceberg.warehouse.delegated.IcebergDelegatedWarehouse diff --git a/mddocs/docs/connection/db_connection/iceberg/warehouse_filesystem.md b/mddocs/docs/connection/db_connection/iceberg/warehouse_filesystem.md new file mode 100644 index 000000000..2a4dafb02 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/warehouse_filesystem.md @@ -0,0 +1,3 @@ +# Filesystem Warehouse { #DBR-onetl-connection-db-connection-iceberg-warehouse-filesystem-filesystem-warehouse } + +::: onetl.connection.db_connection.iceberg.warehouse.filesystem.IcebergFilesystemWarehouse diff --git 
a/mddocs/docs/connection/db_connection/iceberg/warehouse_s3.md b/mddocs/docs/connection/db_connection/iceberg/warehouse_s3.md new file mode 100644 index 000000000..2df394b7c --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/warehouse_s3.md @@ -0,0 +1,3 @@ +# S3 Warehouse { #DBR-onetl-connection-db-connection-iceberg-warehouse-s3-s3-warehouse } + +::: onetl.connection.db_connection.iceberg.warehouse.s3.IcebergS3Warehouse diff --git a/mddocs/docs/connection/db_connection/iceberg/write.md b/mddocs/docs/connection/db_connection/iceberg/write.md new file mode 100644 index 000000000..ff3d55651 --- /dev/null +++ b/mddocs/docs/connection/db_connection/iceberg/write.md @@ -0,0 +1,28 @@ +# Writing to Iceberg using `DBWriter` { #DBR-onetl-connection-db-connection-iceberg-write-writing-to-iceberg-using-dbwriter } + +For writing data to Iceberg, use [`DBWriter `][DBR-onetl-db-writer]. + +## Examples { #DBR-onetl-connection-db-connection-iceberg-write-examples } + +``` python +from onetl.connection import Iceberg +from onetl.db import DBWriter + +iceberg = Iceberg(catalog_name="my_catalog", ...) + +df = ... 
# data is here + +writer = DBWriter( + connection=iceberg, + target="my_schema.my_table", # catalog name is already defined in connection + options=Iceberg.WriteOptions( + if_exists="append", + ), +) + +writer.run(df) +``` + +## Options { #DBR-onetl-connection-db-connection-iceberg-write-options } + +::: onetl.connection.db_connection.iceberg.options.IcebergWriteOptions diff --git a/mddocs/docs/connection/db_connection/index.md b/mddocs/docs/connection/db_connection/index.md new file mode 100644 index 000000000..bdb0cc8fe --- /dev/null +++ b/mddocs/docs/connection/db_connection/index.md @@ -0,0 +1,12 @@ +# DB Connections { #DBR-onetl-connection-db-connection-db-connections } + +* [Clickhouse][DBR-onetl-connection-db-connection-clickhouse] +* [Greenplum][DBR-onetl-connection-db-connection-greenplum] +* [Kafka][DBR-onetl-connection-db-connection-kafka] +* [Iceberg][DBR-onetl-connection-db-connection-iceberg] +* [Hive][DBR-onetl-connection-db-connection-hive] +* [MongoDB][DBR-onetl-connection-db-connection-mongodb] +* [MSSQL][DBR-onetl-connection-db-connection-mssql] +* [MySQL][DBR-onetl-connection-db-connection-mysql] +* [Oracle][DBR-onetl-connection-db-connection-oracle] +* [Postgres][DBR-onetl-connection-db-connection-postgres] diff --git a/mddocs/docs/connection/db_connection/kafka/auth.md b/mddocs/docs/connection/db_connection/kafka/auth.md new file mode 100644 index 000000000..ce2bd7d77 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/auth.md @@ -0,0 +1,19 @@ +# Kafka Auth { #DBR-onetl-connection-db-connection-kafka-auth } + + + +::: onetl.connection.db_connection.kafka.kafka_auth.KafkaAuth + options: + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/basic_auth.md b/mddocs/docs/connection/db_connection/kafka/basic_auth.md new file mode 100644 index 000000000..91dd4ecf0 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/basic_auth.md @@ -0,0 +1,23 @@ +# Kafka 
BasicAuth { #DBR-onetl-connection-db-connection-kafka-basic-auth-kafka-basicauth } + + + +::: onetl.connection.db_connection.kafka.kafka_basic_auth.KafkaBasicAuth + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/connection.md b/mddocs/docs/connection/db_connection/kafka/connection.md new file mode 100644 index 000000000..cacefa988 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/connection.md @@ -0,0 +1,18 @@ +# Kafka Connection { #DBR-onetl-connection-db-connection-kafka-connection-0 } + + + +::: onetl.connection.db_connection.kafka.connection.Kafka + options: + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/index.md b/mddocs/docs/connection/db_connection/kafka/index.md new file mode 100644 index 000000000..3d49079c0 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/index.md @@ -0,0 +1,29 @@ +# Kafka { #DBR-onetl-connection-db-connection-kafka } + +## Connection { #DBR-onetl-connection-db-connection-kafka-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-kafka-prerequisites] +* [Kafka Connection][DBR-onetl-connection-db-connection-kafka-connection-0] +* [Kafka Troubleshooting][DBR-onetl-connection-db-connection-kafka-troubleshooting] + +## Protocols { #DBR-onetl-connection-db-connection-kafka-protocols } + +* [Kafka PlaintextProtocol][DBR-onetl-connection-db-connection-kafka-plaintext-protocol-kafka-plaintextprotocol] +* [Kafka SSLProtocol][DBR-onetl-connection-db-connection-kafka-ssl-protocol-kafka-sslprotocol] + +## Auth methods { #DBR-onetl-connection-db-connection-kafka-auth-methods } + +* [Kafka BasicAuth][DBR-onetl-connection-db-connection-kafka-basic-auth-kafka-basicauth] +* [Kafka KerberosAuth][DBR-onetl-connection-db-connection-kafka-kerberos-auth-kafka-kerberosauth] +* [Kafka 
ScramAuth][DBR-onetl-connection-db-connection-kafka-scram-auth-kafka-scramauth] + +## Operations { #DBR-onetl-connection-db-connection-kafka-operations } + +* [Reading from Kafka][DBR-onetl-connection-db-connection-kafka-read-reading-from-kafka] +* [Writing to Kafka][DBR-onetl-connection-db-connection-kafka-write-writing-to-kafka] + +## For developers { #DBR-onetl-connection-db-connection-kafka-for-developers } + +* [Kafka Auth][DBR-onetl-connection-db-connection-kafka-auth] +* [Kafka Protocol][DBR-onetl-connection-db-connection-kafka-protocol] +* [Kafka Slots][DBR-onetl-connection-db-connection-kafka-slots] diff --git a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md new file mode 100644 index 000000000..7034fbc45 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md @@ -0,0 +1,23 @@ +# Kafka KerberosAuth { #DBR-onetl-connection-db-connection-kafka-kerberos-auth-kafka-kerberosauth } + + + +::: onetl.connection.db_connection.kafka.kafka_kerberos_auth.KafkaKerberosAuth + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md new file mode 100644 index 000000000..c93fa8c92 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md @@ -0,0 +1,23 @@ +# Kafka PlaintextProtocol { #DBR-onetl-connection-db-connection-kafka-plaintext-protocol-kafka-plaintextprotocol } + + + +::: onetl.connection.db_connection.kafka.kafka_plaintext_protocol.KafkaPlaintextProtocol + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/prerequisites.md 
b/mddocs/docs/connection/db_connection/kafka/prerequisites.md new file mode 100644 index 000000000..417c3ea89 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/prerequisites.md @@ -0,0 +1,65 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-kafka-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-kafka-prerequisites-version-compatibility } + +- Kafka server versions: + - Officially declared: 0.10 or higher + - Actually tested: 3.2.3, 3.9.0 (only Kafka 3.x supports message headers) +- Spark versions: 2.4.x - 3.5.x +- Java versions: 8 - 17 + +See [official documentation](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html). + +## Installing PySpark { #DBR-onetl-connection-db-connection-kafka-prerequisites-installing-pyspark } + +To use Kafka connector you should have PySpark installed (or injected to `sys.path`) BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to Kafka { #DBR-onetl-connection-db-connection-kafka-prerequisites-connecting-to-kafka } + +### Connection address { #DBR-onetl-connection-db-connection-kafka-prerequisites-connection-address } + +Kafka is a distributed service, and usually has a list of brokers you can connect to (unlike other connectors, where only one host+port can be set). +Please contact your Kafka administrator to get addresses of these brokers, as there are no defaults. + +Also Kafka has a feature called *advertised listeners* - a client connects to one broker, and receives a list of other brokers in the cluster. +So you don't have to pass all brokers to `addresses`, it can be some subset. Other broker addresses will be fetched directly from the cluster. + +### Connection protocol { #DBR-onetl-connection-db-connection-kafka-prerequisites-connection-protocol } + +Kafka can support different connection protocols. 
List of currently supported protocols: + +- [PLAINTEXT][onetl.connection.db_connection.kafka.kafka_plaintext_protocol.KafkaPlaintextProtocol] (not secure) +- [SSL][onetl.connection.db_connection.kafka.kafka_ssl_protocol.KafkaSSLProtocol] (secure, recommended) + +Note that a specific port can listen for only one of these protocols, so it is important to set the proper port number + protocol combination. + +### Authentication mechanism { #DBR-onetl-connection-db-connection-kafka-prerequisites-authentication-mechanism } + +Kafka can support different authentication mechanisms (also known as [SASL](https://en.wikipedia.org/wiki/Simple_Authentication_and_Security_Layer)). + +List of currently supported mechanisms: + +- [PLAIN][onetl.connection.db_connection.kafka.kafka_basic_auth.KafkaBasicAuth]. To avoid confusing this with the `PLAINTEXT` connection protocol, onETL uses the name `BasicAuth`. +- [GSSAPI][onetl.connection.db_connection.kafka.kafka_kerberos_auth.KafkaKerberosAuth]. To simplify naming, onETL uses the name `KerberosAuth`. +- [SCRAM-SHA-256 or SCRAM-SHA-512][onetl.connection.db_connection.kafka.kafka_scram_auth.KafkaScramAuth] (recommended). + +Different mechanisms use different types of credentials (login + password, keytab file, and so on). + +Note that connection protocol and auth mechanism are set in pairs: + +- If you see `SASL_PLAINTEXT` this means `PLAINTEXT` connection protocol + some auth mechanism. +- If you see `SASL_SSL` this means `SSL` connection protocol + some auth mechanism. +- If you see just `PLAINTEXT` or `SSL` (**no** `SASL`), this means that authentication is disabled (anonymous access). + +Please contact your Kafka administrator to get details about the enabled auth mechanism in a specific Kafka instance. + +### Required grants { #DBR-onetl-connection-db-connection-kafka-prerequisites-required-grants } + +Ask your Kafka administrator to set the following grants for a user, *if the Kafka instance uses ACL*: + +- `Describe` + `Read` for reading data from Kafka (Consumer). 
+- `Describe` + `Write` for writing data to Kafka (Producer). + +More details can be found in [documentation](https://kafka.apache.org/documentation/#operations_in_kafka). diff --git a/mddocs/docs/connection/db_connection/kafka/protocol.md b/mddocs/docs/connection/db_connection/kafka/protocol.md new file mode 100644 index 000000000..df7186094 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/protocol.md @@ -0,0 +1,19 @@ +# Kafka Protocol { #DBR-onetl-connection-db-connection-kafka-protocol } + + + +::: onetl.connection.db_connection.kafka.kafka_protocol.KafkaProtocol + options: + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/read.md b/mddocs/docs/connection/db_connection/kafka/read.md new file mode 100644 index 000000000..685433ff1 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/read.md @@ -0,0 +1,145 @@ +# Reading from Kafka { #DBR-onetl-connection-db-connection-kafka-read-reading-from-kafka } + +Data can be read from Kafka to Spark using [DBReader][DBR-onetl-db-reader]. +It also supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading. 
+ +## Supported DBReader features { #DBR-onetl-connection-db-connection-kafka-read-supported-dbreader-features } + +- ❌ `columns` (is not supported by Kafka) +- ❌ `where` (is not supported by Kafka) +- ✅︎ `hwm`, supported strategies: + - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] + - ❌ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] + - ❌ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] +- ❌ `hint` (is not supported by Kafka) +- ❌ `df_schema` (see note below) +- ✅︎ `options` (see [Kafka.ReadOptions][onetl.connection.db_connection.kafka.options.KafkaReadOptions]) + +## Dataframe schema { #DBR-onetl-connection-db-connection-kafka-read-dataframe-schema } + +Unlike other DB connections, Kafka does not have concept of columns. +All the topics messages have the same set of fields, see structure below: + +```text +root +|-- key: binary (nullable = true) +|-- value: binary (nullable = true) +|-- topic: string (nullable = false) +|-- partition: integer (nullable = false) +|-- offset: integer (nullable = false) +|-- timestamp: timestamp (nullable = false) +|-- timestampType: integer (nullable = false) +|-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) +``` + +`headers` field is present in the dataframe only if `Kafka.ReadOptions(include_headers=True)` is passed (compatibility with Kafka 1.x). + +## Value deserialization { #DBR-onetl-connection-db-connection-kafka-read-value-deserialization } + +To read `value` or `key` of other type than bytes (e.g. struct or integer), users have to deserialize values manually. 
+ +This could be done using following methods: + +- [Avro.parse_column][onetl.file.format.avro.Avro.parse_column] +- [JSON.parse_column][onetl.file.format.json.JSON.parse_column] +- [CSV.parse_column][onetl.file.format.csv.CSV.parse_column] +- [XML.parse_column][onetl.file.format.xml.XML.parse_column] + +## Examples { #DBR-onetl-connection-db-connection-kafka-read-examples } + +Snapshot strategy, `value` is Avro binary data: + +```python +from onetl.connection import Kafka +from onetl.db import DBReader, DBWriter +from onetl.file.format import Avro +from pyspark.sql.functions import decode + +# read all topic data from Kafka +kafka = Kafka(...) +reader = DBReader(connection=kafka, source="avro_topic") +read_df = reader.run() + +# parse Avro format to Spark struct +avro = Avro( + schema_dict={ + "type": "record", + "name": "Person", + "fields": [ + {"name": "name", "type": "string"}, + {"name": "age", "type": "int"}, + ], + } +) +deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + avro.parse_column("value"), +) +``` + +Incremental strategy, `value` is JSON string: + +!!! note + + Currently Kafka connector does support only HWMs based on `offset` field. Other fields, like `timestamp`, are not yet supported. + +```python +from onetl.connection import Kafka +from onetl.db import DBReader, DBWriter +from onetl.file.format import JSON +from pyspark.sql.functions import decode + +kafka = Kafka(...) 
+ +# read only new data from Kafka topic +reader = DBReader( + connection=kafka, + source="topic_name", + hwm=DBReader.AutoDetectHWM(name="kafka_hwm", expression="offset"), +) + +with IncrementalStrategy(): + read_df = reader.run() + +# parse JSON format to Spark struct +json = JSON() +schema = StructType( + [ + StructField("name", StringType(), nullable=True), + StructField("age", IntegerType(), nullable=True), + ], +) +deserialized_df = read_df.select( + # cast binary key to string + decode("key", "UTF-8").alias("key"), + json.parse_column("value", schema), +) +``` + +## Options { #DBR-onetl-connection-db-connection-kafka-read-options } + + + +::: onetl.connection.db_connection.kafka.options.KafkaReadOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/scram_auth.md b/mddocs/docs/connection/db_connection/kafka/scram_auth.md new file mode 100644 index 000000000..a53b3f035 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/scram_auth.md @@ -0,0 +1,24 @@ +# Kafka ScramAuth { #DBR-onetl-connection-db-connection-kafka-scram-auth-kafka-scramauth } + + + +::: onetl.connection.db_connection.kafka.kafka_scram_auth.KafkaScramAuth + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/slots.md b/mddocs/docs/connection/db_connection/kafka/slots.md new file mode 100644 index 000000000..874b9352a --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/slots.md @@ -0,0 +1,19 @@ +# Kafka Slots { #DBR-onetl-connection-db-connection-kafka-slots } + + + +::: onetl.connection.db_connection.kafka.slots.KafkaSlots + options: + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git 
a/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md new file mode 100644 index 000000000..a8a9a32ad --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md @@ -0,0 +1,23 @@ +# Kafka SSLProtocol { #DBR-onetl-connection-db-connection-kafka-ssl-protocol-kafka-sslprotocol } + + + +::: onetl.connection.db_connection.kafka.kafka_ssl_protocol.KafkaSSLProtocol + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/troubleshooting.md b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md new file mode 100644 index 000000000..c0c926290 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md @@ -0,0 +1,9 @@ +# Kafka Troubleshooting { #DBR-onetl-connection-db-connection-kafka-troubleshooting } + +!!! note + + [General guide][DBR-onetl-troubleshooting]. + +## Cannot connect using `SSL` protocol { #DBR-onetl-connection-db-connection-kafka-troubleshooting-cannot-connect-using-ssl-protocol } + +Please check that certificate files are not Base-64 encoded. diff --git a/mddocs/docs/connection/db_connection/kafka/write.md b/mddocs/docs/connection/db_connection/kafka/write.md new file mode 100644 index 000000000..03fa30c32 --- /dev/null +++ b/mddocs/docs/connection/db_connection/kafka/write.md @@ -0,0 +1,85 @@ +# Writing to Kafka { #DBR-onetl-connection-db-connection-kafka-write-writing-to-kafka } + +For writing data to Kafka, use [DBWriter][DBR-onetl-db-writer] with specific options (see below). + +## Dataframe schema { #DBR-onetl-connection-db-connection-kafka-write-dataframe-schema } + +Unlike other DB connections, Kafka does not have concept of columns. +All the topics messages have the same set of fields. 
Only some of them can be written: + +```text +root +|-- key: binary (nullable = true) +|-- value: binary (nullable = true) +|-- headers: struct (nullable = true) + |-- key: string (nullable = false) + |-- value: binary (nullable = true) +``` + +`headers` can be passed only with `Kafka.WriteOptions(include_headers=True)` (compatibility with Kafka 1.x). + +Field `topic` should not be present in the dataframe, as it is passed to `DBWriter(target=...)`. + +Other fields, like `partition`, `offset`, `timestamp` are set by Kafka, and cannot be passed explicitly. + +## Value serialization { #DBR-onetl-connection-db-connection-kafka-write-value-serialization } + +To write `value` or `key` of other type than bytes (e.g. struct or integer), users have to serialize values manually. + +This could be done using following methods: + +- [Avro.serialize_column][onetl.file.format.avro.Avro.serialize_column] +- [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] +- [CSV.serialize_column][onetl.file.format.csv.CSV.serialize_column] + +## Examples { #DBR-onetl-connection-db-connection-kafka-write-examples } + +Convert `value` to JSON string, and write to Kafka: + +```python +from onetl.connection import Kafka +from onetl.db import DBWriter +from onetl.file.format import JSON + +df = ... # original data is here + +# serialize struct data as JSON +json = JSON() +write_df = df.select( + df.key, + json.serialize_column(df.value), +) + +# write data to Kafka +kafka = Kafka(...) 
+ +writer = DBWriter( + connection=kafka, + target="topic_name", +) +writer.run(write_df) +``` + +## Options { #DBR-onetl-connection-db-connection-kafka-write-options } + + + +::: onetl.connection.db_connection.kafka.options.KafkaWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + extensions: + - griffe_sphinx + - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/mongodb/connection.md b/mddocs/docs/connection/db_connection/mongodb/connection.md new file mode 100644 index 000000000..ea3fa113f --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/connection.md @@ -0,0 +1,19 @@ +# MongoDB Connection { #DBR-onetl-connection-db-connection-mongodb-connection-0 } + + + +::: onetl.connection.db_connection.mongodb.connection.MongoDB + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/mongodb/index.md b/mddocs/docs/connection/db_connection/mongodb/index.md new file mode 100644 index 000000000..7b063884b --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/index.md @@ -0,0 +1,16 @@ +# MongoDB { #DBR-onetl-connection-db-connection-mongodb } + +## Connection { #DBR-onetl-connection-db-connection-mongodb-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-mongodb-prerequisites] +* [MongoDB Connection][DBR-onetl-connection-db-connection-mongodb-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-mongodb-operations } + +* [Reading from MongoDB using `DBReader`][DBR-onetl-connection-db-connection-mongodb-read-reading-from-mongodb-using-dbreader] +* [Reading from MongoDB using `MongoDB.pipeline`][DBR-onetl-connection-db-connection-mongodb-pipeline-reading-from-mongodb-using-mongodb-pipeline] +* [Writing to MongoDB using `DBWriter`][DBR-onetl-connection-db-connection-mongodb-write-writing-to-mongodb-using-dbwriter] + +## Troubleshooting { #DBR-onetl-connection-db-connection-mongodb-troubleshooting } + 
+* [MongoDB <-> Spark type mapping][DBR-onetl-connection-db-connection-mongodb-types-mongodb-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/mongodb/pipeline.md b/mddocs/docs/connection/db_connection/mongodb/pipeline.md new file mode 100644 index 000000000..dbe5286ed --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/pipeline.md @@ -0,0 +1,48 @@ +# Reading from MongoDB using `MongoDB.pipeline` { #DBR-onetl-connection-db-connection-mongodb-pipeline-reading-from-mongodb-using-mongodb-pipeline } + +[MongoDB.pipeline][onetl.connection.db_connection.mongodb.connection.MongoDB.pipeline] allows passing custom pipeline, but does not support incremental strategies. + +!!! warning + +    Please take into account [MongoDB types][DBR-onetl-connection-db-connection-mongodb-types-mongodb-spark-type-mapping] + +## Recommendations { #DBR-onetl-connection-db-connection-mongodb-pipeline-recommendations } + +### Pay attention to `pipeline` value { #DBR-onetl-connection-db-connection-mongodb-pipeline-pay-attention-to-pipeline-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `mongodb.pipeline(..., pipeline={"$match": {"column": {"$eq": "value"}}})` value. +This both reduces the amount of data sent from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in `pipeline` value.
+ +## References { #DBR-onetl-connection-db-connection-mongodb-pipeline-references } + + + +::: onetl.connection.db_connection.mongodb.connection.MongoDB.pipeline + options: + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.mongodb.options.MongoDBPipelineOptions + options: + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mongodb/prerequisites.md b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md new file mode 100644 index 000000000..84a151a12 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md @@ -0,0 +1,70 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-mongodb-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-mongodb-prerequisites-version-compatibility } + +- MongoDB server versions: + - Officially declared: 4.0 or higher + - Actually tested: 4.0.0, 8.0.4 +- Spark versions: 3.2.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://www.mongodb.com/docs/spark-connector/). + +## Installing PySpark { #DBR-onetl-connection-db-connection-mongodb-prerequisites-installing-pyspark } + +To use MongoDB connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to MongoDB { #DBR-onetl-connection-db-connection-mongodb-prerequisites-connecting-to-mongodb } + +### Connection host { #DBR-onetl-connection-db-connection-mongodb-prerequisites-connection-host } + +It is possible to connect to MongoDB host by using either DNS name of host or it's IP address. 
+ +It is also possible to connect to MongoDB shared cluster: + + ```python + mongo = MongoDB( + host="master.host.or.ip", + user="user", + password="*****", + database="target_database", + spark=spark, + extra={ + # read data from secondary cluster node, switch to primary if not available + "readPreference": "secondaryPreferred", + }, + ) + ``` + +Supported `readPreference` values are described in [official documentation](https://www.mongodb.com/docs/manual/core/read-preference/). + +### Connection port { #DBR-onetl-connection-db-connection-mongodb-prerequisites-connection-port } + +Connection is usually performed to port `27017`. Port may differ for different MongoDB instances. +Please ask your MongoDB administrator to provide required information. + +### Required grants { #DBR-onetl-connection-db-connection-mongodb-prerequisites-required-grants } + +Ask your MongoDB cluster administrator to set following grants for a user, used for creating a connection: + +=== "Read + Write" + + ```js + // allow writing data to specific database + db.grantRolesToUser("username", [{db: "somedb", role: "readWrite"}]) + ``` + +=== "Read only" + + ```js + // allow reading data from specific database + db.grantRolesToUser("username", [{db: "somedb", role: "read"}]) + ``` + +See: + +- [db.grantRolesToUser documentation](https://www.mongodb.com/docs/manual/reference/method/db.grantRolesToUser) +- [MongoDB builtin roles](https://www.mongodb.com/docs/manual/reference/built-in-roles) diff --git a/mddocs/docs/connection/db_connection/mongodb/read.md b/mddocs/docs/connection/db_connection/mongodb/read.md new file mode 100644 index 000000000..52c023357 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/read.md @@ -0,0 +1,143 @@ +# Reading from MongoDB using `DBReader` { #DBR-onetl-connection-db-connection-mongodb-read-reading-from-mongodb-using-dbreader } + +[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but 
does not support custom pipelines, e.g. aggregation. + +!!! warning + + Please take into account [MongoDB types][DBR-onetl-connection-db-connection-mongodb-types-mongodb-spark-type-mapping] + +## Supported DBReader features { #DBR-onetl-connection-db-connection-mongodb-read-supported-dbreader-features } + +- ❌ `columns` (for now, all document fields are read) +- ✅︎ `where` (passed to `{"$match": ...}` aggregation pipeline) +- ✅︎ `hwm`, supported strategies: + - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] + - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] + - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] + - Note that `expression` field of HWM can only be a field name, not a custom expression +- ✅︎ `hint` (see [official documentation](https://www.mongodb.com/docs/v5.0/reference/operator/meta/hint/)) +- ✅︎ `df_schema` (mandatory) +- ✅︎ `options` (see [MongoDB.ReadOptions][onetl.connection.db_connection.mongodb.options.MongoDBReadOptions]) + +## Examples { #DBR-onetl-connection-db-connection-mongodb-read-examples } + +Snapshot strategy: + + ```python + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) 
+ + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + options=MongoDBReadOptions(batchSize=10000), + ) + df = reader.run() + ``` + +Incremental strategy: + + ```python + from onetl.connection import MongoDB + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + # mandatory + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], + ), + ), + StructField("updated_dt", TimestampType()), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"), + options=MongoDBReadOptions(batchSize=10000), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-mongodb-read-recommendations } + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-mongodb-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where={"column": {"$eq": "value"}})` clause. +This both reduces the amount of data send from MongoDB to Spark, and may also improve performance of the query. +Especially if there are indexes for columns used in `where` clause. 
+ +## Read options { #DBR-onetl-connection-db-connection-mongodb-read-options } + + + +::: onetl.connection.db_connection.mongodb.options.MongoDBReadOptions + options: + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mongodb/types.md b/mddocs/docs/connection/db_connection/mongodb/types.md new file mode 100644 index 000000000..d9d4f05e1 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/types.md @@ -0,0 +1,209 @@ +# MongoDB <-> Spark type mapping { #DBR-onetl-connection-db-connection-mongodb-types-mongodb-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + +## Type detection & casting { #DBR-onetl-connection-db-connection-mongodb-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of fields with corresponding Spark types. All operations on a field are performed using field type. + +MongoDB is, by design, \_\_schemaless\_\_. So there are 2 ways how this can be handled: + +- User provides DataFrame schema explicitly: + +??? note "See example" + + ```python + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) 
+ + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] + ), + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) + ``` + +- Rely on MongoDB connector schema infer: + + ```python + df = mongodb.pipeline(collection="some_collection") + ``` + + In this case MongoDB connector read a sample of collection documents, and build DataFrame schema based on document fields and values. + +It is highly recommended to pass `df_schema` explicitly, to avoid type conversion issues. + +### References { #DBR-onetl-connection-db-connection-mongodb-types-references } + +Here you can find source code with type conversions: + +- [MongoDB -> Spark](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/InferSchema.java#L214-L260) +- [Spark -> MongoDB](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/RowToBsonDocumentConverter.java#L157-L260) + +## Supported types { #DBR-onetl-connection-db-connection-mongodb-types-supported-types } + +See [official documentation](https://www.mongodb.com/docs/manual/reference/bson-types/) + +### Numeric types { #DBR-onetl-connection-db-connection-mongodb-types-numeric-types } + +| MongoDB type (read) | Spark type | MongoDB type (write) | +|---------------------|-----------------------------|----------------------| +| `Decimal128` | `DecimalType(P=34, S=32)` | `Decimal128` | +| `-`
`Double` | `FloatType()`
`DoubleType()` | `Double` | +| `-`
`-`
`Int32` | `ByteType()`
`ShortType()`
`IntegerType()` | `Int32` | +| `Int64` | `LongType()` | `Int64` | + +### Temporal types { #DBR-onetl-connection-db-connection-mongodb-types-temporal-types } + +| MongoDB type (read) | Spark type | MongoDB type (write) | +|------------------------|-----------------------------------|-------------------------| +| `-` | `DateType()`, days | `Date`, milliseconds | +| `Date`, milliseconds | `TimestampType()`, microseconds | `Date`, milliseconds, **precision loss** [^1]| +| `Timestamp`, seconds | `TimestampType()`, microseconds | `Date`, milliseconds | +| `-`
`-` | `TimestampNTZType()`
`DayTimeIntervalType()` | unsupported | + +!!! warning + + Note that types in MongoDB and Spark have different value ranges: + + + | MongoDB type | Min value | Max value | Spark type | Min value | Max value | + |---------------|--------------------------------|--------------------------------|---------------------|--------------------------------|--------------------------------| + | `Date`
`Timestamp` | -290 million years
`1970-01-01 00:00:00` | 290 million years
`2106-02-07 09:28:16` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + + So not all values can be read from MongoDB to Spark, and can written from Spark DataFrame to MongoDB. + + References: + + * [MongoDB Date type documentation](https://www.mongodb.com/docs/manual/reference/bson-types/#date) + * [MongoDB Timestamp documentation](https://www.mongodb.com/docs/manual/reference/bson-types/#timestamps) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^1]: MongoDB `Date` type has precision up to milliseconds (`23:59:59.999`). + Inserting data with microsecond precision (`23:59:59.999999`) + will lead to **throwing away microseconds**. + +### String types { #DBR-onetl-connection-db-connection-mongodb-types-string-types } + +Note: fields of deprecated MongoDB type `Symbol` are excluded during read. + +| MongoDB type (read) | Spark type | MongoDB type (write) | +|---------------------|------------------|----------------------| +| `String`
`Code`
`RegExp` | `StringType()` | `String` | + +### Binary types { #DBR-onetl-connection-db-connection-mongodb-types-binary-types } + +| MongoDB type (read) | Spark type | MongoDB type (write) | +| ------------------- | --------------- | -------------------- | +| `Boolean` | `BooleanType()` | `Boolean` | +| `Binary` | `BinaryType()` | `Binary` | + +### Struct types { #DBR-onetl-connection-db-connection-mongodb-types-struct-types } + +| MongoDB type (read) | Spark type | MongoDB type (write) | +|---------------------|-----------------------|----------------------| +| `Array[T]` | `ArrayType(T)` | `Array[T]` | +| `Object[...]`
`-` | `StructType([...])`
`MapType(...)` | `Object[...]`
| + +### Special types { #DBR-onetl-connection-db-connection-mongodb-types-special-types } + +| MongoDB type (read) | Spark type | MongoDB type (write) | +|---------------------|---------------------------------------------------------|---------------------------------------| +| `ObjectId`
`MaxKey`
`MinKey` |

`StringType()` |

`String` | +| `Null`
`Undefined` | `NullType()` | `Null` | +| `DBRef` | `StructType([$ref: StringType(), $id: StringType()])` | `Object[$ref: String, $id: String]` | + +## Explicit type cast { #DBR-onetl-connection-db-connection-mongodb-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-mongodb-types-dbreader } + +Currently it is not possible to cast field types using `DBReader`. But this can be done using `MongoDB.pipeline`. + +### `MongoDB.pipeline` { #DBR-onetl-connection-db-connection-mongodb-types-mongodb-pipeline } + +You can use `$project` aggregation to cast field types: + + ```python + from pyspark.sql.types import IntegerType, StructField, StructType + + from onetl.connection import MongoDB + from onetl.db import DBReader + + mongodb = MongoDB(...) + + df = mongodb.pipeline( + collection="my_collection", + pipeline=[ + { + "$project": { + # convert unsupported_field to string + "unsupported_field_str": { + "$convert": { + "input": "$unsupported_field", + "to": "string", + }, + }, + # skip unsupported_field from result + "unsupported_field": 0, + } + } + ], + ) + + # cast field content to proper Spark type + df = df.select( + df.id, + df.supported_field, + # explicit cast + df.unsupported_field_str.cast("integer").alias("parsed_integer"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-mongodb-types-dbwriter } + +Convert dataframe field to string on Spark side, and then write it to MongoDB: + + ```python + df = df.select( + df.id, + df.unsupported_field.cast("string").alias("array_field_json"), + ) + + writer.run(df) + ``` diff --git a/mddocs/docs/connection/db_connection/mongodb/write.md b/mddocs/docs/connection/db_connection/mongodb/write.md new file mode 100644 index 000000000..6216ec6e8 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mongodb/write.md @@ -0,0 +1,52 @@ +# Writing to MongoDB using `DBWriter` { #DBR-onetl-connection-db-connection-mongodb-write-writing-to-mongodb-using-dbwriter } + +For writing data to 
MongoDB, use [DBWriter][DBR-onetl-db-writer]. + +!!! warning + + Please take into account [MongoDB types][DBR-onetl-connection-db-connection-mongodb-types-mongodb-spark-type-mapping] + +## Examples { #DBR-onetl-connection-db-connection-mongodb-write-examples } + + ```python + from onetl.connection import MongoDB + from onetl.db import DBWriter + + mongodb = MongoDB(...) + + df = ... # data is here + + writer = DBWriter( + connection=mongodb, + target="schema.table", + options=MongoDB.WriteOptions( + if_exists="append", + ), + ) + + writer.run(df) + ``` + +## Write options { #DBR-onetl-connection-db-connection-mongodb-write-options } + +Method above accepts [MongoDB.WriteOptions][onetl.connection.db_connection.mongodb.options.MongoDBWriteOptions] + + + +::: onetl.connection.db_connection.mongodb.options.MongoDBWriteOptions + options: + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mssql/connection.md b/mddocs/docs/connection/db_connection/mssql/connection.md new file mode 100644 index 000000000..9295fe918 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/connection.md @@ -0,0 +1,18 @@ +# MSSQL connection { #DBR-onetl-connection-db-connection-mssql-connection-0 } + + + +::: onetl.connection.db_connection.mssql.connection.MSSQL + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/mssql/execute.md b/mddocs/docs/connection/db_connection/mssql/execute.md new file mode 100644 index 000000000..858354c73 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/execute.md @@ -0,0 +1,124 @@ +# Executing statements in MSSQL { #DBR-onetl-connection-db-connection-mssql-execute-executing-statements-in-mssql } + +!!! warning + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. 
Use [DBReader][DBR-onetl-connection-db-connection-mssql-read-reading-from-mssql-using-dbreader] or [MSSQL.sql][DBR-onetl-connection-db-connection-mssql-sql-reading-from-mssql-using-mssql-sql] instead. + +## How to { #DBR-onetl-connection-db-connection-mssql-execute-how-to } + +There are 2 ways to execute some statement in MSSQL + +### Use `MSSQL.fetch` { #DBR-onetl-connection-db-connection-mssql-execute-use-mssql-fetch } + +Use this method to perform some `SELECT` query which returns **small number of rows**, like reading MSSQL config, or reading data from some reference table. Method returns Spark DataFrame. + +Method accepts [MSSQL.FetchOptions][onetl.connection.db_connection.mssql.options.MSSQLFetchOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +!!! warning + +    Please take into account [MSSQL types][DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping]. + +#### Syntax support in `MSSQL.fetch` { #DBR-onetl-connection-db-connection-mssql-execute-syntax-support-in-mssql-fetch } + +This method supports **any** query syntax supported by MSSQL, like: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ✅︎ `SELECT func(arg1, arg2)` - call function +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `MSSQL.fetch` { #DBR-onetl-connection-db-connection-mssql-execute-examples-for-mssql-fetch } + +    ```python +    from onetl.connection import MSSQL + +    mssql = MSSQL(...) + +    df = mssql.fetch( +        "SELECT value FROM some.reference_table WHERE key = 'some_constant'", +        options=MSSQL.FetchOptions(queryTimeout=10), +    ) +    mssql.close() +    value = df.collect()[0][0]  # get value from first row and first column +    ``` + +### Use `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-use-mssql-execute } + +Use this method to execute DDL and DML operations.
Each method call runs operation in a separate transaction, and then commits it. + +Method accepts [MSSQL.ExecuteOptions][onetl.connection.db_connection.mssql.options.MSSQLExecuteOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-syntax-support-in-mssql-execute } + +This method supports **any** query syntax supported by MSSQL, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...` +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... AS SELECT ...` +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `EXEC procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ `DECLARE ... BEGIN ... END` - execute T-SQL statement +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-examples-for-mssql-execute } + +    ```python +    from onetl.connection import MSSQL + +    mssql = MSSQL(...)
 + +    mssql.execute("DROP TABLE schema.table") +    mssql.execute( +        """ +        CREATE TABLE schema.table ( +            id bigint IDENTITY(1,1), +            key VARCHAR(4000), +            value NUMERIC +        ) +        """, +        options=MSSQL.ExecuteOptions(queryTimeout=10), +    ) +    ``` + +## Options { #DBR-onetl-connection-db-connection-mssql-execute-options } + + + +::: onetl.connection.db_connection.mssql.options.MSSQLFetchOptions +    options: +        inherited_members: true +        heading_level: 3 +        show_root_heading: true + +::: onetl.connection.db_connection.mssql.options.MSSQLExecuteOptions +    options: +        inherited_members: true +        heading_level: 3 +        show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mssql/index.md b/mddocs/docs/connection/db_connection/mssql/index.md new file mode 100644 index 000000000..4301ea84c --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/index.md @@ -0,0 +1,17 @@ +# MSSQL { #DBR-onetl-connection-db-connection-mssql } + +## Connection { #DBR-onetl-connection-db-connection-mssql-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-mssql-prerequisites] +* [MSSQL connection][DBR-onetl-connection-db-connection-mssql-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-mssql-operations } + +* [Reading from MSSQL using `DBReader`][DBR-onetl-connection-db-connection-mssql-read-reading-from-mssql-using-dbreader] +* [Reading from MSSQL using `MSSQL.sql`][DBR-onetl-connection-db-connection-mssql-sql-reading-from-mssql-using-mssql-sql] +* [Writing to MSSQL using `DBWriter`][DBR-onetl-connection-db-connection-mssql-write-writing-to-mssql-using-dbwriter] +* [Executing statements in MSSQL][DBR-onetl-connection-db-connection-mssql-execute-executing-statements-in-mssql] + +## Troubleshooting { #DBR-onetl-connection-db-connection-mssql-troubleshooting } + +* [MSSQL <-> Spark type mapping][DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/mssql/prerequisites.md
b/mddocs/docs/connection/db_connection/mssql/prerequisites.md new file mode 100644 index 000000000..f233950d2 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/prerequisites.md @@ -0,0 +1,76 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-mssql-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-mssql-prerequisites-version-compatibility } + +- SQL Server versions: + - Officially declared: 2016 - 2022 + - Actually tested: 2017, 2022 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://learn.microsoft.com/en-us/sql/connect/jdbc/system-requirements-for-the-jdbc-driver) +and [official compatibility matrix](https://learn.microsoft.com/en-us/sql/connect/jdbc/microsoft-jdbc-driver-for-sql-server-support-matrix). + +## Installing PySpark { #DBR-onetl-connection-db-connection-mssql-prerequisites-installing-pyspark } + +To use MSSQL connector you should have PySpark installed (or injected to `sys.path`) BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to MSSQL { #DBR-onetl-connection-db-connection-mssql-prerequisites-connecting-to-mssql } + +### Connection port { #DBR-onetl-connection-db-connection-mssql-prerequisites-connection-port } + +Connection is usually performed to port 1433. Port may differ for different MSSQL instances. +Please ask your MSSQL administrator to provide required information. + +For named MSSQL instances (`instanceName` option), [port number is optional](https://learn.microsoft.com/en-us/sql/connect/jdbc/building-the-connection-url?view=sql-server-ver16#named-and-multiple-sql-server-instances), and could be omitted. + +### Connection host { #DBR-onetl-connection-db-connection-mssql-prerequisites-connection-host } + +It is possible to connect to MSSQL by using either DNS name of host or it's IP address. 
+ +If you're using MSSQL cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Required grants { #DBR-onetl-connection-db-connection-mssql-prerequisites-required-grants } + +Ask your MSSQL cluster administrator to set following grants for a user, +used for creating a connection: + +=== "Read + Write (schema is owned by user)" + + ```sql + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema + GRANT ALTER ON username.mytable TO username; + ``` + +=== "Read + Write (schema is not owned by user)" + + ```sql + -- allow creating tables for user + GRANT CREATE TABLE TO username; + + -- allow managing tables in specific schema, and inserting data to tables + GRANT ALTER, SELECT, INSERT ON SCHEMA::someschema TO username; + ``` + +=== "Read only" + + ```sql + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO username; + ``` + +More details can be found in official documentation: + +- [GRANT ON DATABASE](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-database-permissions-transact-sql) +- [GRANT ON OBJECT](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-object-permissions-transact-sql) +- [GRANT ON SCHEMA](https://learn.microsoft.com/en-us/sql/t-sql/statements/grant-schema-permissions-transact-sql) diff --git a/mddocs/docs/connection/db_connection/mssql/read.md b/mddocs/docs/connection/db_connection/mssql/read.md new file mode 100644 index 000000000..f554efbf4 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/read.md @@ -0,0 +1,98 @@ +# Reading from MSSQL using `DBReader` { 
#DBR-onetl-connection-db-connection-mssql-read-reading-from-mssql-using-dbreader }
+
+[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but does not support custom queries, like `JOIN`.
+
+!!! warning
+
+    Please take into account [MSSQL types][DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping]
+
+## Supported DBReader features { #DBR-onetl-connection-db-connection-mssql-read-supported-dbreader-features }
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+    - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy]
+    - ✅︎ [Incremental strategy][DBR-onetl-strategy-incremental-strategy]
+    - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+    - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+- ❌ `hint` (MSSQL does support hints, but DBReader does not, at least for now)
+- ❌ `df_schema`
+- ✅︎ `options` (see [MSSQL.ReadOptions][onetl.connection.db_connection.mssql.options.MSSQLReadOptions])
+
+## Examples { #DBR-onetl-connection-db-connection-mssql-read-examples }
+
+Snapshot strategy:
+
+    ```python
+    from onetl.connection import MSSQL
+    from onetl.db import DBReader
+
+    mssql = MSSQL(...)
+
+    reader = DBReader(
+        connection=mssql,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS text) value", "updated_dt"],
+        where="key = 'something'",
+        options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10),
+    )
+    df = reader.run()
+
+
+    ```
+
+Incremental strategy:
+
+    ```python
+    from onetl.connection import MSSQL
+    from onetl.db import DBReader
+    from onetl.strategy import IncrementalStrategy
+
+    mssql = MSSQL(...)
+ + reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), + options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-mssql-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-mssql-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from MSSQL to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-mssql-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options { #DBR-onetl-connection-db-connection-mssql-read-options } + + + +::: onetl.connection.db_connection.mssql.options.MSSQLReadOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mssql/sql.md b/mddocs/docs/connection/db_connection/mssql/sql.md new file mode 100644 index 000000000..ac25073d7 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/sql.md @@ -0,0 +1,82 @@ +# Reading from MSSQL using `MSSQL.sql` { #DBR-onetl-connection-db-connection-mssql-sql-reading-from-mssql-using-mssql-sql } + +`MSSQL.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! 
warning + + Please take into account [MSSQL types][DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping] + +!!! warning + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, + they can change data in your database. + +## Syntax support { #DBR-onetl-connection-db-connection-mssql-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ❌ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-mssql-sql-examples } + + ```python + from onetl.connection import MSSQL + + mssql = MSSQL(...) + df = mssql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MSSQL.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), + ) + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-mssql-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-mssql-sql-select-only-required-columns } + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from MSSQL to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-mssql-sql-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from MSSQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+ +## Options { #DBR-onetl-connection-db-connection-mssql-sql-options } + + + +::: onetl.connection.db_connection.mssql.options.MSSQLSQLOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mssql/types.md b/mddocs/docs/connection/db_connection/mssql/types.md new file mode 100644 index 000000000..2182bd603 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/types.md @@ -0,0 +1,260 @@ +# MSSQL <-> Spark type mapping { #DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + +## Type detection & casting { #DBR-onetl-connection-db-connection-mssql-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from MSSQL { #DBR-onetl-connection-db-connection-mssql-types-reading-from-mssql } + +This is how MSSQL connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and MSSQL type. +- Find corresponding `MSSQL type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing MSSQL table { #DBR-onetl-connection-db-connection-mssql-types-writing-to-some-existing-mssql-table } + +This is how MSSQL connector performs this: + +- Get names of columns in DataFrame. [^1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). For each found column get MSSQL type. +- Find corresponding `Spark type` → `MSSQL type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. 
+- If `MSSQL type (write)` matches `MSSQL type (read)`, no additional casts will be performed, DataFrame column will be written to MSSQL as is.
+- If `MSSQL type (write)` does not match `MSSQL type (read)`, DataFrame column will be casted to target column type **on MSSQL side**.
+  For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision [^2].
+
+[^1]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column,
+    it will be populated by MSSQL.
+
+[^2]: This is true only if DataFrame column is a `StringType()`, because text value is parsed automatically to target column type.
+
+    But other types cannot be silently converted, like `int -> text`. This requires explicit casting, see [DBWriter][DBR-onetl-db-writer].
+
+### Create new table using Spark { #DBR-onetl-connection-db-connection-mssql-types-create-new-table-using-spark }
+
+!!! warning
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how MSSQL connector performs this:
+
+- Find corresponding `Spark type` → `MSSQL type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Generate DDL for creating table in MSSQL, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But in some cases this may lead to using wrong column type. For example, Spark creates column of type `timestamp`
+which corresponds to MSSQL's type `datetime` (precision up to milliseconds) instead of more precise `datetime2(6)` (precision up to microseconds).
+This may lead to accidental precision loss, or sometimes data cannot be written to created table at all.
+
+So instead of relying on Spark to create tables:
+
+??? 
note "See example" + + ```python + writer = DBWriter( + connection=mssql, + target="myschema.target_tbl", + options=MSSQL.WriteOptions( + if_exists="append", + ), + ) + writer.run(df) + ``` + +Always prefer creating tables with specific types **BEFORE WRITING DATA**: + +??? note "See example" + + ```python + mssql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value datetime2(6) -- specific type and precision + ) + """, + ) + + writer = DBWriter( + connection=mssql, + target="myschema.target_tbl", + options=MSSQL.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +### References { #DBR-onetl-connection-db-connection-mssql-types-references } + +Here you can find source code with type conversions: + +- [MSSQL -> JDBC](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerResultSetMetaData.java#L117-L170) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L135-L152) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L154-L163) +- [JDBC -> MSSQL](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/DataTypes.java#L625-L676) + +## Supported types { #DBR-onetl-connection-db-connection-mssql-types-supported-types } + +See [official documentation](https://learn.microsoft.com/en-us/sql/t-sql/data-types/data-types-transact-sql) + +### Numeric types { #DBR-onetl-connection-db-connection-mssql-types-numeric-types } + +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | +|-------------------------------|-----------------------------------|-------------------------------|-------------------------------| +| `decimal` | `DecimalType(P=18, S=0)` | `decimal(P=18, S=0)` | `decimal(P=18, S=0)` | +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | 
`decimal(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | +| `decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | +| `real` | `FloatType()` | `real` | `real` | +| `float` | `DoubleType()` | `float` | `float` | +| `smallint` | `ShortType()` | `smallint` | `smallint` | +| `tinyint`
`int` | `IntegerType()` | `int` | `int` | +| `bigint` | `LongType()` | `bigint` | `bigint` | + +### Temporal types { #DBR-onetl-connection-db-connection-mssql-types-temporal-types } + +!!! note + + MSSQL `timestamp` type is alias for `rowversion` (see [Special types][DBR-onetl-connection-db-connection-mssql-types-special-types]). It is not a temporal type! + +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | +|------------------------------------------|--------------------------------------|-----------------------------------|-------------------------------| +| `date` | `DateType()` | `date` | `date` | +| `smalldatetime`, minutes
`datetime`, milliseconds
`datetime2(0)`, seconds
`datetime2(3)`, milliseconds |

`TimestampType()`, microseconds |

`datetime2(6)`, microseconds |

`datetime`, milliseconds | +| `datetime2(6)`, microseconds | `TimestampType()`, microseconds | `datetime2(6)`, microseconds | `datetime`, milliseconds, **precision loss** [^3] | +| `datetime2(7)`, 100s of nanoseconds | `TimestampType()`, microseconds, **precision loss** [^4] | `datetime2(6)`, microseconds, **precision loss** [^4] | | +| `time(0)`, seconds
`time(3)`, milliseconds | `TimestampType()`, microseconds, with time format quirks [^5] | `datetime2(6)`, microseconds | `datetime`, milliseconds | +| `time(6)`, microseconds | `TimestampType()`, microseconds, with time format quirks [^5] | `datetime2(6)`, microseconds | `datetime`, milliseconds, **precision loss** [^3] | +| `time`, 100s of nanoseconds
`time(7)`, 100s of nanoseconds | `TimestampType()`, microseconds, **precision loss** [^4], with time format quirks [^5] | `datetime2(6)`, microseconds, **precision loss** [^3] | | +| `datetimeoffset` | `StringType()` | `nvarchar` | `nvarchar` | + +!!! warning + + Note that types in MSSQL and Spark have different value ranges: + + + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + |-------------------|--------------------------------|--------------------------------|---------------------|--------------------------------|--------------------------------| + | `smalldatetime`
`datetime`
`datetime2`
`time` | `1900-01-01 00:00:00`
`1753-01-01 00:00:00.000`
`0001-01-01 00:00:00.000000`
`00:00:00.0000000` | `2079-06-06 23:59:00`
`9999-12-31 23:59:59.997`
`9999-12-31 23:59:59.999999`
`23:59:59.9999999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + + So not all of values in Spark DataFrame can be written to MSSQL. + + References: + + * [MSSQL date & time types documentation](https://learn.microsoft.com/en-us/sql/t-sql/data-types/date-and-time-types) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^3]: MSSQL dialect for Spark generates DDL with type `datetime` which has precision up to milliseconds (`23:59:59.999`, 10{superscript}`-3` seconds). Inserting data with microsecond and higher precision (`23:59:59.999999` .. `23.59:59.9999999`, 10{superscript}`-6` .. 10{superscript}`-7` seconds) will lead to **throwing away microseconds**. + +[^4]: MSSQL support timestamp up to 100s of nanoseconds precision (`23:59:59.9999999999`, 10{superscript}`-7` seconds), but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`, 10{superscript}`-6` seconds). Last digit will be lost during read or write operations. + +[^5]: `time` type is the same as `datetime2` with date `1970-01-01`. So instead of reading data from MSSQL like `23:59:59.999999` it is actually read `1970-01-01 23:59:59.999999`, and vice versa. + +### String types { #DBR-onetl-connection-db-connection-mssql-types-string-types } + +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | +|-------------------|------------------|--------------------|---------------------| +| `char`
`char(N)`
`nchar`
`nchar(N)`
`varchar`
`varchar(N)`
`nvarchar`
`nvarchar(N)`
`mediumtext`
`text`
`ntext`
`xml` |



`StringType()` |



`nvarchar` |



`nvarchar` | + +### Binary types { #DBR-onetl-connection-db-connection-mssql-types-binary-types } + +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | +|--------------------|-------------------|--------------------|---------------------| +| `bit` | `BooleanType()` | `bit` | `bit` | +| `binary`
`binary(N)`
`varbinary`
`varbinary(N)`
`image` |

`BinaryType()` |

`varbinary` |

`varbinary` | + +### Special types { #DBR-onetl-connection-db-connection-mssql-types-special-types } + +| MSSQL type (read) | Spark type | MSSQL type (write) | MSSQL type (create) | +|---------------------------|------------------|--------------------|---------------------| +| `geography`
`geometry`
`hierarchyid`
`rowversion` |

`BinaryType()` |

`varbinary` |

`varbinary` | +| `sql_variant` | unsupported | | | +| `sysname`
`uniqueidentifier` | `StringType()` | `nvarchar` | `nvarchar` | + +## Explicit type cast { #DBR-onetl-connection-db-connection-mssql-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-mssql-types-dbreader } + +It is possible to explicitly cast column type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS text)` to convert data to string representation on MSSQL side, and so it will be read as Spark's `StringType()`: + + ```python + from onetl.connection import MSSQL + from onetl.db import DBReader + + mssql = MSSQL(...) + + DBReader( + connection=mssql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + ], + ) + df = reader.run() + + # cast column content to proper Spark type + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-mssql-types-dbwriter } + +Convert dataframe column to JSON using [to_json](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.to_json.html), and write it as `text` column in MSSQL: + + ```python + mssql.execute( + """ + CREATE TABLE schema.target_tbl ( + id bigint, + struct_column_json text -- any string type, actually + ) + """, + ) + + from pyspark.sql.functions import to_json + + df = df.select( + df.id, + to_json(df.struct_column).alias("struct_column_json"), + ) + + writer.run(df) + ``` + +Then you can parse this column on MSSQL side - for example, by creating a view: + + ```sql + SELECT + id, + JSON_VALUE(struct_column_json, "$.nested.field") AS nested_field + FROM target_tbl + ``` + +Or by using [computed column](https://learn.microsoft.com/en-us/sql/relational-databases/tables/specify-computed-columns-in-a-table): + + ```sql + CREATE TABLE schema.target_table ( + id bigint, + supported_column datetime2(6), + struct_column_json 
text, -- any string type, actually + -- computed column + nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) + -- or persisted column + -- nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) PERSISTED + ) + ``` + +By default, column value is calculated on every table read. +Column marked as `PERSISTED` is calculated during insert, but this require additional space. diff --git a/mddocs/docs/connection/db_connection/mssql/write.md b/mddocs/docs/connection/db_connection/mssql/write.md new file mode 100644 index 000000000..3fbb086b7 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mssql/write.md @@ -0,0 +1,58 @@ +# Writing to MSSQL using `DBWriter` { #DBR-onetl-connection-db-connection-mssql-write-writing-to-mssql-using-dbwriter } + +For writing data to MSSQL, use [DBWriter][DBR-onetl-db-writer]. + +!!! warning + + Please take into account [MSSQL types][DBR-onetl-connection-db-connection-mssql-types-mssql-spark-type-mapping] + +!!! warning + + It is always recommended to create table explicitly using [MSSQL.execute][DBR-onetl-connection-db-connection-mssql-execute-executing-statements-in-mssql] + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +## Examples { #DBR-onetl-connection-db-connection-mssql-write-examples } + + ```python + from onetl.connection import MSSQL + from onetl.db import DBWriter + + mssql = MSSQL(...) + + df = ... 
# data is here + + writer = DBWriter( + connection=mssql, + target="schema.table", + options=MSSQL.WriteOptions(if_exists="append"), + ) + + writer.run(df) + ``` + +## Options { #DBR-onetl-connection-db-connection-mssql-write-options } + +Method above accepts [MSSQL.WriteOptions][onetl.connection.db_connection.mssql.options.MSSQLWriteOptions] + + + +::: onetl.connection.db_connection.mssql.options.MSSQLWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mysql/connection.md b/mddocs/docs/connection/db_connection/mysql/connection.md new file mode 100644 index 000000000..12e3df0ba --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/connection.md @@ -0,0 +1,18 @@ +# MySQL connection { #DBR-onetl-connection-db-connection-mysql-connection-0 } + + + +::: onetl.connection.db_connection.mysql.connection.MySQL + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/mysql/execute.md b/mddocs/docs/connection/db_connection/mysql/execute.md new file mode 100644 index 000000000..7d53c4b53 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/execute.md @@ -0,0 +1,122 @@ +# Executing statements in MySQL { #DBR-onetl-connection-db-connection-mysql-execute-executing-statements-in-mysql } + +!!! warning + + Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame. + + Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-mysql-read-reading-from-mysql-using-dbreader] or [MySQL.sql][DBR-onetl-connection-db-connection-mysql-sql-reading-from-mysql-using-mysql-sql] instead. 
+
+## How to { #DBR-onetl-connection-db-connection-mysql-execute-how-to }
+
+There are 2 ways to execute some statement in MySQL
+
+### Use `MySQL.fetch` { #DBR-onetl-connection-db-connection-mysql-execute-use-mysql-fetch }
+
+Use this method to perform some `SELECT` query which returns **small number of rows**, like reading MySQL config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts [MySQL.FetchOptions][onetl.connection.db_connection.mysql.options.MySQLFetchOptions].
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+!!! warning
+
+    Please take into account [MySQL types][DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping].
+
+#### Syntax support for `MySQL.fetch` { #DBR-onetl-connection-db-connection-mysql-execute-syntax-support-for-mysql-fetch }
+
+This method supports **any** query syntax supported by MySQL, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SELECT func(arg1, arg2)` or `{?= call func(arg1, arg2)}` - special syntax for calling function
+- ✅︎ `SHOW ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples in `MySQL.fetch` { #DBR-onetl-connection-db-connection-mysql-execute-examples-in-mysql-fetch }
+
+    ```python
+    from onetl.connection import MySQL
+
+    mysql = MySQL(...)
+
+    df = mysql.fetch(
+        "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+        options=MySQL.FetchOptions(queryTimeout=10),
+    )
+    mysql.close()
+    value = df.collect()[0][0]  # get value from first row and first column
+    ```
+
+### Use `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-use-mysql-execute }
+
+Use this method to execute DDL and DML operations. Each method call runs operation in a separate transaction, and then commits it.
+
+Method accepts [MySQL.ExecuteOptions][onetl.connection.db_connection.mysql.options.MySQLExecuteOptions].
+ +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-syntax-support-in-mysql-execute } + +This method supports **any** query syntax supported by MySQL, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-examples-for-mysql-execute } + + ```python + from onetl.connection import MySQL + + mysql = MySQL(...) + + mysql.execute("DROP TABLE schema.table") + mysql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value float + ) + ENGINE = InnoDB + """, + options=MySQL.ExecuteOptions(queryTimeout=10), + ) + ``` + +## Options { #DBR-onetl-connection-db-connection-mysql-execute-options } + + + +::: onetl.connection.db_connection.mysql.options.MySQLFetchOptions + options: + members: true + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.mysql.options.MySQLExecuteOptions + options: + members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mysql/index.md b/mddocs/docs/connection/db_connection/mysql/index.md new file mode 100644 index 000000000..4707d6f0c --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/index.md @@ -0,0 +1,17 @@ +# MySQL { #DBR-onetl-connection-db-connection-mysql } + +## Connection { #DBR-onetl-connection-db-connection-mysql-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-mysql-prerequisites] +* [MySQL 
connection][DBR-onetl-connection-db-connection-mysql-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-mysql-operations } + +* [Reading from MySQL using `DBReader`][DBR-onetl-connection-db-connection-mysql-read-reading-from-mysql-using-dbreader] +* [Reading from MySQL using `MySQL.sql`][DBR-onetl-connection-db-connection-mysql-sql-reading-from-mysql-using-mysql-sql] +* [Writing to MySQL using `DBWriter`][DBR-onetl-connection-db-connection-mysql-write-writing-to-mysql-using-dbwriter] +* [Executing statements in MySQL][DBR-onetl-connection-db-connection-mysql-execute-executing-statements-in-mysql] + +## Troubleshooting { #DBR-onetl-connection-db-connection-mysql-troubleshooting } + +* [MySQL <-> Spark type mapping][DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/mysql/prerequisites.md b/mddocs/docs/connection/db_connection/mysql/prerequisites.md new file mode 100644 index 000000000..0f3fdf27b --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/prerequisites.md @@ -0,0 +1,57 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-mysql-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-mysql-prerequisites-version-compatibility } + +- MySQL server versions: + - Officially declared: 8.0 - 9.2 + - Actually tested: 5.7.13, 9.2.0 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://dev.mysql.com/doc/connector-j/en/connector-j-versions.html). + +## Installing PySpark { #DBR-onetl-connection-db-connection-mysql-prerequisites-installing-pyspark } + +To use MySQL connector you should have PySpark installed (or injected to `sys.path`) BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. 
+
+## Connecting to MySQL { #DBR-onetl-connection-db-connection-mysql-prerequisites-connecting-to-mysql }
+
+### Connection host { #DBR-onetl-connection-db-connection-mysql-prerequisites-connection-host }
+
+It is possible to connect to MySQL by using either DNS name of host or its IP address.
+
+If you're using MySQL cluster, it is currently possible to connect only to **one specific node**.
+Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported.
+
+### Connection port { #DBR-onetl-connection-db-connection-mysql-prerequisites-connection-port }
+
+Connection is usually performed to port 3306. Port may differ for different MySQL instances.
+Please ask your MySQL administrator to provide required information.
+
+### Required grants { #DBR-onetl-connection-db-connection-mysql-prerequisites-required-grants }
+
+Ask your MySQL cluster administrator to set following grants for a user, used for creating a connection:
+
+=== "Read + Write"
+
+    ```sql
+    -- allow creating tables in the target schema
+    GRANT CREATE ON myschema.* TO username@'192.168.1.%';
+
+    -- allow read & write access to specific table
+    GRANT SELECT, INSERT ON myschema.mytable TO username@'192.168.1.%';
+    ```
+
+=== "Read only"
+
+    ```sql
+    -- allow read access to specific table
+    GRANT SELECT ON myschema.mytable TO username@'192.168.1.%';
+    ```
+
+In example above `'192.168.1.%'` is a network subnet `192.168.1.0 - 192.168.1.255`
+where Spark driver and executors are running. To allow connecting user from any IP, use `'%'` (not secure!).
+
+More details can be found in [official documentation](https://dev.mysql.com/doc/refman/en/grant.html).
diff --git a/mddocs/docs/connection/db_connection/mysql/read.md b/mddocs/docs/connection/db_connection/mysql/read.md
new file mode 100644
index 000000000..cb8c2d84c
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/mysql/read.md
@@ -0,0 +1,96 @@
+# Reading from MySQL using `DBReader` { #DBR-onetl-connection-db-connection-mysql-read-reading-from-mysql-using-dbreader }
+
+[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but does not support custom queries, like `JOIN`.
+
+!!! warning
+
+    Please take into account [MySQL types][DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping]
+
+## Supported DBReader features { #DBR-onetl-connection-db-connection-mysql-read-supported-dbreader-features }
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+    - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy]
+    - ✅︎ [Incremental strategy][DBR-onetl-strategy-incremental-strategy]
+    - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+    - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+- ✅︎ `hint` (see [official documentation](https://dev.mysql.com/doc/refman/en/optimizer-hints.html))
+- ❌ `df_schema`
+- ✅︎ `options` (see [MySQL.ReadOptions][onetl.connection.db_connection.mysql.options.MySQLReadOptions])
+
+## Examples { #DBR-onetl-connection-db-connection-mysql-read-examples }
+
+Snapshot strategy:
+
+    ```python
+    from onetl.connection import MySQL
+    from onetl.db import DBReader
+
+    mysql = MySQL(...)
+
+    reader = DBReader(
+        connection=mysql,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS text) value", "updated_dt"],
+        where="key = 'something'",
+        hint="SKIP_SCAN(schema.table key_index)",
+        options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10),
+    )
+    df = reader.run()
+    ```
+
+Incremental strategy:
+
+    ```python
+    from onetl.connection import MySQL
+    from onetl.db import DBReader
+    from onetl.strategy import IncrementalStrategy
+
+    mysql = MySQL(...)
+
+    reader = DBReader(
+        connection=mysql,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS text) value", "updated_dt"],
+        where="key = 'something'",
+        hint="SKIP_SCAN(schema.table key_index)",
+        hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"),
+        options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10),
+    )
+
+    with IncrementalStrategy():
+        df = reader.run()
+    ```
+
+## Recommendations { #DBR-onetl-connection-db-connection-mysql-read-recommendations }
+
+### Select only required columns { #DBR-onetl-connection-db-connection-mysql-read-select-only-required-columns }
+
+Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from MySQL to Spark.
+
+### Pay attention to `where` value { #DBR-onetl-connection-db-connection-mysql-read-pay-attention-to-where-value }
+
+Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause.
+This both reduces the amount of data sent from MySQL to Spark, and may also improve performance of the query.
+Especially if there are indexes for columns used in `where` clause.
+ +## Options { #DBR-onetl-connection-db-connection-mysql-read-options } + + + +::: onetl.connection.db_connection.mysql.options.MySQLReadOptions + options: + members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mysql/sql.md b/mddocs/docs/connection/db_connection/mysql/sql.md new file mode 100644 index 000000000..24ce876a4 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/sql.md @@ -0,0 +1,82 @@ +# Reading from MySQL using `MySQL.sql` { #DBR-onetl-connection-db-connection-mysql-sql-reading-from-mysql-using-mysql-sql } + +`MySQL.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! warning + + Please take into account [MySQL types][DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping] + +!!! warning + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, they can change data in your database. + +## Syntax support { #DBR-onetl-connection-db-connection-mysql-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-mysql-sql-examples } + + ```python + from onetl.connection import MySQL + + mysql = MySQL(...) 
+ df = mysql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MySQL.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), + ) + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-mysql-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-mysql-sql-select-only-required-columns } + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from MySQL to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-mysql-sql-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from MySQL to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options { #DBR-onetl-connection-db-connection-mysql-sql-options } + + + +::: onetl.connection.db_connection.mysql.options.MySQLSQLOptions + options: + members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/mysql/types.md b/mddocs/docs/connection/db_connection/mysql/types.md new file mode 100644 index 000000000..69e227c04 --- /dev/null +++ b/mddocs/docs/connection/db_connection/mysql/types.md @@ -0,0 +1,265 @@ +# MySQL <-> Spark type mapping { #DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + +## Type detection & casting { #DBR-onetl-connection-db-connection-mysql-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. 
All operations on a column are performed using column type. + +### Reading from MySQL { #DBR-onetl-connection-db-connection-mysql-types-reading-from-mysql } + +This is how MySQL connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and MySQL type. +- Find corresponding `MySQL type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing MySQL table { #DBR-onetl-connection-db-connection-mysql-types-writing-to-some-existing-mysql-table } + +This is how MySQL connector performs this: + +- Get names of columns in DataFrame. [^1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive). For each found column get MySQL type. +- Find corresponding `Spark type` → `MySQL type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `MySQL type (write)` match `MySQL type (read)`, no additional casts will be performed, DataFrame column will be written to MySQL as is. +- If `MySQL type (write)` does not match `MySQL type (read)`, DataFrame column will be casted to target column type **on MySQL side**. For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision. + +[^1]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column, + it will be populated by MySQL. + +### Create new table using Spark { #DBR-onetl-connection-db-connection-mysql-types-create-new-table-using-spark } + +!!! warning + + ABSOLUTELY NOT RECOMMENDED! + +This is how MySQL connector performs this: + +- Find corresponding `Spark type` → `MySQL type (create)` combination (see below) for each DataFrame column. 
If no combination is found, raise exception.
+- Generate DDL for creating table in MySQL, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But in some cases this may lead to using a wrong column type. For example, Spark creates column of type `timestamp`
+which corresponds to MySQL type `timestamp(0)` (precision up to seconds)
+instead of more precise `timestamp(6)` (precision up to microseconds).
+This may lead to incidental precision loss, or sometimes data cannot be written to created table at all.
+
+So instead of relying on Spark to create tables:
+
+??? note "See example"
+
+    ```python
+    writer = DBWriter(
+        connection=mysql,
+        target="myschema.target_tbl",
+        options=MySQL.WriteOptions(
+            if_exists="append",
+            createTableOptions="ENGINE = InnoDB",
+        ),
+    )
+    writer.run(df)
+    ```
+
+Always prefer creating tables with specific types **BEFORE WRITING DATA**:
+
+??? note "See example"
+
+    ```python
+    mysql.execute(
+        """
+        CREATE TABLE schema.table (
+            id bigint,
+            key text,
+            value timestamp(6) -- specific type and precision
+        )
+        ENGINE = InnoDB
+        """,
+    )
+
+    writer = DBWriter(
+        connection=mysql,
+        target="myschema.target_tbl",
+        options=MySQL.WriteOptions(if_exists="append"),
+    )
+    writer.run(df)
+    ```
+
+### References { #DBR-onetl-connection-db-connection-mysql-types-references }
+
+Here you can find source code with type conversions:
+
+- [MySQL -> JDBC](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L44-L623)
+- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132)
+- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211)
+- [JDBC -> MySQL](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L625-L867)
+
+## Supported types {
#DBR-onetl-connection-db-connection-mysql-types-supported-types } + +See [official documentation](https://dev.mysql.com/doc/refman/en/data-types.html) + +### Numeric types { #DBR-onetl-connection-db-connection-mysql-types-numeric-types } + +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | +|-------------------------------|-----------------------------------|-------------------------------|-------------------------------| +| `decimal` | `DecimalType(P=10, S=0)` | `decimal(P=10, S=0)` | `decimal(P=10, S=0)` | +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | +| `decimal(P=0..38, S=0..30)` | `DecimalType(P=0..38, S=0..30)` | `decimal(P=0..38, S=0..30)` | `decimal(P=0..38, S=0..30)` | +| `decimal(P=39..65, S=...)` | unsupported [^2] | | | +| `float`
`double` | `DoubleType()` | `double` | `double` | +| `tinyint`
`smallint`
`mediumint`
`int` |

`IntegerType()` |

`int` |

`int` |
+| `bigint` | `LongType()` | `bigint` | `bigint` |
+
+[^2]: MySQL supports decimal types with precision `P` up to 65.
+
+    But Spark's `DecimalType(P, S)` supports maximum `P=38`. It is impossible to read, write or operate with values of larger precision,
+    this leads to an exception.
+
+### Temporal types { #DBR-onetl-connection-db-connection-mysql-types-temporal-types }
+
+| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) |
+|-----------------------------------|--------------------------------------|-----------------------------------|-------------------------------|
+| `year`
`date` | `DateType()` | `date` | `date` | +| `datetime`, seconds
`timestamp`, seconds
`datetime(0)`, seconds
`timestamp(0)`, seconds |

`TimestampType()`, microseconds |

`timestamp(6)`, microseconds |

`timestamp(0)`, seconds | +| `datetime(3)`, milliseconds
`timestamp(3)`, milliseconds
`datetime(6)`, microseconds
`timestamp(6)`, microseconds |

`TimestampType()`, microseconds |

`timestamp(6)`, microseconds |

`timestamp(0)`, seconds, **precision loss** [^3], | +| `time`, seconds
`time(0)`, seconds | `TimestampType()`, microseconds, with time format quirks [^4] | `timestamp(6)`, microseconds | `timestamp(0)`, seconds | +| `time(3)`, milliseconds
`time(6)`, microseconds | `TimestampType()`, microseconds, with time format quirks [^4] | `timestamp(6)`, microseconds | `timestamp(0)`, seconds, **precision loss** [^3], | + +!!! warning + + Note that types in MySQL and Spark have different value ranges: + + + | MySQL type | Min value | Max value | Spark type | Min value | Max value | + |---------------|--------------------------------|--------------------------------|---------------------|--------------------------------|--------------------------------| + | `year`
`date` | `1901`
`1000-01-01` | `2155`
`9999-12-31` | `DateType()` | `0001-01-01` | `9999-12-31` | + | `datetime`
`timestamp`
`time` | `1000-01-01 00:00:00.000000`
`1970-01-01 00:00:01.000000`
`-838:59:59.000000` | `9999-12-31 23:59:59.499999`
`9999-12-31 23:59:59.499999`
`838:59:59.000000` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + + So Spark can read all the values from MySQL, but not all of values in Spark DataFrame can be written to MySQL. + + References: + + * [MySQL year documentation](https://dev.mysql.com/doc/refman/en/year.html) + * [MySQL date, datetime & timestamp documentation](https://dev.mysql.com/doc/refman/en/datetime.html) + * [MySQL time documentation](https://dev.mysql.com/doc/refman/en/time.html) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^3]: MySQL dialect generates DDL with MySQL type `timestamp` which is alias for `timestamp(0)` with precision up to seconds (`23:59:59`). Inserting data with microseconds precision (`23:59:59.999999`) will lead to **throwing away microseconds**. + +[^4]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from MySQL like `23:59:59` it is actually read `1970-01-01 23:59:59`, and vice versa. + +### String types { #DBR-onetl-connection-db-connection-mysql-types-string-types } + +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | +|-------------------------------|------------------|--------------------|---------------------| +| `char`
`char(N)`
`varchar(N)`
`mediumtext`
`text`
`longtext`
`json`
`enum("val1", "val2", ...)`
`set("val1", "val2", ...)` |



`StringType()` |



`longtext` |



`longtext` | + +### Binary types { #DBR-onetl-connection-db-connection-mysql-types-binary-types } + +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | +|-------------------|------------------|--------------------|---------------------| +| `binary`
`binary(N)`
`varbinary(N)`
`mediumblob`
`blob`
`longblob` |


`BinaryType()` |


`blob` |


`blob` | + +### Geometry types { #DBR-onetl-connection-db-connection-mysql-types-geometry-types } + +| MySQL type (read) | Spark type | MySQL type (write) | MySQL type (create) | +|------------------------|------------------|--------------------|---------------------| +| `point`
`linestring`
`polygon`
`geometry`
`multipoint`
`multilinestring`
`multipolygon`
`geometrycollection` |



`BinaryType()` |



`blob` |



`blob` | + +## Explicit type cast { #DBR-onetl-connection-db-connection-mysql-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-mysql-types-dbreader } + +It is possible to explicitly cast column type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS text)` to convert data to string representation on MySQL side, and so it will be read as Spark's `StringType()`. + +It is also possible to use [JSON_OBJECT](https://dev.mysql.com/doc/refman/en/json.html) MySQL function and parse JSON columns in MySQL with the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. + + ```python + from pyspark.sql.types import IntegerType, StructType, StructField + + from onetl.connection import MySQL + from onetl.db import DBReader + from onetl.file.format import JSON + + mysql = MySQL(...) + + DBReader( + connection=mysql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "JSON_OBJECT('key', value_column) json_column", + ], + ) + df = reader.run() + + json_scheme = StructType([StructField("key", IntegerType())]) + + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("json_column", json_scheme).alias("struct_column"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-mysql-types-dbwriter } + +To write JSON data to a `json` or `text` column in a MySQL table, use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. 
+
+    ```python
+    from onetl.connection import MySQL
+    from onetl.db import DBWriter
+    from onetl.file.format import JSON
+
+    mysql.execute(
+        """
+        CREATE TABLE schema.target_tbl (
+            id bigint,
+            array_column_json json -- any string type, actually
+        )
+        ENGINE = InnoDB
+        """,
+    )
+
+    df = df.select(
+        df.id,
+        JSON().serialize_column(df.array_column).alias("array_column_json"),
+    )
+
+    writer.run(df)
+    ```
+
+Then you can parse this column on MySQL side - for example, by creating a view:
+
+    ```sql
+    SELECT
+        id,
+        array_column_json->"$[0]" AS array_item
+    FROM target_tbl
+    ```
+
+Or by using [GENERATED column](https://dev.mysql.com/doc/refman/en/create-table-generated-columns.html):
+
+    ```sql
+    CREATE TABLE schema.target_table (
+        id bigint,
+        supported_column timestamp,
+        array_column_json json, -- any string type, actually
+        -- virtual column
+        array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]") VIRTUAL
+        -- or stored column
+        -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]") STORED
+    )
+    ```
+
+`VIRTUAL` column value is calculated on every table read.
+`STORED` column value is calculated during insert, but this requires additional space.
diff --git a/mddocs/docs/connection/db_connection/mysql/write.md b/mddocs/docs/connection/db_connection/mysql/write.md
new file mode 100644
index 000000000..bdcf70a79
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/mysql/write.md
@@ -0,0 +1,60 @@
+# Writing to MySQL using `DBWriter` { #DBR-onetl-connection-db-connection-mysql-write-writing-to-mysql-using-dbwriter }
+
+For writing data to MySQL, use [DBWriter][DBR-onetl-db-writer].
+
+!!! warning
+
+    Please take into account [MySQL types][DBR-onetl-connection-db-connection-mysql-types-mysql-spark-type-mapping]
+
+!!! warning
+
+    It is always recommended to create table explicitly using [MySQL.execute][DBR-onetl-connection-db-connection-mysql-execute-executing-statements-in-mysql] instead of relying on Spark's table DDL generation.
+
+    This is because Spark's DDL generator can create columns with different precision and types than expected, causing precision loss or other issues.
+
+## Examples { #DBR-onetl-connection-db-connection-mysql-write-examples }
+
+    ```python
+    from onetl.connection import MySQL
+    from onetl.db import DBWriter
+
+    mysql = MySQL(...)
+
+    df = ... # data is here
+
+    writer = DBWriter(
+        connection=mysql,
+        target="schema.table",
+        options=MySQL.WriteOptions(
+            if_exists="append",
+            # ENGINE is required by MySQL
+            createTableOptions="ENGINE = InnoDB",
+        ),
+    )
+
+    writer.run(df)
+    ```
+
+## Options { #DBR-onetl-connection-db-connection-mysql-write-options }
+
+Method above accepts [MySQL.WriteOptions][onetl.connection.db_connection.mysql.options.MySQLWriteOptions]
+
+
+
+::: onetl.connection.db_connection.mysql.options.MySQLWriteOptions
+    options:
+        members: true
+        heading_level: 3
+        show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/oracle/connection.md b/mddocs/docs/connection/db_connection/oracle/connection.md
new file mode 100644
index 000000000..76b7db91a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/oracle/connection.md
@@ -0,0 +1,18 @@
+# Oracle connection { #DBR-onetl-connection-db-connection-oracle-connection-0 }
+
+
+
+::: onetl.connection.db_connection.oracle.connection.Oracle
+    options:
+        members:
+            - get_packages
+            - check
diff --git a/mddocs/docs/connection/db_connection/oracle/execute.md b/mddocs/docs/connection/db_connection/oracle/execute.md
new file mode 100644
index 000000000..f77393d5a
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/oracle/execute.md
@@ -0,0 +1,123 @@
+# Executing statements in Oracle { #DBR-onetl-connection-db-connection-oracle-execute-executing-statements-in-oracle }
+
+!!! warning
+
+    Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame.
+
+    Do **NOT** use them to read large amounts of data.
Use [DBReader][DBR-onetl-connection-db-connection-oracle-read-reading-from-oracle-using-dbreader] or [Oracle.sql][DBR-onetl-connection-db-connection-oracle-sql-reading-from-oracle-using-oracle-sql] instead.
+
+## How to { #DBR-onetl-connection-db-connection-oracle-execute-how-to }
+
+There are 2 ways to execute some statement in Oracle
+
+### Use `Oracle.fetch` { #DBR-onetl-connection-db-connection-oracle-execute-use-oracle-fetch }
+
+Use this method to execute some `SELECT` query which returns **small number of rows**, like reading
+Oracle config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts [Oracle.FetchOptions][onetl.connection.db_connection.oracle.options.OracleFetchOptions].
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+!!! warning
+
+    Please take into account [Oracle types][DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping].
+
+#### Syntax support in `Oracle.fetch` { #DBR-onetl-connection-db-connection-oracle-execute-syntax-support-in-oracle-fetch }
+
+This method supports **any** query syntax supported by Oracle, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ✅︎ `SELECT func(arg1, arg2) FROM DUAL` - call function
+- ✅︎ `SHOW ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Oracle.fetch` { #DBR-onetl-connection-db-connection-oracle-execute-examples-for-oracle-fetch }
+
+    ```python
+    from onetl.connection import Oracle
+
+    oracle = Oracle(...)
+
+    df = oracle.fetch(
+        "SELECT value FROM some.reference_table WHERE key = 'some_constant'",
+        options=Oracle.FetchOptions(queryTimeout=10),
+    )
+    oracle.close()
+    value = df.collect()[0][0] # get value from first row and first column
+    ```
+
+### Use `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-use-oracle-execute }
+
+Use this method to execute DDL and DML operations.
Each method call runs operation in a separated transaction, and then commits it. + +Method accepts [Oracle.ExecuteOptions][onetl.connection.db_connection.oracle.options.OracleExecuteOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-syntax-support-in-oracle-execute } + +This method supports **any** query syntax supported by Oracle, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...` +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` or `{call procedure(arg1, arg2)}` - special syntax for calling procedure +- ✅︎ `DECLARE ... BEGIN ... END` - execute PL/SQL statement +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-examples-for-oracle-execute } + + ```python + from onetl.connection import Oracle + + oracle = Oracle(...) 
+ + oracle.execute("DROP TABLE schema.table") + oracle.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=Oracle.ExecuteOptions(queryTimeout=10), + ) + ``` + +## Options { #DBR-onetl-connection-db-connection-oracle-execute-options } + + + +::: onetl.connection.db_connection.oracle.options.OracleFetchOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.oracle.options.OracleExecuteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/oracle/index.md b/mddocs/docs/connection/db_connection/oracle/index.md new file mode 100644 index 000000000..19d0469b3 --- /dev/null +++ b/mddocs/docs/connection/db_connection/oracle/index.md @@ -0,0 +1,17 @@ +# Oracle { #DBR-onetl-connection-db-connection-oracle } + +## Connection { #DBR-onetl-connection-db-connection-oracle-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-oracle-prerequisites] +* [Oracle connection][DBR-onetl-connection-db-connection-oracle-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-oracle-operations } + +* [Reading from Oracle using `DBReader`][DBR-onetl-connection-db-connection-oracle-read-reading-from-oracle-using-dbreader] +* [Reading from Oracle using `Oracle.sql`][DBR-onetl-connection-db-connection-oracle-sql-reading-from-oracle-using-oracle-sql] +* [Writing to Oracle using `DBWriter`][DBR-onetl-connection-db-connection-oracle-write-writing-to-oracle-using-dbwriter] +* [Executing statements in Oracle][DBR-onetl-connection-db-connection-oracle-execute-executing-statements-in-oracle] + +## Troubleshooting { #DBR-onetl-connection-db-connection-oracle-troubleshooting } + +* [Oracle <-> Spark type mapping][DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping] diff --git 
a/mddocs/docs/connection/db_connection/oracle/prerequisites.md b/mddocs/docs/connection/db_connection/oracle/prerequisites.md new file mode 100644 index 000000000..3b634fb31 --- /dev/null +++ b/mddocs/docs/connection/db_connection/oracle/prerequisites.md @@ -0,0 +1,109 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-oracle-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-oracle-prerequisites-version-compatibility } + +- Oracle Server versions: + - Officially declared: 19c, 21c, 23ai + - Actually tested: 11.2, 23.5 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://www.oracle.com/cis/database/technologies/appdev/jdbc-downloads.html). + +## Installing PySpark { #DBR-onetl-connection-db-connection-oracle-prerequisites-installing-pyspark } + +To use Oracle connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to Oracle { #DBR-onetl-connection-db-connection-oracle-prerequisites-connecting-to-oracle } + +### Connection port { #DBR-onetl-connection-db-connection-oracle-prerequisites-connection-port } + +Connection is usually performed to port 1521. Port may differ for different Oracle instances. +Please ask your Oracle administrator to provide required information. + +### Connection host { #DBR-onetl-connection-db-connection-oracle-prerequisites-connection-host } + +It is possible to connect to Oracle by using either DNS name of host or it's IP address. + +If you're using Oracle cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. 
+ +### Connect as proxy user { #DBR-onetl-connection-db-connection-oracle-prerequisites-connect-as-proxy-user } + +It is possible to connect to database as another user without knowing this user password. + +This can be enabled by granting user a special `CONNECT THROUGH` permission: + + ```sql + ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; + ``` + +Then you can connect to Oracle using credentials of `proxy_user` but specify that you need permissions of `schema_owner`: + + ```python + oracle = Oracle( + ..., + user="proxy_user[schema_owner]", + password="proxy_user password", + ) + ``` + +See [official documentation](https://oracle-base.com/articles/misc/proxy-users-and-connect-through). + +### Required grants { #DBR-onetl-connection-db-connection-oracle-prerequisites-required-grants } + +Ask your Oracle cluster administrator to set following grants for a user, +used for creating a connection: + +=== "Read + Write (schema is owned by user)" + + ```sql + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in user schema + GRANT CREATE TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON username.mytable TO username; + ``` + +=== "Read + Write (schema is not owned by user)" + + ```sql + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow creating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT CREATE ANY TABLE TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON someschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + -- allow dropping/truncating tables in any schema, + -- as Oracle does not support specifying exact schema name + GRANT DROP ANY TABLE TO username; + ``` + +=== "Read only" + + ```sql + -- allow user to log in + GRANT CREATE SESSION TO username; + + -- allow read access to specific table + GRANT SELECT ON someschema.mytable TO 
username; + ``` + +More details can be found in official documentation: + +- [GRANT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/GRANT.html) +- [SELECT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/SELECT.html) +- [CREATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/SELECT.html) +- [INSERT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/INSERT.html) +- [TRUNCATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/TRUNCATE-TABLE.html) diff --git a/mddocs/docs/connection/db_connection/oracle/read.md b/mddocs/docs/connection/db_connection/oracle/read.md new file mode 100644 index 000000000..debc318da --- /dev/null +++ b/mddocs/docs/connection/db_connection/oracle/read.md @@ -0,0 +1,96 @@ +# Reading from Oracle using `DBReader` { #DBR-onetl-connection-db-connection-oracle-read-reading-from-oracle-using-dbreader } + +[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but does not support custom queries, like `JOIN`. + +!!! 
warning
+
+    Please take into account [Oracle types][DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping]
+
+## Supported DBReader features { #DBR-onetl-connection-db-connection-oracle-read-supported-dbreader-features }
+
+- ✅︎ `columns`
+- ✅︎ `where`
+- ✅︎ `hwm`, supported strategies:
+    - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy]
+    - ✅︎ [Incremental strategy][DBR-onetl-strategy-incremental-strategy]
+    - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy]
+    - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy]
+- ✅︎ `hint` (see [official documentation](https://docs.oracle.com/cd/B10500_01/server.920/a96533/hintsref.htm))
+- ❌ `df_schema`
+- ✅︎ `options` (see [Oracle.ReadOptions][onetl.connection.db_connection.oracle.options.OracleReadOptions])
+
+## Examples { #DBR-onetl-connection-db-connection-oracle-read-examples }
+
+Snapshot strategy:
+
+    ```python
+    from onetl.connection import Oracle
+    from onetl.db import DBReader
+
+    oracle = Oracle(...)
+
+    reader = DBReader(
+        connection=oracle,
+        source="schema.table",
+        columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"],
+        where="key = 'something'",
+        hint="INDEX(schema.table key_index)",
+        options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10),
+    )
+    df = reader.run()
+    ```
+
+Incremental strategy:
+
+    ```python
+    from onetl.connection import Oracle
+    from onetl.db import DBReader
+    from onetl.strategy import IncrementalStrategy
+
+    oracle = Oracle(...)
+ + reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), + options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-oracle-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-oracle-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Oracle to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-oracle-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options { #DBR-onetl-connection-db-connection-oracle-read-options } + + + +::: onetl.connection.db_connection.oracle.options.OracleReadOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/oracle/sql.md b/mddocs/docs/connection/db_connection/oracle/sql.md new file mode 100644 index 000000000..9ac8068e9 --- /dev/null +++ b/mddocs/docs/connection/db_connection/oracle/sql.md @@ -0,0 +1,82 @@ +# Reading from Oracle using `Oracle.sql` { #DBR-onetl-connection-db-connection-oracle-sql-reading-from-oracle-using-oracle-sql } + +`Oracle.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! 
warning + + Please take into account [Oracle types][DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping] + +!!! warning + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, they can change data in your database. + +## Syntax support { #DBR-onetl-connection-db-connection-oracle-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SHOW ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-oracle-sql-examples } + + ```python + from onetl.connection import Oracle + + oracle = Oracle(...) + df = oracle.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR2(4000)) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Oracle.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), + ) + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-oracle-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-oracle-sql-select-only-required-columns } + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Oracle to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-oracle-sql-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Oracle to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+
+## Options { #DBR-onetl-connection-db-connection-oracle-sql-options }
+
+
+
+::: onetl.connection.db_connection.oracle.options.OracleSQLOptions
+    options:
+      inherited_members: true
+      heading_level: 3
+      show_root_heading: true
diff --git a/mddocs/docs/connection/db_connection/oracle/types.md b/mddocs/docs/connection/db_connection/oracle/types.md
new file mode 100644
index 000000000..39440beee
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/oracle/types.md
@@ -0,0 +1,268 @@
+# Oracle <-> Spark type mapping { #DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping }
+
+!!! note
+
+    The results below are valid for Spark 3.5.5, and may differ on other Spark versions.
+
+## Type detection & casting { #DBR-onetl-connection-db-connection-oracle-types-type-detection-casting }
+
+Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type.
+
+### Reading from Oracle { #DBR-onetl-connection-db-connection-oracle-types-reading-from-oracle }
+
+This is how Oracle connector performs this:
+
+- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Oracle type.
+- Find corresponding `Oracle type (read)` → `Spark type` combination (see below) for each DataFrame column. If no combination is found, raise exception.
+- Create DataFrame from query with specific column names and Spark types.
+
+### Writing to some existing Oracle table { #DBR-onetl-connection-db-connection-oracle-types-writing-to-some-existing-oracle-table }
+
+This is how Oracle connector performs this:
+
+- Get names of columns in DataFrame. [^1]
+- Perform `SELECT * FROM table LIMIT 0` query.
+- Take only columns present in DataFrame (by name, case insensitive). For each found column get Oracle type.
+- **Find corresponding** `Oracle type (read)` → `Spark type` **combination** (see below) for each DataFrame column. 
If no combination is found, raise exception. [^2] +- Find corresponding `Spark type` → `Oracle type (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- If `Oracle type (write)` match `Oracle type (read)`, no additional casts will be performed, DataFrame column will be written to Oracle as is. +- If `Oracle type (write)` does not match `Oracle type (read)`, DataFrame column will be casted to target column type **on Oracle side**. + For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision. + +[^1]: This allows to write data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column, it will be populated by Oracle. + +[^2]: Yes, this is weird. + +### Create new table using Spark { #DBR-onetl-connection-db-connection-oracle-types-create-new-table-using-spark } + +!!! warning + + ABSOLUTELY NOT RECOMMENDED! + +This is how Oracle connector performs this: + +- Find corresponding `Spark type` → `Oracle type (create)` combination (see below) for each DataFrame column. If no combination is found, raise exception. +- Generate DDL for creating table in Oracle, like `CREATE TABLE (col1 ...)`, and run it. +- Write DataFrame to created table as is. + +But Oracle connector support only limited number of types and almost no custom clauses (like `PARTITION BY`, `INDEX`, etc). +So instead of relying on Spark to create tables: + +??? note "See example" + + ```python + writer = DBWriter( + connection=oracle, + target="public.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +??? 
note "See example" + + ```python + oracle.execute( + """ + CREATE TABLE username.table ( + id NUMBER, + business_dt TIMESTAMP(6), + value VARCHAR2(2000) + ) + """, + ) + + writer = DBWriter( + connection=oracle, + target="public.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +See Oracle [CREATE TABLE](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/SELECT.html) documentation. + +## Supported types { #DBR-onetl-connection-db-connection-oracle-types-supported-types } + +### References { #DBR-onetl-connection-db-connection-oracle-types-references } + +See [List of Oracle types](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/Data-Types.html). + +Here you can find source code with type conversions: + +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L83-L109) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L111-L123) + +### Numeric types { #DBR-onetl-connection-db-connection-oracle-types-numeric-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|----------------------------------|-----------------------------------|-------------------------------|---------------------------| +| `NUMBER` | `DecimalType(P=38, S=10)` | `NUMBER(P=38, S=10)` | `NUMBER(P=38, S=10)` | +| `NUMBER(P=0..38)` | `DecimalType(P=0..38, S=0)` | `NUMBER(P=0..38, S=0)` | `NUMBER(P=38, S=0)` | +| `NUMBER(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `NUMBER(P=0..38, S=0..38)` | `NUMBER(P=38, S=0..38)` | +| `NUMBER(P=..., S=-127..-1)` | unsupported [^3] | | | +| `FLOAT`
`FLOAT(N)`
`REAL`
`DOUBLE PRECISION` |
`DecimalType(P=38, S=10)` |
`NUMBER(P=38, S=10)` |
`NUMBER(P=38, S=10)` | +| `BINARY_FLOAT`
`BINARY_DOUBLE` | `FloatType()`
`DoubleType()` | `NUMBER(P=19, S=4)` | `NUMBER(P=19, S=4)` | +| `SMALLINT`
`INTEGER` | `DecimalType(P=38, S=0)` | `NUMBER(P=38, S=0)` | `NUMBER(P=38, S=0)` | +| `LONG` | `StringType()` | `CLOB` | `CLOB` | + +[^3]: Oracle support decimal types with negative scale, like `NUMBER(38, -10)`. Spark doesn't. + +### Temporal types { #DBR-onetl-connection-db-connection-oracle-types-temporal-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|--------------------------------------------|------------------------------------|---------------------------------|---------------------------------| +| `DATE`, days | `TimestampType()`, microseconds | `TIMESTAMP(6)`, microseconds | `TIMESTAMP(6)`, microseconds | +| `TIMESTAMP`, microseconds
`TIMESTAMP(0)`, seconds
`TIMESTAMP(3)`, milliseconds
`TIMESTAMP(6)`, microseconds |

`TimestampType()`, microseconds |

`TIMESTAMP(6)`, microseconds |

`TIMESTAMP(6)`, microseconds | +| `TIMESTAMP(9)`, nanoseconds | `TimestampType()`, microseconds, **precision loss** [^4] | `TIMESTAMP(6)`, microseconds, **precision loss** | `TIMESTAMP(6)`, microseconds, **precision loss** | +| `TIMESTAMP WITH TIME ZONE`
`TIMESTAMP(N) WITH TIME ZONE`
`TIMESTAMP WITH LOCAL TIME ZONE`
`TIMESTAMP(N) WITH LOCAL TIME ZONE`
`INTERVAL YEAR TO MONTH`
`INTERVAL DAY TO SECOND` |


unsupported | | | + +!!! warning + + Note that types in Oracle and Spark have different value ranges: + + | Oracle type | Min value | Max value | Spark type | Min value | Max value | + |---------------|------------------------------------|-----------------------------------|---------------------|--------------------------------|--------------------------------| + | `date` | `-4712-01-01` | `9999-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + | `timestamp` | `-4712-01-01 00:00:00.000000000` | `9999-12-31 23:59:59.999999999` | `TimestampType()` | `0001-01-01 00:00:00.000000` | `9999-12-31 23:59:59.999999` | + + So not all of values can be read from Oracle to Spark. + + References: + + * [Oracle date, timestamp and intervals documentation](https://oracle-base.com/articles/misc/oracle-dates-timestamps-and-intervals#DATE) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^4]: Oracle support timestamp up to nanoseconds precision (`23:59:59.999999999`), but Spark `TimestampType()` supports datetime up to microseconds precision (`23:59:59.999999`). Nanoseconds will be lost during read or write operations. + +### String types { #DBR-onetl-connection-db-connection-oracle-types-string-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|-----------------------------|------------------|---------------------|----------------------| +| `CHAR`
`CHAR(N CHAR)`
`CHAR(N BYTE)`
`NCHAR`
`NCHAR(N)`
`VARCHAR(N)`
`LONG VARCHAR`
`VARCHAR2(N CHAR)`
`VARCHAR2(N BYTE)`
`NVARCHAR2(N)`
`CLOB`
`NCLOB` |




`StringType()` |




`CLOB` |




`CLOB` | + +### Binary types { #DBR-onetl-connection-db-connection-oracle-types-binary-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|--------------------------|------------------|---------------------|----------------------| +| `RAW(N)`
`LONG RAW`
`BLOB` |
`BinaryType()` |
`BLOB` |
`BLOB` | +| `BFILE` | unsupported | | | + +### Struct types { #DBR-onetl-connection-db-connection-oracle-types-struct-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|-------------------------------------|------------------|---------------------|----------------------| +| `XMLType`
`URIType`
`DBURIType`
`XDBURIType`
`HTTPURIType`
`CREATE TYPE ... AS OBJECT (...)` | `StringType()` | `CLOB` | `CLOB` | +| `JSON`
`CREATE TYPE ... AS VARRAY ...`
`CREATE TYPE ... AS TABLE OF ...` |
unsupported | | | + +### Special types { #DBR-onetl-connection-db-connection-oracle-types-special-types } + +| Oracle type (read) | Spark type | Oracle type (write) | Oracle type (create) | +|--------------------|-------------------|---------------------|----------------------| +| `BOOLEAN` | `BooleanType()` | `BOOLEAN` | `NUMBER(P=1, S=0)` | +| `ROWID`
`UROWID`
`UROWID(N)` |
`StringType()` |
`CLOB` |
`CLOB` | +| `ANYTYPE`
`ANYDATA`
`ANYDATASET` |
unsupported | | | + +## Explicit type cast { #DBR-onetl-connection-db-connection-oracle-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-oracle-types-dbreader } + +It is possible to explicitly cast column of unsupported type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS CLOB)` to convert data to string representation on Oracle side, and so it will be read as Spark's `StringType()`. + +It is also possible to use [JSON_ARRAY](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_ARRAY.html) or [JSON_OBJECT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_OBJECT.html) Oracle functions to convert column of any type to string representation. Then this JSON string can then be effectively parsed using the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. + + ```python + from onetl.file.format import JSON + from pyspark.sql.types import IntegerType, StructType, StructField + + from onetl.connection import Oracle + from onetl.db import DBReader + + oracle = Oracle(...) + + DBReader( + connection=oracle, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str", + # or + "JSON_ARRAY(array_column) array_column_json", + ], + ) + df = reader.run() + + json_scheme = StructType([StructField("key", IntegerType())]) + + df = df.select( + df.id, + df.supported_column, + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("array_column_json", json_scheme).alias("array_column"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-oracle-types-dbwriter } + +It is always possible to convert data on Spark side to string, and then write it to text column in Oracle table. + +To serialize and write JSON data to a `text` or `json` column in an Oracle table use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. 
+
+    ```python
+    from onetl.connection import Oracle
+    from onetl.db import DBWriter
+    from onetl.file.format import JSON
+
+    oracle = Oracle(...)
+
+    oracle.execute(
+        """
+        CREATE TABLE schema.target_table (
+            id INTEGER,
+            supported_column TIMESTAMP,
+            array_column_json VARCHAR2(4000) -- any string type, actually
+        )
+        """,
+    )
+
+    write_df = df.select(
+        df.id,
+        df.supported_column,
+        JSON().serialize_column(df.unsupported_column).alias("array_column_json"),
+    )
+
+    writer = DBWriter(
+        connection=oracle,
+        target="schema.target_table",
+    )
+    writer.run(write_df)
+    ```
+
+Then you can parse this column on Oracle side - for example, by creating a view:
+
+    ```sql
+    SELECT
+        id,
+        supported_column,
+        JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0
+    FROM
+        schema.target_table
+    ```
+
+Or by using [VIRTUAL column](https://oracle-base.com/articles/11g/virtual-columns-11gr1):
+
+    ```sql
+    CREATE TABLE schema.target_table (
+        id INTEGER,
+        supported_column TIMESTAMP,
+        array_column_json VARCHAR2(4000), -- any string type, actually
+        array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL
+    )
+    ```
+
+But data will be parsed on each table read in any case, as Oracle does not support `GENERATED ALWAYS AS (...) STORED` columns.
diff --git a/mddocs/docs/connection/db_connection/oracle/write.md b/mddocs/docs/connection/db_connection/oracle/write.md
new file mode 100644
index 000000000..aada56364
--- /dev/null
+++ b/mddocs/docs/connection/db_connection/oracle/write.md
@@ -0,0 +1,56 @@
+# Writing to Oracle using `DBWriter` { #DBR-onetl-connection-db-connection-oracle-write-writing-to-oracle-using-dbwriter }
+
+For writing data to Oracle, use [DBWriter][DBR-onetl-db-writer].
+
+!!! warning
+
+    Please take into account [Oracle types][DBR-onetl-connection-db-connection-oracle-types-oracle-spark-type-mapping]
+
+!!! 
warning + + It is always recommended to create table explicitly using [Oracle.execute][DBR-onetl-connection-db-connection-oracle-execute-executing-statements-in-oracle] instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, causing precision loss or other issues. + +## Examples { #DBR-onetl-connection-db-connection-oracle-write-examples } + + ```python + from onetl.connection import Oracle + from onetl.db import DBWriter + + oracle = Oracle(...) + + df = ... # data is here + + writer = DBWriter( + connection=oracle, + target="schema.table", + options=Oracle.WriteOptions(if_exists="append"), + ) + + writer.run(df) + ``` + +## Options { #DBR-onetl-connection-db-connection-oracle-write-options } + +Method above accepts [OracleWriteOptions][onetl.connection.db_connection.oracle.options.OracleWriteOptions] + + + +::: onetl.connection.db_connection.oracle.options.OracleWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/postgres/connection.md b/mddocs/docs/connection/db_connection/postgres/connection.md new file mode 100644 index 000000000..b728b4bf9 --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/connection.md @@ -0,0 +1,18 @@ +# Postgres connection { #DBR-onetl-connection-db-connection-postgres-connection-0 } + + + +::: onetl.connection.db_connection.postgres.connection.Postgres + options: + members: + - get_packages + - check diff --git a/mddocs/docs/connection/db_connection/postgres/execute.md b/mddocs/docs/connection/db_connection/postgres/execute.md new file mode 100644 index 000000000..7fad99425 --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/execute.md @@ -0,0 +1,120 @@ +# Executing statements in Postgres { #DBR-onetl-connection-db-connection-postgres-execute-executing-statements-in-postgres } + +!!! 
warning
+
+    Methods below **read all the rows** returned from DB **to Spark driver memory**, and then convert them to DataFrame.
+
+    Do **NOT** use them to read large amounts of data. Use [DBReader][DBR-onetl-connection-db-connection-postgres-read-reading-from-postgres-using-dbreader] or [Postgres.sql][DBR-onetl-connection-db-connection-postgres-sql-reading-from-postgres-using-postgres-sql] instead.
+
+## How to { #DBR-onetl-connection-db-connection-postgres-execute-how-to }
+
+There are 2 ways to execute some statement in Postgres
+
+### Use `Postgres.fetch` { #DBR-onetl-connection-db-connection-postgres-execute-use-postgres-fetch }
+
+Use this method to execute some `SELECT` query which returns **small number of rows**, like reading Postgres config, or reading data from some reference table. Method returns Spark DataFrame.
+
+Method accepts [Postgres.FetchOptions][onetl.connection.db_connection.postgres.options.PostgresFetchOptions].
+
+Connection opened using this method should be then closed with `connection.close()` or `with connection:`.
+
+!!! warning
+
+    Please take into account [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping].
+
+#### Syntax support in `Postgres.fetch` { #DBR-onetl-connection-db-connection-postgres-execute-syntax-support-in-postgres-fetch }
+
+This method supports **any** query syntax supported by Postgres, like:
+
+- ✅︎ `SELECT ... FROM ...`
+- ✅︎ `WITH alias AS (...) SELECT ...`
+- ❌ `SET ...; SELECT ...;` - multiple statements not supported
+
+#### Examples for `Postgres.fetch` { #DBR-onetl-connection-db-connection-postgres-execute-examples-for-postgres-fetch }
+
+    ```python
+    from onetl.connection import Postgres
+
+    postgres = Postgres(...) 
+ + df = postgres.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Postgres.FetchOptions(queryTimeout=10), + ) + postgres.close() + value = df.collect()[0][0] # get value from first row and first column + ``` + +### Use `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-use-postgres-execute } + +Use this method to execute DDL and DML operations. Each method call runs operation in a separated transaction, and then commits it. + +Method accepts [Postgres.ExecuteOptions][onetl.connection.db_connection.postgres.options.PostgresExecuteOptions]. + +Connection opened using this method should be then closed with `connection.close()` or `with connection:`. + +#### Syntax support in `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-syntax-support-in-postgres-execute } + +This method supports **any** query syntax supported by Postgres, like: + +- ✅︎ `CREATE TABLE ...`, `CREATE VIEW ...`, and so on +- ✅︎ `ALTER ...` +- ✅︎ `INSERT INTO ... SELECT ...`, `UPDATE ...`, `DELETE ...`, and so on +- ✅︎ `DROP TABLE ...`, `DROP VIEW ...`, `TRUNCATE TABLE`, and so on +- ✅︎ `CALL procedure(arg1, arg2) ...` +- ✅︎ `SELECT func(arg1, arg2)` or `{call func(arg1, arg2)}` - special syntax for calling functions +- ✅︎ other statements not mentioned here +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +#### Examples for `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-examples-for-postgres-execute } + + ```python + from onetl.connection import Postgres + + postgres = Postgres(...) 
+ + postgres.execute("DROP TABLE schema.table") + postgres.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key text, + value real + ) + """, + options=Postgres.ExecuteOptions(queryTimeout=10), + ) + ``` + +## Options { #DBR-onetl-connection-db-connection-postgres-execute-options } + + + +::: onetl.connection.db_connection.postgres.options.PostgresFetchOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true + +::: onetl.connection.db_connection.postgres.options.PostgresExecuteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/postgres/index.md b/mddocs/docs/connection/db_connection/postgres/index.md new file mode 100644 index 000000000..1ef6ece0d --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/index.md @@ -0,0 +1,17 @@ +# Postgres { #DBR-onetl-connection-db-connection-postgres } + +## Connection { #DBR-onetl-connection-db-connection-postgres-connection-1 } + +* [Prerequisites][DBR-onetl-connection-db-connection-postgres-prerequisites] +* [Postgres connection][DBR-onetl-connection-db-connection-postgres-connection-0] + +## Operations { #DBR-onetl-connection-db-connection-postgres-operations } + +* [Reading from Postgres using `DBReader`][DBR-onetl-connection-db-connection-postgres-read-reading-from-postgres-using-dbreader] +* [Reading from Postgres using `Postgres.sql`][DBR-onetl-connection-db-connection-postgres-sql-reading-from-postgres-using-postgres-sql] +* [Writing to Postgres using `DBWriter`][DBR-onetl-connection-db-connection-postgres-write-writing-to-postgres-using-dbwriter] +* [Executing statements in Postgres][DBR-onetl-connection-db-connection-postgres-execute-executing-statements-in-postgres] + +## Troubleshooting { #DBR-onetl-connection-db-connection-postgres-troubleshooting } + +* [Postgres <-> Spark type 
mapping][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping] diff --git a/mddocs/docs/connection/db_connection/postgres/prerequisites.md b/mddocs/docs/connection/db_connection/postgres/prerequisites.md new file mode 100644 index 000000000..8382f4232 --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/prerequisites.md @@ -0,0 +1,66 @@ +# Prerequisites { #DBR-onetl-connection-db-connection-postgres-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-db-connection-postgres-prerequisites-version-compatibility } + +- PostgreSQL server versions: + - Officially declared: 8.2 - 17 + - Actually tested: 9.4.26, 17.3 +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +See [official documentation](https://jdbc.postgresql.org/). + +## Installing PySpark { #DBR-onetl-connection-db-connection-postgres-prerequisites-installing-pyspark } + +To use Postgres connector you should have PySpark installed (or injected to `sys.path`) **BEFORE** creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to Postgres { #DBR-onetl-connection-db-connection-postgres-prerequisites-connecting-to-postgres } + +### Allowing connection to Postgres instance { #DBR-onetl-connection-db-connection-postgres-prerequisites-allowing-connection-to-postgres-instance } + +Ask your Postgres administrator to allow your user (and probably IP) to connect to instance, e.g. by updating `pg_hba.conf` file. + +See [official documentation](https://www.postgresql.org/docs/current/auth-pg-hba-conf.html). + +### Connection port { #DBR-onetl-connection-db-connection-postgres-prerequisites-connection-port } + +Connection is usually performed to port 5432. Port may differ for different Postgres instances. +Please ask your Postgres administrator to provide required information. 
+ +### Connection host { #DBR-onetl-connection-db-connection-postgres-prerequisites-connection-host } + +It is possible to connect to Postgres by using either DNS name of host or it's IP address. + +If you're using Postgres cluster, it is currently possible to connect only to **one specific node**. +Connecting to multiple nodes to perform load balancing, as well as automatic failover to new master/replica are not supported. + +### Required grants { #DBR-onetl-connection-db-connection-postgres-prerequisites-required-grants } + +Ask your Postgres cluster administrator to set following grants for a user, used for creating a connection: + +=== "Read + Write" + + ```sql + -- allow creating tables in specific schema + GRANT USAGE, CREATE ON SCHEMA myschema TO username; + + -- allow read & write access to specific table + GRANT SELECT, INSERT ON myschema.mytable TO username; + + -- only if if_exists="replace_entire_table" is used: + GRANT TRUNCATE ON myschema.mytable TO username; + ``` + +=== "Read only" + + ```sql + -- allow creating tables in specific schema + GRANT USAGE ON SCHEMA myschema TO username; + + -- allow read access to specific table + GRANT SELECT ON myschema.mytable TO username; + ``` + +More details can be found in [official documentation](https://www.postgresql.org/docs/current/sql-grant.html). diff --git a/mddocs/docs/connection/db_connection/postgres/read.md b/mddocs/docs/connection/db_connection/postgres/read.md new file mode 100644 index 000000000..41848313c --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/read.md @@ -0,0 +1,94 @@ +# Reading from Postgres using `DBReader` { #DBR-onetl-connection-db-connection-postgres-read-reading-from-postgres-using-dbreader } + +[DBReader][DBR-onetl-db-reader] supports [strategy][DBR-onetl-strategy-read-strategies] for incremental data reading, but does not support custom queries, like `JOIN`. + +!!! 
warning + + Please take into account [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping] + +## Supported DBReader features { #DBR-onetl-connection-db-connection-postgres-read-supported-dbreader-features } + +- ✅︎ `columns` +- ✅︎ `where` +- ✅︎ `hwm`, supported strategies: + - ✅︎ [Snapshot strategy][DBR-onetl-strategy-snapshot-strategy] + - ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] + - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] + - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] +- ❌ `hint` (is not supported by Postgres) +- ❌ `df_schema` +- ✅︎ `options` (see [Postgres.ReadOptions][onetl.connection.db_connection.postgres.options.PostgresReadOptions]) + +## Examples { #DBR-onetl-connection-db-connection-postgres-read-examples } + +Snapshot strategy: + + ```python + from onetl.connection import Postgres + from onetl.db import DBReader + + postgres = Postgres(...) + + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), + ) + df = reader.run() + ``` + +Incremental strategy: + + ```python + from onetl.connection import Postgres + from onetl.db import DBReader + from onetl.strategy import IncrementalStrategy + + postgres = Postgres(...) 
+ + reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), + ) + + with IncrementalStrategy(): + df = reader.run() + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-postgres-read-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-postgres-read-select-only-required-columns } + +Instead of passing `"*"` in `DBReader(columns=[...])` prefer passing exact column names. This reduces the amount of data passed from Postgres to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-postgres-read-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `DBReader(where="column = 'value'")` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. + +## Options { #DBR-onetl-connection-db-connection-postgres-read-options } + + + +::: onetl.connection.db_connection.postgres.options.PostgresReadOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/postgres/sql.md b/mddocs/docs/connection/db_connection/postgres/sql.md new file mode 100644 index 000000000..3eafc0771 --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/sql.md @@ -0,0 +1,81 @@ +# Reading from Postgres using `Postgres.sql` { #DBR-onetl-connection-db-connection-postgres-sql-reading-from-postgres-using-postgres-sql } + +`Postgres.sql` allows passing custom SQL query, but does not support incremental strategies. + +!!! 
warning + + Please take into account [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping] + +!!! warning + + Statement is executed in **read-write** connection, so if you're calling some functions/procedures with DDL/DML statements inside, they can change data in your database. + +## Syntax support { #DBR-onetl-connection-db-connection-postgres-sql-syntax-support } + +Only queries with the following syntax are supported: + +- ✅︎ `SELECT ... FROM ...` +- ✅︎ `WITH alias AS (...) SELECT ...` +- ❌ `SET ...; SELECT ...;` - multiple statements not supported + +## Examples { #DBR-onetl-connection-db-connection-postgres-sql-examples } + + ```python + from onetl.connection import Postgres + + postgres = Postgres(...) + df = postgres.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Postgres.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), + ) + ``` + +## Recommendations { #DBR-onetl-connection-db-connection-postgres-sql-recommendations } + +### Select only required columns { #DBR-onetl-connection-db-connection-postgres-sql-select-only-required-columns } + +Instead of passing `SELECT * FROM ...` prefer passing exact column names `SELECT col1, col2, ...`. +This reduces the amount of data passed from Postgres to Spark. + +### Pay attention to `where` value { #DBR-onetl-connection-db-connection-postgres-sql-pay-attention-to-where-value } + +Instead of filtering data on Spark side using `df.filter(df.column == 'value')` pass proper `WHERE column = 'value'` clause. +This both reduces the amount of data send from Postgres to Spark, and may also improve performance of the query. +Especially if there are indexes or partitions for columns used in `where` clause. 
+ +## Options { #DBR-onetl-connection-db-connection-postgres-sql-options } + + + +::: onetl.connection.db_connection.postgres.options.PostgresSQLOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/db_connection/postgres/types.md b/mddocs/docs/connection/db_connection/postgres/types.md new file mode 100644 index 000000000..3c7371f1c --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/types.md @@ -0,0 +1,355 @@ +# Postgres <-> Spark type mapping { #DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping } + +!!! note + + The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + +## Type detection & casting { #DBR-onetl-connection-db-connection-postgres-types-type-detection-casting } + +Spark's DataFrames always have a `schema` which is a list of columns with corresponding Spark types. All operations on a column are performed using column type. + +### Reading from Postgres { #DBR-onetl-connection-db-connection-postgres-types-reading-from-postgres } + +This is how Postgres connector performs this: + +- For each column in query result (`SELECT column1, column2, ... FROM table ...`) get column name and Postgres type. +- Find corresponding `Postgres type (read)` → `Spark type` combination (see below) for each DataFrame column [^1]. If no combination is found, raise exception. +- Create DataFrame from query with specific column names and Spark types. + +### Writing to some existing Postgres table { #DBR-onetl-connection-db-connection-postgres-types-writing-to-some-existing-postgres-table } + +This is how Postgres connector performs this: + +- Get names of columns in DataFrame. [^1] +- Perform `SELECT * FROM table LIMIT 0` query. +- Take only columns present in DataFrame (by name, case insensitive) [^2]. For each found column get Postgres type. 
+- Find corresponding `Spark type` → `Postgres type (write)` combination (see below) for each DataFrame column. If no combination is found, raise an exception.
+- If `Postgres type (write)` matches `Postgres type (read)`, no additional casts will be performed, DataFrame column will be written to Postgres as is.
+- If `Postgres type (write)` does not match `Postgres type (read)`, DataFrame column will be cast to target column type **on Postgres side**.
+  For example, you can write column with text data to `int` column, if column contains valid integer values within supported value range and precision [^3].
+
+[^1]: All Postgres types that don't have a corresponding Java type are converted to `String`.
+
+[^2]: This allows writing data to tables with `DEFAULT` and `GENERATED` columns - if DataFrame has no such column, it will be populated by Postgres.
+
+[^3]: This is true only if either DataFrame column is a `StringType()`, or target column is `text` type.
+
+    But other types cannot be silently converted, like `bytea -> bit(N)`. This requires explicit casting, see [Manual conversion to string].
+
+### Create new table using Spark { #DBR-onetl-connection-db-connection-postgres-types-create-new-table-using-spark }
+
+!!! warning
+
+    ABSOLUTELY NOT RECOMMENDED!
+
+This is how Postgres connector performs this:
+
+- Find corresponding `Spark type` → `Postgres type (create)` combination (see below) for each DataFrame column. If no combination is found, raise an exception.
+- Generate DDL for creating table in Postgres, like `CREATE TABLE (col1 ...)`, and run it.
+- Write DataFrame to created table as is.
+
+But Postgres connector supports only a limited number of types and almost no custom clauses (like `PARTITION BY`, `INDEX`, etc).
+So instead of relying on Spark to create tables:
+
+??? 
note "See example" + + ```python + writer = DBWriter( + connection=postgres, + target="public.table", + options=Postgres.WriteOptions( + if_exists="append", + createTableOptions="PARTITION BY RANGE (id)", + ), + ) + writer.run(df) + ``` + +Always prefer creating table with desired DDL **BEFORE WRITING DATA**: + +??? note "See example" + + ```python + postgres.execute( + """ + CREATE TABLE public.table ( + id bigint, + business_dt timestamp(6), + value json + ) + PARTITION BY RANGE (Id) + """, + ) + + writer = DBWriter( + connection=postgres, + target="public.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + writer.run(df) + ``` + +See Postgres [CREATE TABLE](https://www.postgresql.org/docs/current/sql-createtable.html) documentation. + +## Supported types { #DBR-onetl-connection-db-connection-postgres-types-supported-types } + +### References { #DBR-onetl-connection-db-connection-postgres-types-references } + +See [List of Postgres types](https://www.postgresql.org/docs/current/datatype.html). + +Here you can find source code with type conversions: + +- [Postgres <-> JDBC](https://github.com/pgjdbc/pgjdbc/blob/REL42.6.0/pgjdbc/src/main/java/org/postgresql/jdbc/TypeInfoCache.java#L78-L112) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L52-L108) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L118-L132) + +### Numeric types { #DBR-onetl-connection-db-connection-postgres-types-numeric-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|--------------------------------|------------------------------------|------------------------------|-----------------------------------------------| +| `decimal` | `DecimalType(P=38, S=18)` | `decimal(P=38, S=18)` |

`decimal` (unbounded) {: rowspan=3} | +| `decimal(P=0..38)` | `DecimalType(P=0..38, S=0)` | `decimal(P=0..38, S=0)` | ⁠ {: style="padding:0"} | +| `decimal(P=0..38, S=0..38)` | `DecimalType(P=0..38, S=0..38)` | `decimal(P=0..38, S=0..38)` | ⁠ {: style="padding:0"} | +| `decimal(P=39.., S=0..)` | unsupported [^4] | | | +| `decimal(P=.., S=..-1)` | unsupported [^5] | | | +| `real` | `FloatType()` | `real` | `real` | +| `double precision` | `DoubleType()` | `double precision` | `double precision` | +| `smallint` | `ShortType()` | `smallint` | `smallint` | +| `-` | `ByteType()` | | | +| `integer` | `IntegerType()` | `integer` | `integer` | +| `bigint` | `LongType()` | `bigint` | `bigint` | +| `money` |



`StringType()` [^1] {: rowspan=5} |



`text` {: rowspan=5} |



`text` {: rowspan=5} | +| `int4range` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | +| `int8range` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | +| `numrange` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | +| `int2vector` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | + +[^4]: Postgres support decimal types with unlimited precision. + + But Spark's `DecimalType(P, S)` supports maximum `P=38` (128 bit). It is impossible to read, write or operate with values of larger precision, + this leads to an exception. + +[^5]: Postgres support decimal types with negative scale, like `decimal(38, -10)`. Spark doesn't. + +### Temporal types { #DBR-onetl-connection-db-connection-postgres-types-temporal-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|------------------------------------|------------------------------|-----------------------|-------------------------| +| `date` | `DateType()` | `date` | `date` | +| `time`
`time(0..6)`
`time with time zone`
`time(0..6) with time zone` |
`TimestampType()`,
with time format quirks [^6] |

`timestamp(6)` |

`timestamp(6)` | +| `timestamp`
`timestamp(0..6)`
`timestamp with time zone`
`timestamp(0..6) with time zone` |

`TimestampType()` |

`timestamp(6)` |

`timestamp(6)` | +| `-` | `TimestampNTZType()` | `timestamp(6)` | `timestamp(6)` | +| `interval` of any precision | `StringType()` [^1] | `text` | `text` | +| `-` | `DayTimeIntervalType()` | unsupported | unsupported | +| `-` | `YearMonthIntervalType()` | unsupported | unsupported | +| `daterange`
`tsrange`
`tstzrange` |
`StringType()` [^1] |

`text` |

`text` | + +!!! warning + + Note that types in Postgres and Spark have different value ranges: + + + | Postgres type | Min value | Max value | Spark type | Min value | Max value | + |---------------|---------------------------------|----------------------------------|---------------------|--------------------------------|--------------------------------| + | `date` | `-4713-01-01` | `5874897-01-01` | `DateType()` | `0001-01-01` | `9999-12-31` | + | `timestamp` | `-4713-01-01 00:00:00.000000` | `294276-12-31 23:59:59.999999` |

`TimestampType()` {: rowspan=2} |

`0001-01-01 00:00:00.000000` {: rowspan=2} |

`9999-12-31 23:59:59.999999` {: rowspan=2} | + | `time` | `00:00:00.000000` | `24:00:00.000000` | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | + + + So not all of values can be read from Postgres to Spark. + + References: + + * [Postgres date/time types documentation](https://www.postgresql.org/docs/current/datatype-datetime.html) + * [Spark DateType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/DateType.html) + * [Spark TimestampType documentation](https://spark.apache.org/docs/latest/api/java/org/apache/spark/sql/types/TimestampType.html) + +[^6]: `time` type is the same as `timestamp` with date `1970-01-01`. So instead of reading data from Postgres like `23:59:59` + it is actually read `1970-01-01 23:59:59`, and vice versa. + +### String types { #DBR-onetl-connection-db-connection-postgres-types-string-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|-----------------------------|-----------------------|-----------------------|-------------------------| +| `character`
`character(N)`
`character varying`
`character varying(N)`
`text`
`json`
`jsonb`
`xml` |



`StringType()` |




`text` {: rowspan=2} |




`text` {: rowspan=2} | +| `CREATE TYPE ... AS ENUM`
`tsvector`
`tsquery` |
`StringType()`[^1] | ⁠ {: style="padding:0"} | ⁠ {: style="padding:0"} | +| `-` | `CharType()` | `unsupported` | `unsupported` | +| `-` | `VarcharType()` | `unsupported` | `unsupported` | + +### Binary types { #DBR-onetl-connection-db-connection-postgres-types-binary-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|--------------------------|-----------------------|-----------------------------|-------------------------| +| `boolean` | `BooleanType()` | `boolean` | `boolean` | +| `bit`
`bit(N=1)` | `BooleanType()` | `bool`,
**cannot insert data** [^3] | `bool` | +| `bit(N=2..)` | `ByteType()` | `bytea`,
**cannot insert data** [^3] | `bytea` | +| `bit varying`
`bit varying(N)` | `StringType()` [^1] | `text` | `text` | +| `bytea` | `BinaryType()` | `bytea` | `bytea` | + +### Struct types { #DBR-onetl-connection-db-connection-postgres-types-struct-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|--------------------------------|-----------------------|-----------------------|-------------------------| +| `T[]` | `ArrayType(T)` | `T[]` | `T[]` | +| `T[][]` | unsupported | | | +| `CREATE TYPE sometype (...)` | `StringType()` [^1] | `text` | `text` | +| `-` | `StructType()`
`MapType()` | unsupported | | + +### Network types { #DBR-onetl-connection-db-connection-postgres-types-network-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|----------------------|-----------------------|-----------------------|-------------------------| +| `cidr`
`inet`
`macaddr`
`macaddr8` | `StringType()` [^1] |

`text` |

`text` | + +### Geo types { #DBR-onetl-connection-db-connection-postgres-types-geo-types } + +| Postgres type (read) | Spark type | Postgres type (write) | Postgres type (create) | +|----------------------|-----------------------|-----------------------|-------------------------| +| `circle`
`box`
`line`
`lseg`
`path`
`point`
`polygon` |


`StringType()` [^1] |


`text` |


`text` | + +## Explicit type cast { #DBR-onetl-connection-db-connection-postgres-types-explicit-type-cast } + +### `DBReader` { #DBR-onetl-connection-db-connection-postgres-types-dbreader } + +It is possible to explicitly cast column of unsupported type using `DBReader(columns=...)` syntax. + +For example, you can use `CAST(column AS text)` to convert data to string representation on Postgres side, and so it will be read as Spark's `StringType()`. + +It is also possible to use [to_json](https://www.postgresql.org/docs/current/functions-json.html) Postgres function to convert column of any type to string representation, and then parse this column on Spark side you can use the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method: + + ```python + from pyspark.sql.types import IntegerType + + from onetl.connection import Postgres + from onetl.db import DBReader + from onetl.file.format import JSON + + postgres = Postgres(...) + + DBReader( + connection=postgres, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "to_json(unsupported_column) array_column_json", + ], + ) + df = reader.run() + + json_schema = StructType( + [ + StructField("id", IntegerType(), nullable=True), + StructField("name", StringType(), nullable=True), + ..., + ] + ) + df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("array_column_json", json_schema).alias("json_string"), + ) + ``` + +### `DBWriter` { #DBR-onetl-connection-db-connection-postgres-types-dbwriter } + +It is always possible to convert data on the Spark side to a string, and then write it to a text column in a Postgres table. 
+
+#### Using JSON.serialize_column { #DBR-onetl-connection-db-connection-postgres-types-using-json-serialize-column }
+
+You can use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method for data serialization:
+
+    ```python
+    from onetl.file.format import JSON
+    from pyspark.sql.functions import col
+
+    from onetl.connection import Postgres
+    from onetl.db import DBWriter
+
+    postgres = Postgres(...)
+
+    postgres.execute(
+        """
+        CREATE TABLE schema.target_table (
+            id int,
+            supported_column timestamp,
+            array_column_json jsonb -- any column type, actually
+        )
+        """,
+    )
+
+    write_df = df.select(
+        df.id,
+        df.supported_column,
+        JSON().serialize_column(df.unsupported_column).alias("array_column_json"),
+    )
+
+    writer = DBWriter(
+        connection=postgres,
+        target="schema.target_table",
+    )
+    writer.run(write_df)
+    ```
+
+Then you can parse this column on the Postgres side (for example, by creating a view):
+
+    ```sql
+    SELECT
+        id,
+        supported_column,
+        array_column_json->'0' AS array_item_0
+    FROM
+        schema.target_table
+    ```
+
+To avoid casting the value on every table read you can use [GENERATED ALWAYS STORED](https://www.postgresql.org/docs/current/ddl-generated-columns.html) column, but this requires 2x space (for original and parsed value).
+
+#### Manual conversion to string { #DBR-onetl-connection-db-connection-postgres-types-manual-conversion-to-string }
+
+Postgres connector also supports converting a text value directly to the target column type, if this value has a proper format.
+
+For example, you can write data like `[123, 345)` to `int8range` type because Postgres allows the cast `'[123, 345)'::int8range`:
+
+    ```python
+    from pyspark.sql.types import StringType
+    from pyspark.sql.functions import udf
+
+    from onetl.connection import Postgres
+    from onetl.db import DBReader
+
+    postgres = Postgres(...)
+ + postgres.execute( + """ + CREATE TABLE schema.target_table ( + id int, + range_column int8range -- any column type, actually + ) + """, + ) + + + @udf(returnType=StringType()) + def array_to_range(value: tuple): + """This UDF allows to convert tuple[start, end] to Postgres' range format""" + start, end = value + return f"[{start},{end})" + + + write_df = df.select( + df.id, + array_to_range(df.range_column).alias("range_column"), + ) + + writer = DBWriter( + connection=postgres, + target="schema.target_table", + ) + writer.run(write_df) + ``` + +This can be tricky to implement and may lead to longer write process. +But this does not require extra space on Postgres side, and allows to avoid explicit value cast on every table read. diff --git a/mddocs/docs/connection/db_connection/postgres/write.md b/mddocs/docs/connection/db_connection/postgres/write.md new file mode 100644 index 000000000..1fb67e7c6 --- /dev/null +++ b/mddocs/docs/connection/db_connection/postgres/write.md @@ -0,0 +1,58 @@ +# Writing to Postgres using `DBWriter` { #DBR-onetl-connection-db-connection-postgres-write-writing-to-postgres-using-dbwriter } + +For writing data to Postgres, use [DBWriter][DBR-onetl-db-writer]. + +!!! warning + + Please take into account [Postgres types][DBR-onetl-connection-db-connection-postgres-types-postgres-spark-type-mapping] + +!!! warning + + It is always recommended to create table explicitly using [Postgres.execute][DBR-onetl-connection-db-connection-postgres-execute-executing-statements-in-postgres] + instead of relying on Spark's table DDL generation. + + This is because Spark's DDL generator can create columns with different precision and types than it is expected, + causing precision loss or other issues. + +## Examples { #DBR-onetl-connection-db-connection-postgres-write-examples } + + ```python + from onetl.connection import Postgres + from onetl.db import DBWriter + + postgres = Postgres(...) + + df = ... 
# data is here + + writer = DBWriter( + connection=postgres, + target="schema.table", + options=Postgres.WriteOptions(if_exists="append"), + ) + + writer.run(df) + ``` + +## Options { #DBR-onetl-connection-db-connection-postgres-write-options } + +Method above accepts [Postgres.WriteOptions][onetl.connection.db_connection.postgres.options.PostgresWriteOptions] + + + +::: onetl.connection.db_connection.postgres.options.PostgresWriteOptions + options: + inherited_members: true + heading_level: 3 + show_root_heading: true diff --git a/mddocs/docs/connection/file_connection/ftp.md b/mddocs/docs/connection/file_connection/ftp.md new file mode 100644 index 000000000..07dc0a06c --- /dev/null +++ b/mddocs/docs/connection/file_connection/ftp.md @@ -0,0 +1,33 @@ +# FTP connection { #DBR-onetl-connection-file-connection-ftp-connection } + + + +::: onetl.connection.file_connection.ftp.FTP + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/ftps.md b/mddocs/docs/connection/file_connection/ftps.md new file mode 100644 index 000000000..a1129c2b6 --- /dev/null +++ b/mddocs/docs/connection/file_connection/ftps.md @@ -0,0 +1,33 @@ +# FTPS connection { #DBR-onetl-connection-file-connection-ftps-connection } + + + +::: onetl.connection.file_connection.ftps.FTPS + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/hdfs/connection.md b/mddocs/docs/connection/file_connection/hdfs/connection.md new file mode 100644 index 000000000..2e6efa8d0 --- /dev/null +++ 
b/mddocs/docs/connection/file_connection/hdfs/connection.md @@ -0,0 +1,33 @@ +# HDFS connection { #DBR-onetl-connection-file-connection-hdfs-connection-0 } + + + +::: onetl.connection.file_connection.hdfs.connection.HDFS + options: + members: + - get_current + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/hdfs/index.md b/mddocs/docs/connection/file_connection/hdfs/index.md new file mode 100644 index 000000000..538f621ae --- /dev/null +++ b/mddocs/docs/connection/file_connection/hdfs/index.md @@ -0,0 +1,9 @@ +# HDFS { #DBR-onetl-connection-file-connection-hdfs } + +## Connection { #DBR-onetl-connection-file-connection-hdfs-connection-1 } + +* [HDFS connection][DBR-onetl-connection-file-connection-hdfs-connection-0] + +## For developers { #DBR-onetl-connection-file-connection-hdfs-for-developers } + +* [HDFS Slots][DBR-onetl-connection-file-connection-hdfs-slots] diff --git a/mddocs/docs/connection/file_connection/hdfs/slots.md b/mddocs/docs/connection/file_connection/hdfs/slots.md new file mode 100644 index 000000000..33b501a1a --- /dev/null +++ b/mddocs/docs/connection/file_connection/hdfs/slots.md @@ -0,0 +1,24 @@ +# HDFS Slots { #DBR-onetl-connection-file-connection-hdfs-slots } + + + +::: onetl.connection.file_connection.hdfs.slots.HDFSSlots + options: + members: + - normalize_cluster_name + - normalize_namenode_host + - get_known_clusters + - get_cluster_namenodes + - get_current_cluster + - get_webhdfs_port + - is_namenode_active diff --git a/mddocs/docs/connection/file_connection/index.md b/mddocs/docs/connection/file_connection/index.md new file mode 100644 index 000000000..37cb92acc --- /dev/null +++ b/mddocs/docs/connection/file_connection/index.md @@ -0,0 +1,9 @@ +# File Connections { 
#DBR-onetl-connection-file-connection-file-connections } + +* [FTP][DBR-onetl-connection-file-connection-ftp-connection] +* [FTPS][DBR-onetl-connection-file-connection-ftps-connection] +* [HDFS][DBR-onetl-connection-file-connection-hdfs] +* [Samba][DBR-onetl-connection-file-connection-samba-connection] +* [SFTP][DBR-onetl-connection-file-connection-sftp-connection] +* [S3][DBR-onetl-connection-file-connection-s3-connection] +* [Webdav][DBR-onetl-connection-file-connection-webdav-connection] diff --git a/mddocs/docs/connection/file_connection/s3.md b/mddocs/docs/connection/file_connection/s3.md new file mode 100644 index 000000000..9d9682d11 --- /dev/null +++ b/mddocs/docs/connection/file_connection/s3.md @@ -0,0 +1,32 @@ +# S3 connection { #DBR-onetl-connection-file-connection-s3-connection } + + + +::: onetl.connection.file_connection.s3.S3 + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/samba.md b/mddocs/docs/connection/file_connection/samba.md new file mode 100644 index 000000000..5d5505cad --- /dev/null +++ b/mddocs/docs/connection/file_connection/samba.md @@ -0,0 +1,31 @@ +# Samba connection { #DBR-onetl-connection-file-connection-samba-connection } + + + +::: onetl.connection.file_connection.samba.Samba + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_file + - list_dir + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/sftp.md b/mddocs/docs/connection/file_connection/sftp.md new file mode 100644 index 000000000..05d42ae7b --- /dev/null +++ b/mddocs/docs/connection/file_connection/sftp.md @@ -0,0 +1,33 @@ +# SFTP connection { 
#DBR-onetl-connection-file-connection-sftp-connection } + + + +::: onetl.connection.file_connection.sftp.SFTP + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_connection/webdav.md b/mddocs/docs/connection/file_connection/webdav.md new file mode 100644 index 000000000..39d974278 --- /dev/null +++ b/mddocs/docs/connection/file_connection/webdav.md @@ -0,0 +1,32 @@ +# WebDAV connection { #DBR-onetl-connection-file-connection-webdav-connection } + + + +::: onetl.connection.file_connection.webdav.WebDAV + options: + members: + - __init__ + - check + - path_exists + - is_file + - is_dir + - get_stat + - resolve_dir + - resolve_file + - create_dir + - remove_file + - remove_dir + - rename_file + - list_dir + - walk + - download_file + - upload_file diff --git a/mddocs/docs/connection/file_df_connection/base.md b/mddocs/docs/connection/file_df_connection/base.md new file mode 100644 index 000000000..d0692d67f --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/base.md @@ -0,0 +1,20 @@ +# Base interface { #DBR-onetl-connection-file-df-connection-base-interface } + + + +::: onetl.base.base_file_df_connection.BaseFileDFConnection + options: + members: + - check + - check_if_format_supported + - read_files_as_df + - write_df_as_files diff --git a/mddocs/docs/connection/file_df_connection/index.md b/mddocs/docs/connection/file_df_connection/index.md new file mode 100644 index 000000000..bb2389bd8 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/index.md @@ -0,0 +1,15 @@ +# File DataFrame Connections { #DBR-onetl-connection-file-df-connection-file-dataframe-connections } + +* [Spark LocalFS][DBR-onetl-connection-file-df-connection-spark-local-fs-spark-localfs] +* [Spark 
HDFS][DBR-onetl-connection-file-df-connection-spark-hdfs] + * [Prerequisites][DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites] + * [Connection][DBR-onetl-connection-file-df-connection-spark-hdfs-connection] + * [Slots][DBR-onetl-connection-file-df-connection-spark-hdfs-slots] +* [Spark S3][DBR-onetl-connection-file-df-connection-spark-s3] + * [Prerequisites][DBR-onetl-connection-file-df-connection-spark-s3-prerequisites] + * [Connection][DBR-onetl-connection-file-df-connection-spark-s3-connection] + * [Troubleshooting][DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting] + +## For developers { #DBR-onetl-connection-file-df-connection-for-developers } + +* [Base interface][DBR-onetl-connection-file-df-connection-base-interface] diff --git a/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md new file mode 100644 index 000000000..c3a66239b --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md @@ -0,0 +1,18 @@ +# Spark HDFS Connection { #DBR-onetl-connection-file-df-connection-spark-hdfs-connection } + + + +::: onetl.connection.file_df_connection.spark_hdfs.connection.SparkHDFS + options: + members: + - check + - get_current diff --git a/mddocs/docs/connection/file_df_connection/spark_hdfs/index.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/index.md new file mode 100644 index 000000000..969d8cbc2 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/index.md @@ -0,0 +1,8 @@ +# Spark HDFS { #DBR-onetl-connection-file-df-connection-spark-hdfs } + +* [Prerequisites][DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites] +* [Connection][DBR-onetl-connection-file-df-connection-spark-hdfs-connection] + +## For developers { #DBR-onetl-connection-file-df-connection-spark-hdfs-for-developers } + +* [Slots][DBR-onetl-connection-file-df-connection-spark-hdfs-slots] diff --git 
a/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md new file mode 100644 index 000000000..6d4eadcc1 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md @@ -0,0 +1,44 @@ +# Prerequisites { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites-version-compatibility } + +- Hadoop versions: 2.x, 3.x (only with Hadoop 3.x libraries) +- Spark versions: 2.3.x - 3.5.x +- Java versions: 8 - 20 + +## Installing PySpark { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites-installing-pyspark } + +To use SparkHDFS connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. + +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Using Kerberos { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites-using-kerberos } + +Some of Hadoop managed clusters use Kerberos authentication. In this case, you should call [kinit](https://web.mit.edu/kerberos/krb5-1.12/doc/user/user_commands/kinit.html) command +**BEFORE** starting Spark session to generate Kerberos ticket. See [Kerberos installation instructions][DBR-onetl-install-kerberos-support]. 
+ +Sometimes it is also required to pass a keytab file to Spark config, allowing Spark executors to generate their own Kerberos tickets: + +=== "Spark 3" + + ```python + SparkSession.builder + .option("spark.kerberos.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.kerberos.principal", "user") + .option("spark.kerberos.keytab", "/path/to/keytab") + .getOrCreate() + ``` + +=== "Spark 2" + + ```python + SparkSession.builder + .option("spark.yarn.access.hadoopFileSystems", "hdfs://namenode1.domain.com:9820,hdfs://namenode2.domain.com:9820") + .option("spark.yarn.principal", "user") + .option("spark.yarn.keytab", "/path/to/keytab") + .getOrCreate() + ``` + +See [Spark security documentation](https://spark.apache.org/docs/latest/security.html#kerberos) +for more details. diff --git a/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md new file mode 100644 index 000000000..31402b047 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md @@ -0,0 +1,24 @@ +# Spark HDFS Slots { #DBR-onetl-connection-file-df-connection-spark-hdfs-slots } + + + +::: onetl.connection.file_df_connection.spark_hdfs.slots.SparkHDFSSlots + options: + members: + - normalize_cluster_name + - normalize_namenode_host + - get_known_clusters + - get_cluster_namenodes + - get_current_cluster + - get_ipc_port + - is_namenode_active diff --git a/mddocs/docs/connection/file_df_connection/spark_local_fs.md b/mddocs/docs/connection/file_df_connection/spark_local_fs.md new file mode 100644 index 000000000..447b57a3d --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_local_fs.md @@ -0,0 +1,17 @@ +# Spark LocalFS { #DBR-onetl-connection-file-df-connection-spark-local-fs-spark-localfs } + + + +::: onetl.connection.file_df_connection.spark_local_fs.SparkLocalFS + options: + members: + - check diff --git 
a/mddocs/docs/connection/file_df_connection/spark_s3/connection.md b/mddocs/docs/connection/file_df_connection/spark_s3/connection.md new file mode 100644 index 000000000..36f2866e7 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_s3/connection.md @@ -0,0 +1,20 @@ +# Spark S3 Connection { #DBR-onetl-connection-file-df-connection-spark-s3-connection } + + + +::: onetl.connection.file_df_connection.spark_s3.connection.SparkS3 + options: + members: + - check + - close + - get_packages + - get_exclude_packages diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/index.md b/mddocs/docs/connection/file_df_connection/spark_s3/index.md new file mode 100644 index 000000000..f2eee4316 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_s3/index.md @@ -0,0 +1,5 @@ +# Spark S3 { #DBR-onetl-connection-file-df-connection-spark-s3 } + +* [Prerequisites][DBR-onetl-connection-file-df-connection-spark-s3-prerequisites] +* [Connection][DBR-onetl-connection-file-df-connection-spark-s3-connection] +* [Troubleshooting][DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting] diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md b/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md new file mode 100644 index 000000000..5c4ee6626 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md @@ -0,0 +1,60 @@ +# Prerequisites { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites } + +## Version Compatibility { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-version-compatibility } + +- Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) +- Java versions: 8 - 20 + +## Installing PySpark { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-installing-pyspark } + +To use SparkS3 connector you should have PySpark installed (or injected to `sys.path`) +BEFORE creating the connector instance. 
+ +See [installation instruction][DBR-onetl-install-spark] for more details. + +## Connecting to S3 { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-connecting-to-s3 } + +### Bucket access style { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-bucket-access-style } + +AWS and some other S3 cloud providers allow bucket access using domain style only, e.g. `https://mybucket.s3provider.com`. + +Other implementations, like Minio, by default allow path style access only, e.g. `https://s3provider.com/mybucket` +(see [MINIO_DOMAIN](https://min.io/docs/minio/linux/reference/minio-server/minio-server.html#envvar.MINIO_DOMAIN)). + +You should set `path.style.access` to `True` or `False`, to choose the preferred style. + +### Authentication { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-authentication } + +Different S3 instances can use different authentication methods, like: + +- `access_key + secret_key` (or username + password) +- `access_key + secret_key + session_token` + +Usually these are just passed to the SparkS3 constructor: + +```python +SparkS3( + access_key=..., + secret_key=..., + session_token=..., +) +``` + +But some S3 cloud providers, like AWS, may require custom credential providers. You can pass them like: + +```python +SparkS3( + extra={ + # provider class + "aws.credentials.provider": "org.apache.hadoop.fs.s3a.auth.AssumedRoleCredentialProvider", + # other options, if needed + "assumed.role.arn": "arn:aws:iam::90066806600238:role/s3-restricted", + }, +) +``` + +See [Hadoop-AWS](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Changing_Authentication_Providers) documentation. + +## Troubleshooting { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-troubleshooting } + +See [troubleshooting guide][DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting]. 
diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md new file mode 100644 index 000000000..c5ec2e9e0 --- /dev/null +++ b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md @@ -0,0 +1,363 @@ +# Spark S3 Troubleshooting { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting } + +!!! note + + General guide: [Troubleshooting][DBR-onetl-troubleshooting]. + +More details: + +- [Hadoop AWS Troubleshooting Guide](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/troubleshooting_s3a.html) +- [Hadoop AWS Performance Guide](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html) +- [Spark integration with Cloud Infrastructures](https://spark.apache.org/docs/latest/cloud-integration.html) + +## `SparkS3.check()` and other methods hang { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-sparks3-check-and-other-methods-hang } + +### Details { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-details } + +S3 may not respond to connection attempts for a long time if it's under heavy load. +To handle this, the Hadoop AWS library has a retry mechanism. By default it retries 7 times with a 500ms interval. + +Hadoop AWS is based on the AWS SDK library, which also has a retry mechanism. This mechanism is not disabled because it handles different +errors than Hadoop AWS, so they complement each other. The default number of attempts in the AWS SDK is 20 with a minimal 5s interval, +which is exponentially increasing with each failed attempt. + +It is not a problem if the S3 source is not accessible at all, e.g. the hostname cannot be resolved, or the port is not opened. +These errors are not recoverable, and the retry mechanism is not activated. + +But errors like SSL issues are considered recoverable, and this causes retries of retries over an increasing interval. 
+So user is waiting for [almost 15 minutes](https://issues.apache.org/jira/browse/HADOOP-18839) just to get exception message. + +### How to determine reason { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-how-to-determine-reason } + +#### Make logging more verbose { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-make-logging-more-verbose } + +Change Spark session log level to [DEBUG][DBR-onetl-troubleshooting-spark-troubleshooting] to print result of each attempt. +Resulting logs will look like this + +??? note "See log" + + ```text + 23/08/03 11:25:10 DEBUG S3AFileSystem: Using S3ABlockOutputStream with buffer = disk; block=67108864; queue limit=4 + 23/08/03 11:25:10 DEBUG S3Guard: Metastore option source [core-default.xml] + 23/08/03 11:25:10 DEBUG S3Guard: Using NullMetadataStore metadata store for s3a filesystem + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3Guard is disabled on this bucket: test-bucket + 23/08/03 11:25:10 DEBUG DirectoryPolicyImpl: Directory markers will be deleted + 23/08/03 11:25:10 DEBUG S3AFileSystem: Directory marker retention policy is DirectoryMarkerRetention{policy='delete'} + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.multipart.purge.age is 86400 + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.bulk.delete.page.size is 250 + 23/08/03 11:25:10 DEBUG FileSystem: Creating FS s3a://test-bucket/fake: duration 0:01.029s + 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter op_is_directory by 1 with final value 1 + 23/08/03 11:25:10 DEBUG S3AFileSystem: Getting path status for s3a://test-bucket/fake (fake); needEmptyDirectory=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3GetFileStatus s3a://test-bucket/fake + 23/08/03 11:25:10 DEBUG S3AFileSystem: LIST List test-bucket:/fake/ delimiter=/ keys=2 requester pays=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: Starting: LIST + 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter object_list_request by 1 with final 
value 1 + 23/08/03 11:25:10 DEBUG AWSCredentialProviderList: Using credentials from SimpleAWSCredentialsProvider + 23/08/03 11:25:10 DEBUG request: Sending Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 Canonical Request: '"GET + / + delimiter=%2F&fetch-owner=false&list-type=2&max-keys=2&prefix=fake%2F + amz-sdk-invocation-id:e6d62603-96e4-a80f-10a1-816e0822bc71 + amz-sdk-request:attempt=1;max=21 + amz-sdk-retry:0/0/500 + content-type:application/octet-stream + host:test-bucket.localhost:9000 + user-agent:Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy + x-amz-content-sha256:UNSIGNED-PAYLOAD + x-amz-date:20230803T112510Z + + amz-sdk-invocation-id;amz-sdk-request;amz-sdk-retry;content-type;host;user-agent;x-amz-content-sha256;x-amz-date + UNSIGNED-PAYLOAD" + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 String to Sign: '"AWS4-HMAC-SHA256 + 20230803T112510Z + 20230803/us-east-1/s3/aws4_request + 31a317bb7f6d97248dd0cf03429d701cbb3e29ce889cfbb98ba7a34c57a3bfba" + 23/08/03 11:25:10 DEBUG AWS4Signer: Generating a new signing key as the signing key not available in the cache for the date 1691020800000 + 23/08/03 11:25:10 DEBUG RequestAddCookies: CookieSpec selected: default + 23/08/03 11:25:10 DEBUG RequestAuthCache: Auth cache not set in the context + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection request: [route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 
96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection leased: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 1 of 96; total allocated: 1 of 96] + 23/08/03 11:25:10 DEBUG MainClientExec: Opening connection {s}->https://test-bucket.localhost:9000 + 23/08/03 11:25:10 DEBUG DefaultHttpClientConnectionOperator: Connecting to test-bucket.localhost/127.0.0.1:9000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Connecting socket to test-bucket.localhost/127.0.0.1:9000 with timeout 5000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled protocols: [TLSv1.2] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled cipher suites:[TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, TLS_RSA_WITH_AES_256_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384, TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, TLS_DHE_DSS_WITH_AES_256_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_DSS_WITH_AES_256_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_DSS_WITH_AES_128_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_128_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_DSS_WITH_AES_128_CBC_SHA, TLS_EMPTY_RENEGOTIATION_INFO_SCSV] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Starting handshake + 23/08/03 11:25:10 DEBUG ClientConnectionManagerFactory: + java.lang.reflect.InvocationTargetException + at 
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76) + at com.amazonaws.http.conn.$Proxy32.connect(Unknown Source) + at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:393) + at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:236) + at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) + at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) + at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) + at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) + at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) + at 
com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5397) + at com.amazonaws.services.s3.AmazonS3Client.listObjectsV2(AmazonS3Client.java:971) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listObjects$11(S3AFileSystem.java:2595) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377) + at org.apache.hadoop.fs.s3a.S3AFileSystem.listObjects(S3AFileSystem.java:2586) + at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3832) + at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$isDirectory$35(S3AFileSystem.java:4724) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356) + at org.apache.hadoop.fs.s3a.S3AFileSystem.isDirectory(S3AFileSystem.java:4722) + at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:54) + at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) + at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229) + at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211) + at 
scala.Option.getOrElse(Option.scala:189) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) + at py4j.Gateway.invoke(Gateway.java:282) + at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + at py4j.commands.CallCommand.execute(CallCommand.java:79) + at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) + at py4j.ClientServerConnection.run(ClientServerConnection.java:106) + at java.lang.Thread.run(Thread.java:748) + Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message + at sun.security.ssl.SSLSocketInputRecord.handleUnknownRecord(SSLSocketInputRecord.java:448) + at sun.security.ssl.SSLSocketInputRecord.decode(SSLSocketInputRecord.java:184) + at sun.security.ssl.SSLTransport.decode(SSLTransport.java:109) + at sun.security.ssl.SSLSocketImpl.decode(SSLSocketImpl.java:1383) + at sun.security.ssl.SSLSocketImpl.readHandshakeRecord(SSLSocketImpl.java:1291) + at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:435) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:436) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:384) + at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142) + at 
com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:376) + ... 58 more + 23/08/03 11:25:10 DEBUG DefaultManagedHttpClientConnection: http-outgoing-0: Shutdown connection + 23/08/03 11:25:10 DEBUG MainClientExec: Connection discarded + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection released: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Unable to execute HTTP request: Unsupported or unrecognized SSL message Request will be retried. + 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 + ``` + +#### Change number of retries { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-change-number-of-retries } + +You can also change number of retries performed by both libraries using `extra` parameter: + +```python +spark_s3 = SparkS3( + ..., + extra={ + "attempts.maximum": 1, + "retry.limit": 1, + }, +) +``` + +So accessing S3 will fail almost immediately if there is any error. 
+ +### Most common mistakes { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-most-common-mistakes } + +#### No network access { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-no-network-access } + +```text +Caused by: java.net.ConnectException: Connection refused +``` + +Mostly caused by: + +- Trying to access a port number which the S3 server does not listen on +- You're trying to access a host which is unreachable from your network (e.g. running behind some proxy or VPN) +- There are some firewall restrictions for accessing a specific host or port + +#### Using HTTPS protocol for HTTP port { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-using-https-protocol-for-http-port } + +```text +Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message +``` + +By default, SparkS3 uses HTTPS protocol for connection. +If you change the port number, this does not lead to changing the protocol: + +```python +spark_s3 = SparkS3(host="s3provider.com", port=8080, ...) +``` + +You should pass the protocol explicitly: + +```python +spark_s3 = SparkS3(host="s3provider.com", port=8080, protocol="http", ...) +``` + +#### SSL certificate is self-signed { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-ssl-certificate-is-self-signed } + +```text +sun.security.provider.certpath.SunCertPathBuilderException: unable to find valid certification path to requested target +``` + +To connect to an HTTPS port with a self-signed certificate, you should +[add the certificate chain to the Java TrustedStore](https://stackoverflow.com/questions/373295/digital-certificate-how-to-import-cer-file-in-to-truststore-file-using). + +Another option is to disable the SSL check: + +```python +spark_s3 = SparkS3( + ..., + extra={ + "connection.ssl.enabled": False, + }, +) +``` + +But it is **NOT** recommended. 
+ +#### Accessing S3 without domain-style access support { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-accessing-s3-without-domain-style-access-style-support } + +```text +Caused by: java.net.UnknownHostException: my-bucket.s3provider.com +``` + +To use path-style access, use the option below: + +```python +spark_s3 = SparkS3( + host="s3provider.com", + bucket="my-bucket", + ..., + extra={ + "path.style.access": True, + }, +) +``` + +## Slow or unstable writing to S3 { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-slow-or-unstable-writing-to-s3 } + +Hadoop AWS allows using different write strategies for different S3 implementations, depending +on the list of features supported by the server. + +These strategies are called [committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committers.html). +There are [different types of committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committers.html#Switching_to_an_S3A_Committer): + +- `file` (default) +- `directory` +- `partitioned` +- `magic` + +### `file` committer { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-file-committer } + +This committer is quite slow and unstable, so using it is not recommended: + +```text +WARN AbstractS3ACommitterFactory: Using standard FileOutputCommitter to commit work. This is slow and potentially unsafe. +``` + +This is caused by the fact that it creates files in a temp directory on the remote filesystem, and after all of them are written successfully, +they are moved to the target directory on the same remote filesystem. + +This is not an issue for HDFS, which supports file move operations and also supports renaming a directory +as an atomic operation with `O(1)` time complexity. + +But S3 supports only file copying, so moving is performed via copy + delete. +It also does not support an atomic directory rename operation. Instead, renaming files with the same prefix has time complexity `O(n)`. 
+ +### `directory` and `partitioned` committers { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-directory-and-partitioned-committers } + +These are [staging committers](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/committer_architecture.html), +meaning that they create temp directories on the local filesystem, and after all files are written successfully, +they will be uploaded to S3. Local filesystems do support file moving and directory renaming, +so these committers do not have the issues that the `file` committer has. + +But they both require free space on the local filesystem, and this may be an issue if the user needs to write a large amount of data. +This can also be an issue for container environments, like Kubernetes, where resources should be allocated before starting a container. + +### `magic` committer { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-magic-committer } + +This committer uses the multipart upload feature of the S3 API, allowing the client to create multiple files +and, after all of them are written successfully, finish the transaction. Before the transaction is finished, +files will not be accessible by other clients. + +Because it requires neither file move operations nor an atomic directory rename, +the upload process is done in the most efficient way S3 supports. +This [drastically increases writing performance](https://spot.io/blog/improve-apache-spark-performance-with-the-s3-magic-committer/). + +To use this committer, set the [following properties](https://github.com/apache/spark/pull/32518) while creating the Spark session. 
+
+=== "S3 is your main distributed filesystem (Spark on Kubernetes)"
+
+    ```python
+    # https://issues.apache.org/jira/browse/SPARK-23977
+    # https://spark.apache.org/docs/latest/cloud-integration.html#committing-work-into-cloud-storage-safely-and-fast
+    spark = (
+        SparkSession.builder.appName("spark-app-name")
+        .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true")
+        .config("spark.hadoop.fs.s3a.committer.name", "magic")
+        .config("spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a", "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory")
+        .config("spark.sql.parquet.output.committer.class", "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter")
+        .config("spark.sql.sources.commitProtocolClass", "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol")
+        .getOrCreate()
+    )
+    ```
+
+=== "HDFS is your main distributed filesystem (Spark on Hadoop)"
+
+    ```python
+    # https://community.cloudera.com/t5/Support-Questions/spark-sql-sources-partitionOverwriteMode-dynamic-quot-not/m-p/343483/highlight/true
+    spark = (
+        SparkSession.builder.appName("spark-app-name")
+        .config("spark.hadoop.fs.s3a.committer.magic.enabled", "true")
+        .config("spark.hadoop.fs.s3a.committer.name", "magic")
+        .getOrCreate()
+    )
+    ```
+
+!!! warning
+
+    `magic` committer requires the S3 implementation to have strong consistency - the file upload API returns a response only if the file was written to a sufficient number of cluster nodes, and any cluster node error does not lead to missing or corrupted files.
+
+    Some S3 implementations do have strong consistency (like [AWS S3](https://aws.amazon.com/ru/blogs/aws/amazon-s3-update-strong-read-after-write-consistency/) and [MinIO](https://blog.min.io/migrating-hdfs-to-object-storage/)), some do not. Please contact your S3 provider to get information about S3 implementation consistency.
+
+!!! warning
+
+    `magic` committer does not support `if_exists="replace_overlapping_partitions"`.
+    Either use another `if_exists` value, or use the `partitioned` committer.
+
+### See also { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-see-also }
+
+- [directory.marker.retention="keep"](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/directory_markers.html)
+
+## Slow reading from S3 { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-slow-reading-from-s3 }
+
+Please read the following documentation:
+
+- [prefetch.enabled](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/prefetching.html)
+- [experimental.input.fadvise](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html#Improving_data_input_performance_through_fadvise)
+- [Parquet and ORC I/O settings](https://spark.apache.org/docs/latest/cloud-integration.html#parquet-io-settings)
+
+If you're reading data from row-based formats, like [CSV][DBR-onetl-file-df-file-formats-csv], prefer
+[experimental.input.fadvise="sequential" with increased readahead.range](https://issues.apache.org/jira/browse/HADOOP-17789?focusedCommentId=17383559#comment-17383559).
+ +But for other file formats, especially using compression, prefer +[experimental.input.fadvise="normal"](https://issues.apache.org/jira/browse/HADOOP-17789?focusedCommentId=17383743#comment-17383743) diff --git a/mddocs/docs/connection/index.md b/mddocs/docs/connection/index.md new file mode 100644 index 000000000..9bb177d96 --- /dev/null +++ b/mddocs/docs/connection/index.md @@ -0,0 +1,34 @@ +# Connection { #DBR-onetl-connection } + +## DB Connection { #DBR-onetl-connection-db-connection } + +* [DB Connections](db_connection/index.md) + * [Clickhouse](db_connection/clickhouse/index.md) + * [Greenplum](db_connection/greenplum/index.md) + * [Kafka](db_connection/kafka/index.md) + * [Iceberg](db_connection/iceberg/index.md) + * [Hive](db_connection/hive/index.md) + * [MongoDB](db_connection/mongodb/index.md) + * [MSSQL](db_connection/mssql/index.md) + * [MySQL](db_connection/mysql/index.md) + * [Oracle](db_connection/oracle/index.md) + * [Postgres](db_connection/postgres/index.md) + +## File Connection { #DBR-onetl-connection-file-connection } + +* [File Connections](file_connection/index.md) + * [FTP](file_connection/ftp.md) + * [FTPS](file_connection/ftps.md) + * [HDFS](file_connection/hdfs/index.md) + * [Samba](file_connection/samba.md) + * [SFTP](file_connection/sftp.md) + * [S3](file_connection/s3.md) + * [Webdav](file_connection/webdav.md) + +## File DataFrame Connection { #DBR-onetl-connection-file-dataframe-connection } + +* [File DataFrame Connections](file_df_connection/index.md) + * [Spark LocalFS](file_df_connection/spark_local_fs.md) + * [Spark HDFS](file_df_connection/spark_hdfs/index.md) + * [Spark S3](file_df_connection/spark_s3/index.md) + * [Base interface](file_df_connection/base.md) diff --git a/mddocs/docs/contributing.md b/mddocs/docs/contributing.md new file mode 100644 index 000000000..406d17f73 --- /dev/null +++ b/mddocs/docs/contributing.md @@ -0,0 +1,398 @@ +# Contributing Guide { #DBR-onetl-contributing-guide } + +Welcome! 
There are many ways to contribute, including submitting bug
+reports, improving documentation, submitting feature requests, reviewing
+new submissions, or contributing code that can be incorporated into the
+project.
+
+## Limitations { #DBR-onetl-contributing-limitations }
+
+We should keep these items in mind during development:
+
+* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions.
+* Different users use onETL in different ways - some use only DB connectors, some only files. Connector-specific dependencies should be optional.
+* Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition.
+
+## Initial setup for local development { #DBR-onetl-contributing-initial-setup-for-local-development }
+
+### Install Git { #DBR-onetl-contributing-install-git }
+
+Please follow this [instruction](https://docs.github.com/en/get-started/quickstart/set-up-git).
+
+### Create a fork { #DBR-onetl-contributing-create-a-fork }
+
+If you are not a member of a development team building onETL, you should create a fork before making any changes.
+
+Please follow this [instruction](https://docs.github.com/en/get-started/quickstart/fork-a-repo).
+ +### Clone the repo { #DBR-onetl-contributing-clone-the-repo } + +Open terminal and run these commands: + +```bash +git clone git@github.com:myuser/onetl.git -b develop + +cd onetl +``` + +### Setup environment { #DBR-onetl-contributing-setup-environment } + +Create virtualenv and install dependencies: + +```bash +python -m venv venv +source venv/bin/activate +pip install -U wheel +pip install -U pip setuptools +pip install -U \ + -r requirements/core.txt \ + -r requirements/ftp.txt \ + -r requirements/hdfs.txt \ + -r requirements/kerberos.txt \ + -r requirements/s3.txt \ + -r requirements/sftp.txt \ + -r requirements/webdav.txt \ + -r requirements/dev.txt \ + -r requirements/docs.txt \ + -r requirements/tests/base.txt \ + -r requirements/tests/clickhouse.txt \ + -r requirements/tests/kafka.txt \ + -r requirements/tests/mongodb.txt \ + -r requirements/tests/mssql.txt \ + -r requirements/tests/mysql.txt \ + -r requirements/tests/postgres.txt \ + -r requirements/tests/oracle.txt \ + -r requirements/tests/pydantic-2.txt \ + -r requirements/tests/spark-3.5.5.txt + +# TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 +pip install sphinx-plantuml --no-deps +``` + +### Enable pre-commit hooks { #DBR-onetl-contributing-enable-pre-commit-hooks } + +Install pre-commit hooks: + +```bash +pre-commit install --install-hooks +``` + +Test pre-commit hooks run: + +```bash +pre-commit run +``` + +## How to { #DBR-onetl-contributing-how-to } + +### Run tests locally { #DBR-onetl-contributing-run-tests-locally } + +#### Using docker-compose { #DBR-onetl-contributing-using-docker-compose } + +Build image for running tests: + +```bash +docker-compose build +``` + +Start all containers with dependencies: + +```bash +docker-compose --profile all up -d +``` + +You can run limited set of dependencies: + +```bash +docker-compose --profile mongodb up -d +``` + +Run tests: + +```bash +docker-compose run --rm onetl ./run_tests.sh +``` + +You can pass additional arguments, 
they will be passed to pytest: + +```bash +docker-compose run --rm onetl ./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +``` + +You can run interactive bash session and use it: + +```bash +docker-compose run --rm onetl bash + +./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +``` + +See logs of test container: + +```bash +docker-compose logs -f onetl +``` + +Stop all containers and remove created volumes: + +```bash +docker-compose --profile all down -v +``` + +#### Without docker-compose { #DBR-onetl-contributing-without-docker-compose } + +!!! warning + + To run HDFS tests locally you should add the following line to your `/etc/hosts` (file path depends on OS): + + ```default + # HDFS server returns container hostname as connection address, causing error in DNS resolution + 127.0.0.1 hdfs + ``` + +!!! note + + To run Oracle tests you need to install [Oracle instantclient](https://www.oracle.com/database/technologies/instant-client.html), + and pass its path to `ONETL_ORA_CLIENT_PATH` and `LD_LIBRARY_PATH` environment variables, + e.g. `ONETL_ORA_CLIENT_PATH=/path/to/client64/lib`. + + It may also require to add the same path into `LD_LIBRARY_PATH` environment variable + +!!! note + + To run Greenplum tests, you should: + + * Download [VMware Greenplum connector for Spark][DBR-onetl-connection-db-connection-greenplum-prerequisites] + * Either move it to `~/.ivy2/jars/`, or pass file path to `CLASSPATH` + * Set environment variable `ONETL_GP_PACKAGE_VERSION=local`. 
+ +Start all containers with dependencies: + +```bash +docker-compose --profile all up -d +``` + +You can run limited set of dependencies: + +```bash +docker-compose --profile mongodb up -d +``` + +Load environment variables with connection properties: + +```bash +source .env.local +``` + +Run tests: + +```bash +./run_tests.sh +``` + +You can pass additional arguments, they will be passed to pytest: + +```bash +./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +``` + +Stop all containers and remove created volumes: + +```bash +docker-compose --profile all down -v +``` + +### Build documentation { #DBR-onetl-contributing-build-documentation } + +Build documentation using Sphinx: + +```bash +cd docs +make html +``` + +Then open in browser `docs/_build/index.html`. + +## Review process { #DBR-onetl-contributing-review-process } + +Please create a new GitHub issue for any significant changes and +enhancements that you wish to make. Provide the feature you would like +to see, why you need it, and how it will work. Discuss your ideas +transparently and get community feedback before proceeding. + +Significant Changes that you wish to contribute to the project should be +discussed first in a GitHub issue that clearly outlines the changes and +benefits of the feature. + +Small Changes can directly be crafted and submitted to the GitHub +Repository as a Pull Request. + +### Create pull request { #DBR-onetl-contributing-create-pull-request } + +Commit your changes: + +```bash +git commit -m "Commit message" +git push +``` + +Then open Github interface and [create pull request](https://docs.github.com/en/get-started/quickstart/contributing-to-projects#making-a-pull-request). +Please follow guide from PR body template. + +After pull request is created, it get a corresponding number, e.g. 123 (`pr_number`). + +### Write release notes { #DBR-onetl-contributing-write-release-notes } + +`onETL` uses [towncrier](https://pypi.org/project/towncrier/) +for changelog management. 
+
+To submit a change note about your PR, add a text file into the
+[docs/changelog/next_release](changelog/NEXT_RELEASE.md) folder. It should contain an
+explanation of what applying this PR will change in the way
+end-users interact with the project. One sentence is usually
+enough but feel free to add as many details as you feel necessary
+for the users to understand what it means.
+
+**Use the past tense** for the text in your fragment because,
+combined with others, it will be a part of the “news digest”
+telling the readers **what changed** in a specific version of
+the library *since the previous version*.
+
+You should also use
+reStructuredText syntax for highlighting code (inline or block),
+linking parts of the docs or external sites.
+If you wish to sign your change, feel free to add `-- by
+:user:`github-username`` at the end (replace `github-username`
+with your own!).
+
+Finally, name your file following the convention that Towncrier
+understands: it should start with the number of an issue or a
+PR followed by a dot, then add a patch type, like `feature`,
+`doc`, `misc` etc., and add `.rst` as a suffix. If you
+need to add more than one fragment, you may add an optional
+sequence number (delimited with another period) between the type
+and the suffix.
+
+In general the name will follow the `<pr_number>.<category>.rst` pattern,
+where the categories are:
+
+* `feature`: Any new feature
+* `bugfix`: A bug fix
+* `improvement`: An improvement
+* `doc`: A change to the documentation
+* `dependency`: Dependency-related changes
+* `misc`: Changes internal to the repo like CI, test and build changes
+
+A pull request may have more than one of these components, for example
+a code change may introduce a new feature that deprecates an old
+feature, in which case two fragments should be added. It is not
+necessary to make a separate documentation fragment for documentation
+changes accompanying the relevant code changes.
+ +#### Examples for adding changelog entries to your Pull Requests { #DBR-onetl-contributing-examples-for-adding-changelog-entries-to-your-pull-requests } + +```rst title="docs/changelog/next_release/1234.doc.1.rst" +Added a `:github:user:` role to Sphinx config -- by :github:user:`someuser` +``` + +```rst title="docs/changelog/next_release/2345.bugfix.rst" +Fixed behavior of `WebDAV` connector -- by :github:user:`someuser` +``` + +```rst +Added support of `timeout` in `S3` connector +-- by :github:user:`someuser`, :github:user:`anotheruser` and :github:user:`otheruser` +``` + +#### How to skip change notes check? { #DBR-onetl-contributing-how-to-skip-change-notes-check } + +Just add `ci:skip-changelog` label to pull request. + +!!! tip + + See [pyproject.toml](../../pyproject.toml) for all available categories (`tool.towncrier.type`). + +#### Release Process { #DBR-onetl-contributing-release-process } + +Before making a release from the `develop` branch, follow these steps: + +1. Checkout to `develop` branch and update it to the actual state + +```bash +git checkout develop +git pull -p +``` + +2. Backup `NEXT_RELEASE.rst` + +```bash +cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst" +``` + +3. Build the Release notes with Towncrier + +```bash +VERSION=$(cat onetl/VERSION) +towncrier build "--version=${VERSION}" --yes +``` + +4. Change file with changelog to release version number + +```bash +mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst" +``` + +5. Remove content above the version number heading in the `${VERSION}.rst` file + +```bash +awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst" +``` + +6. Update Changelog Index + +```bash +awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst +``` + +7. 
Restore `NEXT_RELEASE.rst` file from backup + +```bash +mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst" +``` + +8. Commit and push changes to `develop` branch + +```bash +git add . +git commit -m "Prepare for release ${VERSION}" +git push +``` + +9. Merge `develop` branch to `master`, **WITHOUT** squashing + +```bash +git checkout master +git pull +git merge develop +git push +``` + +10. Add git tag to the latest commit in `master` branch + +```bash +git tag "$VERSION" +git push origin "$VERSION" +``` + +11. Update version in `develop` branch **after release**: + +```bash +git checkout develop + +NEXT_VERSION=$(echo "$VERSION" | awk -F. '/[0-9]+\./{$NF++;print}' OFS=.) +echo "$NEXT_VERSION" > onetl/VERSION + +git add . +git commit -m "Bump version" +git push +``` diff --git a/mddocs/docs/db/index.md b/mddocs/docs/db/index.md new file mode 100644 index 000000000..cbd6b7763 --- /dev/null +++ b/mddocs/docs/db/index.md @@ -0,0 +1,6 @@ +# DB { #DBR-onetl-db } + +## DB classes { #DBR-onetl-db-classes } + +* [DB Reader][DBR-onetl-db-reader] +* [DB Writer][DBR-onetl-db-writer] diff --git a/mddocs/docs/db/reader.md b/mddocs/docs/db/reader.md new file mode 100644 index 000000000..306853248 --- /dev/null +++ b/mddocs/docs/db/reader.md @@ -0,0 +1,19 @@ +# DB Reader { #DBR-onetl-db-reader } + + + +::: onetl.db.db_reader.db_reader.DBReader + options: + members: + - run + - has_data + - raise_if_no_data diff --git a/mddocs/docs/db/writer.md b/mddocs/docs/db/writer.md new file mode 100644 index 000000000..7015eeb22 --- /dev/null +++ b/mddocs/docs/db/writer.md @@ -0,0 +1,24 @@ +# DB Writer { #DBR-onetl-db-writer } + + + +::: onetl.db.db_writer.db_writer.DBWriter + options: + members: + - run diff --git a/mddocs/docs/file/file_downloader/file_downloader.md b/mddocs/docs/file/file_downloader/file_downloader.md new file mode 100644 index 000000000..cd38edefc --- /dev/null +++ b/mddocs/docs/file/file_downloader/file_downloader.md @@ -0,0 +1,27 @@ +# File 
Downloader { #DBR-onetl-file-downloader-0 } + + + +::: onetl.file.file_downloader.file_downloader.FileDownloader + options: + members: + - run + - view_files diff --git a/mddocs/docs/file/file_downloader/index.md b/mddocs/docs/file/file_downloader/index.md new file mode 100644 index 000000000..e04823bf9 --- /dev/null +++ b/mddocs/docs/file/file_downloader/index.md @@ -0,0 +1,5 @@ +# File Downloader { #DBR-onetl-file-downloader-1 } + +* [File Downloader][DBR-onetl-file-downloader-0] +* [File Downloader Options][DBR-onetl-file-downloader-options] +* [File Downloader Result][DBR-onetl-file-downloader-result] diff --git a/mddocs/docs/file/file_downloader/options.md b/mddocs/docs/file/file_downloader/options.md new file mode 100644 index 000000000..0978b0fd4 --- /dev/null +++ b/mddocs/docs/file/file_downloader/options.md @@ -0,0 +1,3 @@ +# File Downloader Options { #DBR-onetl-file-downloader-options } + +::: onetl.file.file_downloader.options.FileDownloaderOptions diff --git a/mddocs/docs/file/file_downloader/result.md b/mddocs/docs/file/file_downloader/result.md new file mode 100644 index 000000000..9fe3c714e --- /dev/null +++ b/mddocs/docs/file/file_downloader/result.md @@ -0,0 +1,40 @@ +# File Downloader Result { #DBR-onetl-file-downloader-result } + + + +::: onetl.file.file_downloader.result.DownloadResult + options: + members: + - successful + - failed + - skipped + - missing + - successful_count + - failed_count + - skipped_count + - missing_count + - total_count + - successful_size + - failed_size + - skipped_size + - total_size + - raise_if_failed + - raise_if_missing + - raise_if_skipped + - raise_if_empty + - is_empty + - raise_if_contains_zero_size + - details + - summary + - dict + - json diff --git a/mddocs/docs/file/file_filters/base.md b/mddocs/docs/file/file_filters/base.md new file mode 100644 index 000000000..02e39c13d --- /dev/null +++ b/mddocs/docs/file/file_filters/base.md @@ -0,0 +1,24 @@ +# Base interface { #DBR-onetl-file-filters-base-interface } 
+ + + +::: onetl.base.base_file_filter.BaseFileFilter + options: + members: + - match diff --git a/mddocs/docs/file/file_filters/exclude_dir.md b/mddocs/docs/file/file_filters/exclude_dir.md new file mode 100644 index 000000000..d642862a5 --- /dev/null +++ b/mddocs/docs/file/file_filters/exclude_dir.md @@ -0,0 +1,17 @@ +# ExcludeDir { #DBR-onetl-file-filters-exclude-dir-excludedir } + + + +::: onetl.file.filter.exclude_dir.ExcludeDir + options: + members: + - match diff --git a/mddocs/docs/file/file_filters/file_filter.md b/mddocs/docs/file/file_filters/file_filter.md new file mode 100644 index 000000000..337e9bef2 --- /dev/null +++ b/mddocs/docs/file/file_filters/file_filter.md @@ -0,0 +1,17 @@ +# File Filter (legacy) { #DBR-onetl-file-filters-file-filter-legacy } + + + +::: onetl.core.file_filter.file_filter.FileFilter + options: + members: + - match diff --git a/mddocs/docs/file/file_filters/file_mtime_filter.md b/mddocs/docs/file/file_filters/file_mtime_filter.md new file mode 100644 index 000000000..50db64095 --- /dev/null +++ b/mddocs/docs/file/file_filters/file_mtime_filter.md @@ -0,0 +1,17 @@ +# FileModifiedTime { #DBR-onetl-file-filters-file-mtime-filter-filemodifiedtime } + + + +::: onetl.file.filter.file_mtime.FileModifiedTime + options: + members: + - match diff --git a/mddocs/docs/file/file_filters/file_size_filter.md b/mddocs/docs/file/file_filters/file_size_filter.md new file mode 100644 index 000000000..13c102927 --- /dev/null +++ b/mddocs/docs/file/file_filters/file_size_filter.md @@ -0,0 +1,17 @@ +# FileSizeRange { #DBR-onetl-file-filters-file-size-filter-filesizerange } + + + +::: onetl.file.filter.file_size.FileSizeRange + options: + members: + - match diff --git a/mddocs/docs/file/file_filters/glob.md b/mddocs/docs/file/file_filters/glob.md new file mode 100644 index 000000000..797557607 --- /dev/null +++ b/mddocs/docs/file/file_filters/glob.md @@ -0,0 +1,17 @@ +# Glob { #DBR-onetl-file-filters-glob } + + + +::: onetl.file.filter.glob.Glob + 
options: + members: + - match diff --git a/mddocs/docs/file/file_filters/index.md b/mddocs/docs/file/file_filters/index.md new file mode 100644 index 000000000..21d97588e --- /dev/null +++ b/mddocs/docs/file/file_filters/index.md @@ -0,0 +1,16 @@ +# File Filters { #DBR-onetl-file-filters } + +* [Glob][DBR-onetl-file-filters-glob] +* [Regexp][DBR-onetl-file-filters-regexp] +* [ExcludeDir][DBR-onetl-file-filters-exclude-dir-excludedir] +* [FileSizeRange][DBR-onetl-file-filters-file-size-filter-filesizerange] +* [FileModifiedTime][DBR-onetl-file-filters-file-mtime-filter-filemodifiedtime] + +## Legacy { #DBR-onetl-file-filters-legacy } + +* [File Filter (legacy)][DBR-onetl-file-filters-file-filter-legacy] + +## For developers { #DBR-onetl-file-filters-for-developers } + +* [Base interface][DBR-onetl-file-filters-base-interface] +* [match_all_filters][DBR-onetl-file-filters-match-all-filters] diff --git a/mddocs/docs/file/file_filters/match_all_filters.md b/mddocs/docs/file/file_filters/match_all_filters.md new file mode 100644 index 000000000..98c3f32cf --- /dev/null +++ b/mddocs/docs/file/file_filters/match_all_filters.md @@ -0,0 +1,13 @@ +# match_all_filters { #DBR-onetl-file-filters-match-all-filters } + + + +::: onetl.file.filter.match_all_filters diff --git a/mddocs/docs/file/file_filters/regexp.md b/mddocs/docs/file/file_filters/regexp.md new file mode 100644 index 000000000..e0e47d479 --- /dev/null +++ b/mddocs/docs/file/file_filters/regexp.md @@ -0,0 +1,18 @@ +# Regexp { #DBR-onetl-file-filters-regexp } + + + + +::: onetl.file.filter.regexp.Regexp + options: + members: + - match diff --git a/mddocs/docs/file/file_limits/base.md b/mddocs/docs/file/file_limits/base.md new file mode 100644 index 000000000..eadf374e1 --- /dev/null +++ b/mddocs/docs/file/file_limits/base.md @@ -0,0 +1,28 @@ +# Base interface { #DBR-onetl-file-limits-base-interface } + + + +::: onetl.base.base_file_limit.BaseFileLimit + options: + members: + - reset + - stops_at + - is_reached diff 
--git a/mddocs/docs/file/file_limits/file_limit.md b/mddocs/docs/file/file_limits/file_limit.md new file mode 100644 index 000000000..ecb1fa5e5 --- /dev/null +++ b/mddocs/docs/file/file_limits/file_limit.md @@ -0,0 +1,19 @@ +# File Limit (legacy) { #DBR-onetl-file-limits-file-limit-legacy } + + + +::: onetl.core.file_limit.file_limit.FileLimit + options: + members: + - reset + - stops_at + - is_reached diff --git a/mddocs/docs/file/file_limits/index.md b/mddocs/docs/file/file_limits/index.md new file mode 100644 index 000000000..0cd8efaab --- /dev/null +++ b/mddocs/docs/file/file_limits/index.md @@ -0,0 +1,15 @@ +# File limits { #DBR-onetl-file-limits } + +* [MaxFilesCount][DBR-onetl-file-limits-max-files-count-maxfilescount] +* [TotalFilesSize][DBR-onetl-file-limits-total-files-size-totalfilessize] + +## Legacy { #DBR-onetl-file-limits-legacy } + +* [File Limit [legacy]][DBR-onetl-file-limits-file-limit-legacy] + +## For developers { #DBR-onetl-file-limits-for-developers } + +* [Base interface][DBR-onetl-file-limits-base-interface] +* [limits_stop_at][DBR-onetl-file-limits-limits-stop-at] +* [limits_reached][DBR-onetl-file-limits-limits-reached] +* [reset_limits][DBR-onetl-file-limits-reset-limits] diff --git a/mddocs/docs/file/file_limits/limits_reached.md b/mddocs/docs/file/file_limits/limits_reached.md new file mode 100644 index 000000000..e0785db31 --- /dev/null +++ b/mddocs/docs/file/file_limits/limits_reached.md @@ -0,0 +1,13 @@ +# limits_reached { #DBR-onetl-file-limits-limits-reached } + + + +::: onetl.file.limit.limits_reached diff --git a/mddocs/docs/file/file_limits/limits_stop_at.md b/mddocs/docs/file/file_limits/limits_stop_at.md new file mode 100644 index 000000000..92740b9fd --- /dev/null +++ b/mddocs/docs/file/file_limits/limits_stop_at.md @@ -0,0 +1,13 @@ +# limits_stop_at { #DBR-onetl-file-limits-limits-stop-at } + + + +::: onetl.file.limit.limits_stop_at diff --git a/mddocs/docs/file/file_limits/max_files_count.md 
b/mddocs/docs/file/file_limits/max_files_count.md new file mode 100644 index 000000000..513664da2 --- /dev/null +++ b/mddocs/docs/file/file_limits/max_files_count.md @@ -0,0 +1,19 @@ +# MaxFilesCount { #DBR-onetl-file-limits-max-files-count-maxfilescount } + + + +::: onetl.file.limit.max_files_count.MaxFilesCount + options: + members: + - reset + - stops_at + - is_reached diff --git a/mddocs/docs/file/file_limits/reset_limits.md b/mddocs/docs/file/file_limits/reset_limits.md new file mode 100644 index 000000000..e3e3a495c --- /dev/null +++ b/mddocs/docs/file/file_limits/reset_limits.md @@ -0,0 +1,13 @@ +# reset_limits { #DBR-onetl-file-limits-reset-limits } + + + +::: onetl.file.limit.reset_limits diff --git a/mddocs/docs/file/file_limits/total_files_size.md b/mddocs/docs/file/file_limits/total_files_size.md new file mode 100644 index 000000000..f641dc406 --- /dev/null +++ b/mddocs/docs/file/file_limits/total_files_size.md @@ -0,0 +1,19 @@ +# TotalFilesSize { #DBR-onetl-file-limits-total-files-size-totalfilessize } + + + +::: onetl.file.limit.total_files_size.TotalFilesSize + options: + members: + - reset + - stops_at + - is_reached diff --git a/mddocs/docs/file/file_mover/file_mover.md b/mddocs/docs/file/file_mover/file_mover.md new file mode 100644 index 000000000..2fc1df6c2 --- /dev/null +++ b/mddocs/docs/file/file_mover/file_mover.md @@ -0,0 +1,27 @@ +# File Mover { #DBR-onetl-file-mover-0 } + + + +::: onetl.file.file_mover.file_mover.FileMover + options: + members: + - run + - view_files diff --git a/mddocs/docs/file/file_mover/index.md b/mddocs/docs/file/file_mover/index.md new file mode 100644 index 000000000..75ec37dcc --- /dev/null +++ b/mddocs/docs/file/file_mover/index.md @@ -0,0 +1,5 @@ +# File Mover { #DBR-onetl-file-mover-1 } + +* [File Mover][DBR-onetl-file-mover-0] +* [File Mover Options][DBR-onetl-file-mover-options] +* [File Mover Result][DBR-onetl-file-mover-result] diff --git a/mddocs/docs/file/file_mover/options.md 
b/mddocs/docs/file/file_mover/options.md new file mode 100644 index 000000000..2bcb58320 --- /dev/null +++ b/mddocs/docs/file/file_mover/options.md @@ -0,0 +1,16 @@ +# File Mover Options { #DBR-onetl-file-mover-options } + + + +::: onetl.file.file_mover.options.FileMoverOptions diff --git a/mddocs/docs/file/file_mover/result.md b/mddocs/docs/file/file_mover/result.md new file mode 100644 index 000000000..80beac1e6 --- /dev/null +++ b/mddocs/docs/file/file_mover/result.md @@ -0,0 +1,39 @@ +# File Mover Result { #DBR-onetl-file-mover-result } + + + +::: onetl.file.file_mover.result.MoveResult + options: + members: + - successful + - failed + - skipped + - missing + - successful_count + - failed_count + - skipped_count + - missing_count + - total_count + - successful_size + - failed_size + - skipped_size + - total_size + - raise_if_failed + - raise_if_missing + - raise_if_skipped + - raise_if_empty + - is_empty + - raise_if_contains_zero_size + - details + - summary + - dict + - json diff --git a/mddocs/docs/file/file_uploader/file_uploader.md b/mddocs/docs/file/file_uploader/file_uploader.md new file mode 100644 index 000000000..cfdcbf6a4 --- /dev/null +++ b/mddocs/docs/file/file_uploader/file_uploader.md @@ -0,0 +1,27 @@ +# File Uploader { #DBR-onetl-file-uploader-0 } + + + +::: onetl.file.file_uploader.file_uploader.FileUploader + options: + members: + - run + - view_files diff --git a/mddocs/docs/file/file_uploader/index.md b/mddocs/docs/file/file_uploader/index.md new file mode 100644 index 000000000..cc3f03b5c --- /dev/null +++ b/mddocs/docs/file/file_uploader/index.md @@ -0,0 +1,5 @@ +# File Uploader { #DBR-onetl-file-uploader-1 } + +* [File Uploader][DBR-onetl-file-uploader-0] +* [File Uploader Options][DBR-onetl-file-uploader-options] +* [File Uploader Result][DBR-onetl-file-uploader-result] diff --git a/mddocs/docs/file/file_uploader/options.md b/mddocs/docs/file/file_uploader/options.md new file mode 100644 index 000000000..552d36217 --- /dev/null +++ 
b/mddocs/docs/file/file_uploader/options.md @@ -0,0 +1,20 @@ +# File Uploader Options { #DBR-onetl-file-uploader-options } + + + +::: onetl.file.file_uploader.options.FileUploaderOptions + options: + members: + - source_dir + - target_dir diff --git a/mddocs/docs/file/file_uploader/result.md b/mddocs/docs/file/file_uploader/result.md new file mode 100644 index 000000000..785cd590d --- /dev/null +++ b/mddocs/docs/file/file_uploader/result.md @@ -0,0 +1,39 @@ +# File Uploader Result { #DBR-onetl-file-uploader-result } + + + +::: onetl.file.file_uploader.result.UploadResult + options: + members: + - successful + - failed + - skipped + - missing + - successful_count + - failed_count + - skipped_count + - missing_count + - total_count + - successful_size + - failed_size + - skipped_size + - total_size + - raise_if_failed + - raise_if_missing + - raise_if_skipped + - raise_if_empty + - is_empty + - raise_if_contains_zero_size + - details + - summary + - dict + - json diff --git a/mddocs/docs/file/index.md b/mddocs/docs/file/index.md new file mode 100644 index 000000000..63294d374 --- /dev/null +++ b/mddocs/docs/file/index.md @@ -0,0 +1,7 @@ +# File classes { #DBR-onetl-file-classes } + +* [File Downloader][DBR-onetl-file-downloader-0] +* [File Uploader][DBR-onetl-file-uploader-0] +* [File Mover][DBR-onetl-file-mover-0] +* [File Filters][DBR-onetl-file-filters] +* [File Limits][DBR-onetl-file-limits] diff --git a/mddocs/docs/file_df/file_df_reader/file_df_reader.md b/mddocs/docs/file_df/file_df_reader/file_df_reader.md new file mode 100644 index 000000000..e9f3494fe --- /dev/null +++ b/mddocs/docs/file_df/file_df_reader/file_df_reader.md @@ -0,0 +1,18 @@ +# FileDF Reader { #DBR-onetl-file-df-reader-filedf-reader-0 } + + + +::: onetl.file.file_df_reader.file_df_reader.FileDFReader + options: + members: + - run diff --git a/mddocs/docs/file_df/file_df_reader/index.md b/mddocs/docs/file_df/file_df_reader/index.md new file mode 100644 index 000000000..00d357035 --- /dev/null 
+++ b/mddocs/docs/file_df/file_df_reader/index.md @@ -0,0 +1,4 @@ +# FileDF Reader { #DBR-onetl-file-df-reader-filedf-reader-1 } + +* [FileDF Reader][DBR-onetl-file-df-reader-filedf-reader-0] +* [Options][DBR-onetl-file-df-reader-options] diff --git a/mddocs/docs/file_df/file_df_reader/options.md b/mddocs/docs/file_df/file_df_reader/options.md new file mode 100644 index 000000000..ba85c39b7 --- /dev/null +++ b/mddocs/docs/file_df/file_df_reader/options.md @@ -0,0 +1,15 @@ +# Options { #DBR-onetl-file-df-reader-options } + + + +::: onetl.file.file_df_reader.options.FileDFReaderOptions diff --git a/mddocs/docs/file_df/file_df_writer/file_df_writer.md b/mddocs/docs/file_df/file_df_writer/file_df_writer.md new file mode 100644 index 000000000..13ba45377 --- /dev/null +++ b/mddocs/docs/file_df/file_df_writer/file_df_writer.md @@ -0,0 +1,18 @@ +# FileDF Writer { #DBR-onetl-file-df-writer-filedf-writer-0 } + + + +::: onetl.file.file_df_writer.file_df_writer.FileDFWriter + options: + members: + - run diff --git a/mddocs/docs/file_df/file_df_writer/index.md b/mddocs/docs/file_df/file_df_writer/index.md new file mode 100644 index 000000000..0d53aa9bd --- /dev/null +++ b/mddocs/docs/file_df/file_df_writer/index.md @@ -0,0 +1,4 @@ +# FileDF Writer { #DBR-onetl-file-df-writer-filedf-writer-1 } + +* [FileDF Writer][DBR-onetl-file-df-writer-filedf-writer-0] +* [Options][DBR-onetl-file-df-writer-options] diff --git a/mddocs/docs/file_df/file_df_writer/options.md b/mddocs/docs/file_df/file_df_writer/options.md new file mode 100644 index 000000000..2e2872004 --- /dev/null +++ b/mddocs/docs/file_df/file_df_writer/options.md @@ -0,0 +1,15 @@ +# Options { #DBR-onetl-file-df-writer-options } + + + +::: onetl.file.file_df_writer.options.FileDFWriterOptions diff --git a/mddocs/docs/file_df/file_formats/avro.md b/mddocs/docs/file_df/file_formats/avro.md new file mode 100644 index 000000000..8c2c0fdec --- /dev/null +++ b/mddocs/docs/file_df/file_formats/avro.md @@ -0,0 +1,29 @@ +# Avro { 
#DBR-onetl-file-df-file-formats-avro } + + + +::: onetl.file.format.avro.Avro + options: + members: + - get_packages + - parse_column + - serialize_column + - schema_dict + - schema_url + - recordName + - recordNamespace + - compression + - mode + - datetimeRebaseMode + - positionalFieldMatching + - enableStableIdentifiersForUnionType diff --git a/mddocs/docs/file_df/file_formats/base.md b/mddocs/docs/file_df/file_formats/base.md new file mode 100644 index 000000000..72234efb7 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/base.md @@ -0,0 +1,31 @@ +# Base interface { #DBR-onetl-file-df-file-formats-base-interface } + + + +::: onetl.base.base_file_format.BaseReadableFileFormat + options: + members: + - check_if_supported + - apply_to_reader + +::: onetl.base.base_file_format.BaseWritableFileFormat + options: + members: + - check_if_supported + - apply_to_writer diff --git a/mddocs/docs/file_df/file_formats/csv.md b/mddocs/docs/file_df/file_formats/csv.md new file mode 100644 index 000000000..a4f4beb1c --- /dev/null +++ b/mddocs/docs/file_df/file_formats/csv.md @@ -0,0 +1,48 @@ +# CSV { #DBR-onetl-file-df-file-formats-csv } + + + +::: onetl.file.format.csv.CSV + options: + members: + - __init__ + - parse_column + - serialize_column + - charToEscapeQuoteEscaping + - columnNameOfCorruptRecord + - comment + - compression + - dateFormat + - delimiter + - emptyValue + - enforceSchema + - escapeQuotes + - header + - ignoreLeadingWhiteSpace + - ignoreTrailingWhiteSpace + - inferSchema + - locale + - maxCharsPerColumn + - mode + - multiLine + - nanValue + - negativeInf + - nullValue + - positiveInf + - preferDate + - quote + - quoteAll + - samplingRatio + - timestampFormat + - timestampNTZFormat + - unescapedQuoteHandling diff --git a/mddocs/docs/file_df/file_formats/excel.md b/mddocs/docs/file_df/file_formats/excel.md new file mode 100644 index 000000000..b692c6a20 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/excel.md @@ -0,0 +1,30 @@ +# Excel { 
#DBR-onetl-file-df-file-formats-excel } + + + +::: onetl.file.format.excel.Excel + options: + members: + - get_packages + - header + - dataAddress + - treatEmptyValuesAsNulls + - setErrorCellsToFallbackValues + - usePlainNumberFormat + - inferSchema + - timestampFormat + - maxRowsInMemory + - maxByteArraySize + - tempFileThreshold + - excerptSize + - workbookPassword diff --git a/mddocs/docs/file_df/file_formats/index.md b/mddocs/docs/file_df/file_formats/index.md new file mode 100644 index 000000000..5a82a7f89 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/index.md @@ -0,0 +1,14 @@ +# File formats { #DBR-onetl-file-df-file-formats } + +* [Avro][DBR-onetl-file-df-file-formats-avro] +* [CSV][DBR-onetl-file-df-file-formats-csv] +* [Excel][DBR-onetl-file-df-file-formats-excel] +* [JSON][DBR-onetl-file-df-file-formats-json] +* [JSONLine][DBR-onetl-file-df-file-formats-jsonline] +* [ORC][DBR-onetl-file-df-file-formats-orc] +* [Parquet][DBR-onetl-file-df-file-formats-parquet] +* [XML][DBR-onetl-file-df-file-formats-xml] + +## For developers { #DBR-onetl-file-df-file-formats-for-developers } + +* [Base interface][DBR-onetl-file-df-file-formats-base-interface] diff --git a/mddocs/docs/file_df/file_formats/json.md b/mddocs/docs/file_df/file_formats/json.md new file mode 100644 index 000000000..ff9b36b76 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/json.md @@ -0,0 +1,40 @@ +# JSON { #DBR-onetl-file-df-file-formats-json } + + + +::: onetl.file.format.json.JSON + options: + members: + - __init__ + - parse_column + - serialize_column + - allowBackslashEscapingAnyCharacter + - allowComments + - allowNonNumericNumbers + - allowNumericLeadingZeros + - allowSingleQuotes + - allowUnquotedControlChars + - allowUnquotedFieldNames + - columnNameOfCorruptRecord + - dateFormat + - dropFieldIfAllNull + - encoding + - lineSep + - locale + - mode + - prefersDecimal + - primitivesAsString + - samplingRatio + - timestampFormat + - timestampNTZFormat + - timezone diff --git 
a/mddocs/docs/file_df/file_formats/jsonline.md b/mddocs/docs/file_df/file_formats/jsonline.md new file mode 100644 index 000000000..bd92e4533 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/jsonline.md @@ -0,0 +1,40 @@ +# JSONLine { #DBR-onetl-file-df-file-formats-jsonline } + + + +::: onetl.file.format.jsonline.JSONLine + options: + members: + - __init__ + - allowBackslashEscapingAnyCharacter + - allowComments + - allowNonNumericNumbers + - allowNumericLeadingZeros + - allowSingleQuotes + - allowUnquotedControlChars + - allowUnquotedFieldNames + - columnNameOfCorruptRecord + - compression + - dateFormat + - dropFieldIfAllNull + - encoding + - ignoreNullFields + - lineSep + - locale + - mode + - prefersDecimal + - primitivesAsString + - samplingRatio + - timestampFormat + - timestampNTZFormat + - timezone diff --git a/mddocs/docs/file_df/file_formats/orc.md b/mddocs/docs/file_df/file_formats/orc.md new file mode 100644 index 000000000..e016926ee --- /dev/null +++ b/mddocs/docs/file_df/file_formats/orc.md @@ -0,0 +1,20 @@ +# ORC { #DBR-onetl-file-df-file-formats-orc } + + + +::: onetl.file.format.orc.ORC + options: + members: + - __init__ + - mergeSchema + - compression diff --git a/mddocs/docs/file_df/file_formats/parquet.md b/mddocs/docs/file_df/file_formats/parquet.md new file mode 100644 index 000000000..e7acfdc9e --- /dev/null +++ b/mddocs/docs/file_df/file_formats/parquet.md @@ -0,0 +1,20 @@ +# Parquet { #DBR-onetl-file-df-file-formats-parquet } + + + +::: onetl.file.format.parquet.Parquet + options: + members: + - __init__ + - mergeSchema + - compression diff --git a/mddocs/docs/file_df/file_formats/xml.md b/mddocs/docs/file_df/file_formats/xml.md new file mode 100644 index 000000000..de63d5455 --- /dev/null +++ b/mddocs/docs/file_df/file_formats/xml.md @@ -0,0 +1,39 @@ +# XML { #DBR-onetl-file-df-file-formats-xml } + + + +::: onetl.file.format.xml.XML + options: + members: + - get_packages + - parse_column + - arrayElementName + - attributePrefix + - 
charset + - columnNameOfCorruptRecord + - compression + - dateFormat + - declaration + - excludeAttribute + - ignoreNamespace + - ignoreSurroundingSpaces + - inferSchema + - mode + - nullValue + - rootTag + - row_tag + - rowValidationXSDPath + - samplingRatio + - timestampFormat + - valueTag + - wildcardColName diff --git a/mddocs/docs/file_df/index.md b/mddocs/docs/file_df/index.md new file mode 100644 index 000000000..3db6b9081 --- /dev/null +++ b/mddocs/docs/file_df/index.md @@ -0,0 +1,5 @@ +# File DataFrame classes { #DBR-onetl-file-df-file-dataframe-classes } + +* [FileDF Reader][DBR-onetl-file-df-reader-filedf-reader-0] +* [FileDF Writer][DBR-onetl-file-df-writer-filedf-writer-0] +* [File Formats][DBR-onetl-file-df-file-formats] diff --git a/mddocs/docs/hooks/design.md b/mddocs/docs/hooks/design.md new file mode 100644 index 000000000..c9fb7b8dd --- /dev/null +++ b/mddocs/docs/hooks/design.md @@ -0,0 +1,660 @@ +# High level design { #DBR-onetl-hooks-design-high-level-design } + +## What are hooks? { #DBR-onetl-hooks-design-what-are-hooks } + +Hook mechanism is a part of onETL which allows to inject some additional behavior into existing methods of (almost) any class. + +### Features { #DBR-onetl-hooks-design-features } + +Hooks mechanism allows to: + +- Inspect and validate input arguments and output results of method call +- Access, modify or replace method call result (but NOT input arguments) +- Wrap method calls with a context manager and catch raised exceptions + +Hooks can be placed into [`plugins`][DBR-onetl-plugins], allowing to modify onETL behavior by installing some additional package. + +### Limitations { #DBR-onetl-hooks-design-limitations } + +- Hooks can be bound to methods of a class only (not functions). +- Only methods decorated with [`slot-decorator`][DBR-onetl-hooks-slot-decorator] implement hooks mechanism. These class and methods are marked as `support_hooks`. +- Hooks can be bound to public methods only. 
+ +## Terms { #DBR-onetl-hooks-design-terms } + +- [`slot-decorator`][DBR-onetl-hooks-slot-decorator] - method of a class with a special decorator +- `Callback` - function which implements some additional logic which modifies slot behavior +- [`hook-decorator`][DBR-onetl-hooks-hook-decorator] - wrapper around callback which stores hook state, priority and some useful methods +- `Hooks mechanism` - calling `Slot()` will call all enabled hooks which are bound to the slot. Implemented by [`support-hooks-decorator`][DBR-onetl-hooks-support-hooks-decorator]. + +## How to implement hooks? { #DBR-onetl-hooks-design-how-to-implement-hooks } + +### TL;DR { #DBR-onetl-hooks-design-tldr } + +```python +from onetl.hooks import support_hooks, slot, hook + + +@support_hooks  # enabling hook mechanism for the class +class MyClass: +    def __init__(self, data): +        self.data = data + +    # this is slot +    @slot +    def method(self, arg): +        pass + + +@MyClass.method.bind  # bound hook to the slot +@hook  # this is hook +def callback(obj, arg):  # this is callback +    print(obj.data, arg) + + +obj = MyClass(1) +obj.method(2)  # will call callback(obj, 2) + +# prints "1 2" +``` + +#### Define a slot { #DBR-onetl-hooks-design-define-a-slot } + +- Create a class with a method: + +```python +class MyClass: +    def __init__(self, data): +        self.data = data + +    def method(self, arg): +        return self.data, arg +``` + +- Add [`slot-decorator`][DBR-onetl-hooks-slot-decorator] to the method: + +```python +from onetl.hooks import support_hooks, slot, hook + + +class MyClass: +    @slot +    def method(self, arg): +        return self.data, arg +``` + +If method has other decorators like `@classmethod` or `@staticmethod`, `@slot` should be placed on top: + +```python +from onetl.hooks import support_hooks, slot, hook + + +class MyClass: +    @slot +    @classmethod +    def class_method(cls, arg): +        return cls, arg + +    @slot +    @staticmethod +    def static_method(arg): +        return arg +``` + +- Add
[`support-hooks-decorator`][DBR-onetl-hooks-support-hooks-decorator] to the class: + +```python +from onetl.hooks import support_hooks, slot, hook + + +@support_hooks +class MyClass: + @slot + def method(self, arg): + return self.data, arg +``` + +Slot is created. + +#### Define a callback { #DBR-onetl-hooks-design-define-a-callback } + +Define some function (a.k.a callback): + +```python +def callback(self, arg): + print(self.data, arg) +``` + +It should have signature *compatible* with `MyClass.method`. *Compatible* does not mean *exactly the same* - for example, you can rename positional arguments: + +```python +def callback(obj, arg): + print(obj.data, arg) +``` + +Use `*args` and `**kwargs` to omit arguments you don't care about: + +```python +def callback(obj, *args, **kwargs): + print(obj.data, args, kwargs) +``` + +There is also an argument `method_name` which has a special meaning - the method name which the callback is bound to is passed into this argument: + +```python +def callback(obj, *args, method_name: str, **kwargs): + print(obj.data, args, method_name, kwargs) +``` + +!!! note + + `method_name` should always be a keyword argument, **NOT** positional. + +!!! warning + + If callback signature is not compatible with slot signature, an exception will be raised, but **ONLY** while slot is called. + +#### Define a hook { #DBR-onetl-hooks-design-define-a-hook } + +Add [`hook-decorator`][DBR-onetl-hooks-hook-decorator] to create a hook from your callback: + +```python +@hook +def callback(obj, arg): + print(obj.data, arg) +``` + +You can pass more options to the `@hook` decorator, like state or priority. See decorator documentation for more details. 
+ +#### Bind hook to the slot { #DBR-onetl-hooks-design-bind-hook-to-the-slot } + +Use `Slot.bind` method to bind hook to the slot: + +```python +@MyClass.method.bind +@hook +def callback(obj, arg): +    print(obj, arg) +``` + +You can bind more than one hook to the same slot, and bind same hook to multiple slots: + +```python +@MyClass.method1.bind +@MyClass.method2.bind +@hook +def callback1(obj, arg): +    "Will be called by both MyClass.method1 and MyClass.method2" + + +@MyClass.method1.bind +@hook +def callback2(obj, arg): +    "Will be called by MyClass.method1 too" +``` + +## How hooks are called? { #DBR-onetl-hooks-design-how-hooks-are-called } + +### General { #DBR-onetl-hooks-design-general } + +Just call the method decorated by `@slot` to trigger the hook: + +```python +obj = MyClass(1) +obj.method(2)  # will call callback(obj, 2) + +# prints "1 2" +``` + +There are some special callback types that have slightly different behavior. + +### Context managers { #DBR-onetl-hooks-design-context-managers } + +`@hook` decorator can be placed on a context manager class: + +```python +@hook +class ContextManager: +    def __init__(self, obj, arg): +        self.obj = obj +        self.arg = arg + +    def __enter__(self): +        # do something on enter +        print(self.obj.data, self.arg) +        return self + +    def __exit__(self, exc_type, exc_value, traceback): +        # do something on exit +        return False +``` + +Context manager is entered while calling the `Slot()`, and exited when the call is finished.
+ +If present, method `process_result` has a special meaning - it can receive `MyClass.method` call result, and also modify/replace it: + +```python +@hook +class ContextManager: +    def __init__(self, obj, arg): +        self.obj = obj +        self.arg = arg + +    def __enter__(self): +        # do something on enter +        print(self.obj.data, self.arg) +        return self + +    def __exit__(self, exc_type, exc_value, traceback): +        # do something on exit +        return False + +    def process_result(self, result): +        # do something with method call result +        return modified(result) +``` + +See examples below for more information. + +### Generator function { #DBR-onetl-hooks-design-generator-function } + +```python +@hook +def callback(obj, arg): +    print(obj.data, arg) +    # this is called before original method body + +    yield  # method is called here + +    # this is called after original method body +``` + +It is converted to a context manager, in the same manner as [contextlib.contextmanager](https://docs.python.org/3/library/contextlib.html#contextlib.contextmanager). + +Generator body can be wrapped with `try..except..finally` to catch exceptions: + +```python +@hook +def callback(obj, arg): +    print(obj.data, arg) + +    try: +        # this is called before original method body + +        yield  # method is called here +    except Exception as e: +        process_exception(e) +    finally: +        # this is called after original method body +        finalizer() +``` + +There is also a special syntax which allows the generator to access and modify/replace method call result: + +```python +@hook +def callback(obj, arg): +    original_result = yield  # method is called here + +    new_result = do_something(original_result) + +    yield new_result  # modify/replace the result +``` + +### Calling hooks in details { #DBR-onetl-hooks-design-calling-hooks-in-details } + +- The callback will be called with the same arguments as the original method.
+ +  - If slot is a regular method: + +    ```python +    callback_result = callback(self, *args, **kwargs) +    ``` + +    Here `self` is a class instance (`obj`). + +  - If slot is a class method: + +    ```python +    callback_result = callback(cls, *args, **kwargs) +    ``` + +    Here `cls` is the class itself (`MyClass`). + +  - If slot is a static method: + +    ```python +    callback_result = callback(*args, **kwargs) +    ``` + +    Neither the object nor the class is passed to the callback in this case. + +- If `callback_result` is a context manager, enter the context. Context manager can catch all the exceptions raised. + +  > If there are multiple hooks bound to the slot, every context manager will be entered. + +- Then call the original method wrapped by `@slot`: + +  ```python +  original_result = method(*args, **kwargs) +  ``` + +- Process `original_result`: + +  - If `callback_result` object has method `process_result`, or is a generator wrapped with `@hook`, call it: + +    ```python +    new_result = callback_result.process_result(original_result) +    ``` + +  - Otherwise set `new_result = callback_result`. + +  - If there are multiple hooks bound to the method, pass `new_result` through the chain: + +    ```python +    new_result = callback1_result.process_result(original_result) +    new_result = callback2_result.process_result(new_result or original_result) +    new_result = callback3_result.process_result(new_result or original_result) +    ``` + +- Finally return: + +  ```python +  return new_result or original_result +  ``` + +  All `None` values are ignored on every step above. + +- Exit all the context managers entered during the slot call. + +### Hooks priority { #DBR-onetl-hooks-design-hooks-priority } + +Hooks are executed in the following order: + +1. Parent class slot + [`FIRST`][onetl.hooks.hook.HookPriority.FIRST] +2. Inherited class slot + [`FIRST`][onetl.hooks.hook.HookPriority.FIRST] +3. Parent class slot + [`NORMAL`][onetl.hooks.hook.HookPriority.NORMAL] +4.
Inherited class slot + [`NORMAL`][onetl.hooks.hook.HookPriority.NORMAL] +5. Parent class slot + [`LAST`][onetl.hooks.hook.HookPriority.LAST] +6. Inherited class slot + [`LAST`][onetl.hooks.hook.HookPriority.LAST] + +Hooks with the same priority and inheritance will be executed in the same order they were registered (`Slot.bind` call). + +!!! note + +    Calls of `super()` inside inherited class methods do not trigger hook calls. Hooks are triggered only if the method is called explicitly. + +    This allows wrapping the entire slot call with a hook without influencing its internal logic. + +### Hook types { #DBR-onetl-hooks-design-hook-types } + +Here are several examples of using hooks. These types are not mutually exclusive - they can be mixed: for example, a hook can both modify method result and catch exceptions. + +#### Before hook { #DBR-onetl-hooks-design-before-hook } + +Can be used for inspecting or validating input args of the original function: + +```python +@hook +def before1(obj, arg): +    print(obj, arg) +    # original method is called after exiting this function + + +@hook +def before2(obj, arg): +    if arg == 1: +        raise ValueError("arg=1 is not allowed") +    return None  # return None is the same as no return statement +``` + +Executed before calling the original method wrapped by `@slot`. If hook raises an exception, method will not be called at all. + +#### After hook { #DBR-onetl-hooks-design-after-hook } + +Can be used for performing some actions after original method was successfully executed: + +```python +@hook +def after1(obj, arg): +    yield  # original method is called here +    print(obj, arg) + + +@hook +def after2(obj, arg): +    yield None  # yielding None is the same as empty yield +    if arg == 1: +        raise ValueError("arg=1 is not allowed") +``` + +If original method raises an exception, the block of code after `yield` will not be called.
+ +#### Context hook { #DBR-onetl-hooks-design-context-hook } + +Can be used for catching and handling some exceptions, or to determine that there was no exception during slot call: + +=== "Generator syntax" + +    ```python +    # This is just the same as using @contextlib.contextmanager + +    @hook +    def context_generator(obj, arg): +        try: +            yield  # original method is called here +            print(obj, arg)  # <-- this line will not be called if method raised an exception +        except SomeException as e: +            magic(e) +        finally: +            finalizer() +    ``` +=== "Context manager syntax" + +    ```python +    @hook +    class ContextManager: +        def __init__(self, obj, arg): +            self.obj = obj +            self.arg = arg + +        def __enter__(self): +            return self + +        # original method is called between __enter__ and __exit__ + +        def __exit__(self, exc_type, exc_value, traceback): +            result = False +            if exc_type is not None and isinstance(exc_value, SomeException): +                magic(exc_value) +                result = True  # suppress exception +            else: +                print(self.obj, self.arg) +            finalizer() +            return result +    ``` + +!!! note + +    Contexts are exited in the reverse order of the hook calls. So if some hook raised an exception, it will be passed into the previous hook, not the next one. + +    It is recommended to specify the proper priority for the hook, e.g. [`FIRST`][onetl.hooks.hook.HookPriority.FIRST] + +#### Replacing result hook { #DBR-onetl-hooks-design-replacing-result-hook } + +Replaces the output result of the original method. + +Can be used for delegating some implementation details to third-party extensions. +See [`hive`][DBR-onetl-connection-db-connection-hive] and [`hdfs`][DBR-onetl-connection-file-connection-hdfs] as an example. + +```python +@hook +def replace1(obj, arg): +    result = arg + 10  # any non-None return result + +    # original method call result is ignored, output will always be arg + 10 +    return result + + +@hook +def replace2(obj, arg): +    yield arg + 10  # same as above +``` + +!!!
note + + If there are multiple hooks bound to the same slot, the result of last hook will be used. + It is recommended to specify the proper priority for the hook, e.g. [`LAST`][onetl.hooks.hook.HookPriority.LAST] + +#### Accessing result hook { #DBR-onetl-hooks-design-accessing-result-hook } + +Can access output result of the original method and inspect or validate it: + +=== "Generator syntax" + + ```python + @hook + def access_result(obj, arg): + result = yield # original method is called here, and result can be used in the hook + print(result) + yield # does not modify result + ``` + +=== "Context manager syntax" + + ```python + @hook + class ModifiesResult: + def __init__(self, obj, args): + self.obj = obj + self.args = args + + def __enter__(self): + return self + + # original method is called between __enter__ and __exit__ + # result is passed into process_result method of context manager, if present + + def process_result(self, result): + print(result) # result can be used in the hook + return None # does not modify result. same as no return statement in the method + + def __exit__(self, exc_type, exc_value, traceback): + return False + + ``` + +#### Modifying result hook { #DBR-onetl-hooks-design-modifying-result-hook } + +Can access output result of the original method, and return the modified one: + +=== "Generator syntax" + + ```python + @hook + def modifies_result(obj, arg): + result = yield # original method is called here, and result can be used in the hook + yield result + 10 # modify output result. 
None values are ignored + ``` + +=== "Context manager syntax" + + ```python + @hook + class ModifiesResult: + def __init__(self, obj, args): + self.obj = obj + self.args = args + + def __enter__(self): + return self + + # original method is called between __enter__ and __exit__ + # result is passed into process_result method of context manager, if present + + def process_result(self, result): + print(result) # result can be used in the hook + return result + 10 # modify output result. None values are ignored + + def __exit__(self, exc_type, exc_value, traceback): + return False + ``` + +!!! note + + If there are multiple hooks bound to the same slot, the result of last hook will be used. + It is recommended to specify the proper priority for the hook, e.g. [`LAST`][onetl.hooks.hook.HookPriority.LAST] + +## How to enable/disable hooks? { #DBR-onetl-hooks-design-how-to-enabledisable-hooks } + +You can enable/disable/temporary disable hooks on 4 different levels: + +- Manage global hooks state (level 1): + + - [`onetl.hooks.hooks_state.stop_all_hooks`][onetl.hooks.hooks_state.stop_all_hooks] + - [`onetl.hooks.hooks_state.resume_all_hooks`][onetl.hooks.hooks_state.resume_all_hooks] + - [`onetl.hooks.hooks_state.skip_all_hooks`][onetl.hooks.hooks_state.skip_all_hooks] + +- Manage all hooks bound to a specific class (level 2): + + - [`onetl.hooks.support_hooks.suspend_hooks`][onetl.hooks.support_hooks.suspend_hooks] + - [`onetl.hooks.support_hooks.resume_hooks`][onetl.hooks.support_hooks.resume_hooks] + - [`onetl.hooks.support_hooks.skip_hooks`][onetl.hooks.support_hooks.skip_hooks] + +- Manage all hooks bound to a specific slot (level 3): + + - [`onetl.hooks.slot.Slot.suspend_hooks`][onetl.hooks.slot.Slot.suspend_hooks] + - [`onetl.hooks.slot.Slot.resume_hooks`][onetl.hooks.slot.Slot.resume_hooks] + - [`onetl.hooks.slot.Slot.skip_hooks`][onetl.hooks.slot.Slot.skip_hooks] + +- Manage state of a specific hook (level 4): + + - 
[`onetl.hooks.hook.Hook.enable`][onetl.hooks.hook.Hook.enable] + - [`onetl.hooks.hook.Hook.disable`][onetl.hooks.hook.Hook.disable] + +More details in the documentation above. + +!!! note + + All of these levels are independent. + + Calling `stop` on the level 1 has higher priority than level 2, and so on. But calling `resume` on the level 1 does not automatically resume hooks stopped in the level 2, they should be resumed explicitly. + +## How to see logs of the hook mechanism? { #DBR-onetl-hooks-design-how-to-see-logs-of-the-hook-mechanism } + +Hooks registration emits logs with `DEBUG` level: + +```python +from onetl.logs import setup_logging + +setup_logging() +``` + +```text +DEBUG |onETL| Registered hook 'mymodule.callback1' for 'MyClass.method' (enabled=True, priority=HookPriority.NORMAL) +DEBUG |onETL| Registered hook 'mymodule.callback2' for 'MyClass.method' (enabled=True, priority=HookPriority.NORMAL) +DEBUG |onETL| Registered hook 'mymodule.callback3' for 'MyClass.method' (enabled=False, priority=HookPriority.NORMAL) +``` + +But most of logs are emitted with even lower level `NOTICE`, to make output less verbose: + +```python +from onetl.logs import NOTICE, setup_logging + +setup_logging(level=NOTICE) +``` + +```text +NOTICE |Hooks| 2 hooks registered for 'MyClass.method' +NOTICE |Hooks| Calling hook 'mymodule.callback1' (1/2) +NOTICE |Hooks| Hook is finished with returning non-None result +NOTICE |Hooks| Calling hook 'mymodule.callback2' (2/2) +NOTICE |Hooks| This is a context manager, entering ... +NOTICE |Hooks| Calling original method 'MyClass.method' +NOTICE |Hooks| Method call is finished +NOTICE |Hooks| Method call result (*NOT* None) will be replaced with result of hook 'mymodule.callback1' +NOTICE |Hooks| Passing result to 'process_result' method of context manager 'mymodule.callback2' +NOTICE |Hooks| Method call result (*NOT* None) is modified by hook! 
+``` diff --git a/mddocs/docs/hooks/global_state.md b/mddocs/docs/hooks/global_state.md new file mode 100644 index 000000000..f32058ef8 --- /dev/null +++ b/mddocs/docs/hooks/global_state.md @@ -0,0 +1,49 @@ +# Hooks global state { #DBR-onetl-hooks-global-state } + + + +::: onetl.hooks.hooks_state + options: + members: + - skip_all_hooks + - stop_all_hooks + - resume_all_hooks + +::: onetl.hooks.hooks_state.HooksState.skip + options: + show_source: false + show_root_heading: true + +::: onetl.hooks.hooks_state.HooksState.stop + options: + show_source: false + show_root_heading: true + +::: onetl.hooks.hooks_state.HooksState.resume + options: + show_source: false + show_root_heading: true diff --git a/mddocs/docs/hooks/hook.md b/mddocs/docs/hooks/hook.md new file mode 100644 index 000000000..9d9b30bb9 --- /dev/null +++ b/mddocs/docs/hooks/hook.md @@ -0,0 +1,50 @@ +# `@hook` decorator { #DBR-onetl-hooks-hook-decorator } + + + +::: onetl.hooks.hook.hook + +::: onetl.hooks.hook.HookPriority + options: + members: + - FIRST + - NORMAL + - LAST + +::: onetl.hooks.hook.Hook + options: + members: + - enable + - disable + - skip diff --git a/mddocs/docs/hooks/index.md b/mddocs/docs/hooks/index.md new file mode 100644 index 000000000..796d5590f --- /dev/null +++ b/mddocs/docs/hooks/index.md @@ -0,0 +1,9 @@ +# Hooks { #DBR-onetl-hooks } + +:octicons-versions-16: **version added 0.6.0** + +* [High level design][DBR-onetl-hooks-design-high-level-design] +* [@hook decorator][DBR-onetl-hooks-hook-decorator] +* [@slot decorator][DBR-onetl-hooks-slot-decorator] +* [@support_hooks decorator][DBR-onetl-hooks-support-hooks-decorator] +* [Hooks global state][DBR-onetl-hooks-global-state] diff --git a/mddocs/docs/hooks/slot.md b/mddocs/docs/hooks/slot.md new file mode 100644 index 000000000..e404abf56 --- /dev/null +++ b/mddocs/docs/hooks/slot.md @@ -0,0 +1,30 @@ +# `@slot` decorator { #DBR-onetl-hooks-slot-decorator } + + + +::: onetl.hooks.slot.slot + +::: onetl.hooks.slot.Slot diff 
--git a/mddocs/docs/hooks/support_hooks.md b/mddocs/docs/hooks/support_hooks.md new file mode 100644 index 000000000..2aeb35275 --- /dev/null +++ b/mddocs/docs/hooks/support_hooks.md @@ -0,0 +1,48 @@ +# `@support_hooks` decorator { #DBR-onetl-hooks-support-hooks-decorator } + + + + +::: onetl.hooks.support_hooks + options: + members: + - skip_hooks + - suspend_hooks + - resume_hooks + - support_hooks diff --git a/mddocs/docs/hwm_store/index.md b/mddocs/docs/hwm_store/index.md new file mode 100644 index 000000000..3a60d60ae --- /dev/null +++ b/mddocs/docs/hwm_store/index.md @@ -0,0 +1,9 @@ +# HWM { #DBR-onetl-hwm-store-hwm } + +Since onETL v0.10.0, the `HWMStore` and `HWM` classes have been moved to a separate library {{ etl_entities_link }}. + +The only class was left intact is [`yaml-hwm-store`][DBR-onetl-hwm-store-yaml-hwm-store], **which is default** in onETL. + +Other known implementation is [HorizonHWMStore](https://horizon-hwm-store.readthedocs.io/). + +* [YAML HWM Store][DBR-onetl-hwm-store-yaml-hwm-store] diff --git a/mddocs/docs/hwm_store/yaml_hwm_store.md b/mddocs/docs/hwm_store/yaml_hwm_store.md new file mode 100644 index 000000000..04fe9d938 --- /dev/null +++ b/mddocs/docs/hwm_store/yaml_hwm_store.md @@ -0,0 +1,19 @@ +# YAML HWM Store { #DBR-onetl-hwm-store-yaml-hwm-store } + + + +::: onetl.hwm.store.yaml_hwm_store.YAMLHWMStore + options: + members: + - get_hwm + - set_hwm + - __enter__ diff --git a/mddocs/docs/index.md b/mddocs/docs/index.md new file mode 100644 index 000000000..67f0f807d --- /dev/null +++ b/mddocs/docs/index.md @@ -0,0 +1,18 @@ +# onETL { #DBR-onetl--onetl } + +{{ repo_status_badge }} +{{ pypi_release_bage }} +{{ pypi_license_bage }} +{{ pypi_pyversion_bage }} +{{ pypi_downloads_bage }} + +{{ docs_status_badge }} +{{ ci_status_badge }} +{{ precommit_badge }} + + +{{ onetl_logo_wide }} + +----8<---- +docs/snippet_0.md +----8<---- diff --git a/mddocs/docs/install/files.md b/mddocs/docs/install/files.md new file mode 100644 index 
000000000..023e7c8cb --- /dev/null +++ b/mddocs/docs/install/files.md @@ -0,0 +1,18 @@ +# File connections { #DBR-onetl-install-files-file-connections } + +All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on) requires specific Python clients to be installed. + +Each client can be installed explicitly by passing connector name (in lowercase) to `extras`: + +```bash +pip install onetl[ftp] # specific connector +pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors +``` + +To install all file connectors at once you can pass `files` to `extras`: + +```bash +pip install onetl[files] +``` + +**Otherwise class import will fail.** diff --git a/mddocs/docs/install/full.md b/mddocs/docs/install/full.md new file mode 100644 index 000000000..ddace90f8 --- /dev/null +++ b/mddocs/docs/install/full.md @@ -0,0 +1,14 @@ +# Full bundle { #DBR-onetl-install-full-bundle } + +To install all connectors and dependencies, you can pass `all` into `extras`: + +```bash +pip install onetl[all] + +# this is just the same as +pip install onetl[spark,files,kerberos] +``` + +!!! warning + + This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. diff --git a/mddocs/docs/install/index.md b/mddocs/docs/install/index.md new file mode 100644 index 000000000..a2c48b059 --- /dev/null +++ b/mddocs/docs/install/index.md @@ -0,0 +1,34 @@ +# How to install { #DBR-onetl-install-how-to-install-0 } + +Base `onetl` package contains: + +* `DBReader`, `DBWriter` and related classes +* `FileDownloader`, `FileUploader`, `FileMover` and related classes, like file filters & limits +* `FileDFReader`, `FileDFWriter` and related classes, like file formats +* Read Strategies & HWM classes +* Plugins support + +It can be installed via: + +```bash +pip install onetl +``` + +!!! warning + + This method does NOT include any connections. 
+ + This method is recommended for use in third-party libraries which require for `onetl` to be installed, + but do not use its connection classes. + + +## Installation in details { #DBR-onetl-install-installation-in-details } + +### How to install { #DBR-onetl-install-how-to-install-1 } + +* [How to install][DBR-onetl-install-how-to-install-0] +* [Minimal installation][DBR-onetl-install-minimal-installation] +* [Spark][DBR-onetl-install-spark] +* [File connections][DBR-onetl-install-files-file-connections] +* [Kerberos support][DBR-onetl-install-kerberos-support] +* [Full bundle][DBR-onetl-install-full-bundle] diff --git a/mddocs/docs/install/kerberos.md b/mddocs/docs/install/kerberos.md new file mode 100644 index 000000000..57095ca73 --- /dev/null +++ b/mddocs/docs/install/kerberos.md @@ -0,0 +1,30 @@ +# Kerberos support { #DBR-onetl-install-kerberos-support } + +Most of Hadoop instances set up with Kerberos support, +so some connections require additional setup to work properly. + +* `HDFS` + Uses [requests-kerberos](https://pypi.org/project/requests-kerberos/) and + [GSSApi](https://pypi.org/project/gssapi/) for authentication. + It also uses `kinit` executable to generate Kerberos ticket. +* `Hive` and `SparkHDFS` + require Kerberos ticket to exist before creating Spark session. 
+ +So you need to install OS packages with: + +* `krb5` libs +* Headers for `krb5` +* `gcc` or other compiler for C sources + +The exact installation instruction depends on your OS, here are some examples: + +```bash +apt install libkrb5-dev krb5-user gcc # Debian-based +dnf install krb5-devel krb5-libs krb5-workstation gcc # CentOS, OracleLinux +``` + +Also you should pass `kerberos` to `extras` to install required Python packages: + +```bash +pip install onetl[kerberos] +``` diff --git a/mddocs/docs/install/minimal.md b/mddocs/docs/install/minimal.md new file mode 100644 index 000000000..ac73a0edc --- /dev/null +++ b/mddocs/docs/install/minimal.md @@ -0,0 +1,22 @@ +# Minimal installation { #DBR-onetl-install-minimal-installation } + +Base `onetl` package contains: + +* `DBReader`, `DBWriter` and related classes +* `FileDownloader`, `FileUploader`, `FileMover` and related classes, like file filters & limits +* `FileDFReader`, `FileDFWriter` and related classes, like file formats +* Read Strategies & HWM classes +* Plugins support + +It can be installed via: + +```bash +pip install onetl +``` + +!!! warning + + This method does NOT include any connections. + + This method is recommended for use in third-party libraries which require for `onetl` to be installed, + but do not use its connection classes. diff --git a/mddocs/docs/install/spark.md b/mddocs/docs/install/spark.md new file mode 100644 index 000000000..a6c9fe785 --- /dev/null +++ b/mddocs/docs/install/spark.md @@ -0,0 +1,358 @@ +# Spark { #DBR-onetl-install-spark } + + + +All DB connection classes (`Clickhouse`, `Greenplum`, `Hive` and others) and all FileDF connection classes (`SparkHDFS`, `SparkLocalFS`, `SparkS3`) require Spark to be installed. + +## Installing Java { #DBR-onetl-install-spark-installing-java } + + + +Firstly, you should install JDK. 
The exact installation instruction depends on your OS, here are some examples: + +```bash +yum install java-1.8.0-openjdk-devel # CentOS 7 + Spark 2 +dnf install java-11-openjdk-devel # CentOS 8 + Spark 3 +apt-get install openjdk-11-jdk # Debian-based + Spark 3 +``` + +### Compatibility matrix { #DBR-onetl-install-spark-compatibility-matrix } + +| Spark | Python | Java | Scala | +|-----------------------------------------------------------|------------|------------|---------| +| [2.3.x](https://spark.apache.org/docs/2.3.1/#downloading) | 3.7 only | 8 only | 2.11 | +| [2.4.x](https://spark.apache.org/docs/2.4.8/#downloading) | 3.7 only | 8 only | 2.11 | +| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | +| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | + +## Installing PySpark { #DBR-onetl-install-spark-installing-pyspark } + + + +Then you should install PySpark via passing `spark` to `extras`: + +```bash +pip install onetl[spark] # install latest PySpark +``` + +or install PySpark explicitly: + +```bash +pip install onetl pyspark==3.5.5 # install a specific PySpark version +``` + +or inject PySpark to `sys.path` in some other way BEFORE creating a class instance. +**Otherwise connection object cannot be created.** + +## Injecting Java packages { #DBR-onetl-install-spark-injecting-java-packages } + +Some DB and FileDF connection classes require specific packages to be inserted to `CLASSPATH` of Spark session, +like JDBC drivers. 
+ +This is usually done by setting up `spark.jars.packages` option while creating Spark session: + +```python +# here is a list of packages to be downloaded: +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Postgres.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +Spark automatically resolves package and all its dependencies, download them and inject to Spark session +(both driver and all executors). + +This requires internet access, because package metadata and `.jar` files are fetched from [Maven Repository](https://mvnrepository.com/). + +But sometimes it is required to: + +- Install package without direct internet access (isolated network) +- Install package which is not available in Maven + +There are several ways to do that. + +### Using `spark.jars` { #DBR-onetl-install-spark-using-spark-jars } + +The most simple solution, but this requires to store raw `.jar` files somewhere on filesystem or web server. + +- Download `package.jar` files (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- (For `spark.submit.deployMode=cluster`) place downloaded files to HDFS or deploy to any HTTP web server serving static files. See [official documentation](https://spark.apache.org/docs/latest/submitting-applications.html#advanced-dependency-management) for more details. 
+- Create Spark session with passing `.jar` absolute file path to `spark.jars` Spark config option: + +=== "spark.submit.deployMode=client (default)" + + ```python + jar_files = ["/path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + ``` + +=== "spark.submit.deployMode=cluster" + + ```python + # you can also pass URLs like http://domain.com/path/to/downloadable/package.jar + jar_files = ["hdfs:///path/to/package.jar"] + + # do not pass spark.jars.packages + spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars", ",".join(jar_files)) + .getOrCreate() + ) + ``` + +### Using `spark.jars.repositories` { #DBR-onetl-install-spark-using-spark-jars-repositories } + +!!! note + + In this case Spark still will try to fetch packages from the internet, so if you don't have internet access, Spark session will be created with significant delay because of all attempts to fetch packages. + +Can be used if you have access both to public repos (like Maven) and a private Artifactory/Nexus repo. + +- Setup private Maven repository in [JFrog Artifactory](https://jfrog.com/artifactory/) or [Sonatype Nexus](https://www.sonatype.com/products/sonatype-nexus-repository). +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Upload `package.jar` file to private repository (with same `groupId` and `artifactoryId` as in source package in Maven). +- Pass repo URL to `spark.jars.repositories` Spark config option. 
+- Create Spark session with passing Package name to `spark.jars.packages` Spark config option: + +```python +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Postgres.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.repositories", "http://nexus.mydomain.com/private-repo/") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Using `spark.jars.ivySettings` { #DBR-onetl-install-spark-using-spark-jars-ivysettings } + +Same as above, but can be used even if there is no network access to public repos like Maven. + +- Setup private Maven repository in [JFrog Artifactory](https://jfrog.com/artifactory/) or [Sonatype Nexus](https://www.sonatype.com/products/sonatype-nexus-repository). +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Upload `package.jar` file to [private repository](https://help.sonatype.com/repomanager3/nexus-repository-administration/repository-management#RepositoryManagement-HostedRepository) (with same `groupId` and `artifactoryId` as in source package in Maven). +- Create `ivysettings.xml` file (see below). +- Add here a resolver with repository URL (and credentials, if required). +- Pass `ivysettings.xml` absolute path to `spark.jars.ivySettings` Spark config option. 
+- Create Spark session with passing package name to `spark.jars.packages` Spark config option: + +=== "ivysettings-all-packages-uploaded-to-nexus.xml" + + ```xml + + + + + + + + + + + + + + ``` + +=== "ivysettings-private-packages-in-nexus-public-in-maven.xml" + + ```xml + + + + + + + + + + + + + + + + + + ``` + +=== "ivysettings-private-packages-in-nexus-public-fetched-using-proxy-repo.xml" + + ```xml + + + + + + + + + + + + + + + + ``` + +=== "ivysettings-nexus-with-auth-required.xml" + + ```xml + + + + + + + + + + + + + + + + + + + + ``` + +```python title="script.py" +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Postgres.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.ivySettings", "/path/to/ivysettings.xml") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Place `.jar` file to `~/.ivy2/jars/` { #DBR-onetl-install-spark-place-jar-file-to-ivy2jars } + +Can be used to pass already downloaded file to Ivy, and skip resolving package from Maven. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- Move it to `~/.ivy2/jars/` folder. +- Create Spark session with passing package name to `spark.jars.packages` Spark config option: + +```python +maven_packages = ( + Greenplum.get_packages(spark_version="3.2") + + MySQL.get_packages() + + Postgres.get_packages() +) + +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.jars.packages", ",".join(maven_packages)) + .getOrCreate() +) +``` + +### Place `.jar` file to Spark jars folder { #DBR-onetl-install-spark-place-jar-file-to-spark-jars-folder } + +!!! note + + Package file should be placed on all hosts/containers Spark is running, both driver and all executors. 
+ + Usually this is used only with either: + * `spark.master=local` (driver and executors are running on the same host), + * `spark.master=k8s://...` (`.jar` files are added to image or to volume mounted to all pods). + +Can be used to embed `.jar` files to a default Spark classpath. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter, but it should be unique. +- Move it to `$SPARK_HOME/jars/` folder, e.g. `~/.local/lib/python3.7/site-packages/pyspark/jars/` or `/opt/spark/3.2.3/jars/`. +- Create Spark session **WITHOUT** passing Package name to `spark.jars.packages` + +```python +# no need to set spark.jars.packages or any other spark.jars.* option +# all jars already present in CLASSPATH, and loaded automatically + +spark = SparkSession.builder.config("spark.app.name", "onetl").getOrCreate() +``` + +### Manually adding `.jar` files to `CLASSPATH` { #DBR-onetl-install-spark-manually-adding-jar-files-to-classpath } + +!!! note + + Package file should be placed on all hosts/containers Spark is running, both driver and all executors. + + Usually this is used only with either: + * `spark.master=local` (driver and executors are running on the same host), + * `spark.master=k8s://...` (`.jar` files are added to image or to volume mounted to all pods). + +Can be used to embed `.jar` files to a default Java classpath. + +- Download `package.jar` file (it's usually something like `some-package_1.0.0.jar`). Local file name does not matter. +- Set environment variable `CLASSPATH` to `/path/to/package.jar`. 
You can set multiple file paths +- Create Spark session **WITHOUT** passing Package name to `spark.jars.packages` + +```python +# no need to set spark.jars.packages or any other spark.jars.* option +# all jars already present in CLASSPATH, and loaded automatically + +import os + +jar_files = ["/path/to/package.jar"] +# different delimiters for Windows and Linux +delimiter = ";" if os.name == "nt" else ":" +spark = ( + SparkSession.builder.config("spark.app.name", "onetl") + .config("spark.driver.extraClassPath", delimiter.join(jar_files)) + .config("spark.executor.extraClassPath", delimiter.join(jar_files)) + .getOrCreate() +) +``` diff --git a/mddocs/docs/logging.md b/mddocs/docs/logging.md new file mode 100644 index 000000000..a33d41034 --- /dev/null +++ b/mddocs/docs/logging.md @@ -0,0 +1,156 @@ +# Logging { #DBR-onetl-logging } + +Logging is quite important to understand what's going on under the hood of onETL. + +Default logging level for Python interpreters is `WARNING`, but most of onETL logs are in `INFO` level, so users usually don't see much. + +To change logging level, there is a function [setup_logging][onetl.log.setup_logging] which should be called at the top of the script: + +```python +from onetl.log import setup_logging +from other.lib import some, more, imports + +setup_logging() + +# rest of code +... +``` + +This changes both log level and log formatting to something like this: + +??? "See logs" + + ```text + 2024-04-12 10:12:10,834 [INFO ] MainThread: |onETL| Using IncrementalStrategy as a strategy + 2024-04-12 10:12:10,835 [INFO ] MainThread: =================================== DBReader.run() starts =================================== + 2024-04-12 10:12:10,835 [INFO ] MainThread: |DBReader| Getting Spark type for HWM expression: 'updated_at' + 2024-04-12 10:12:10,836 [INFO ] MainThread: |MSSQL| Fetching schema of table 'source_schema.table' ... + 2024-04-12 10:12:11,636 [INFO ] MainThread: |MSSQL| Schema fetched. 
+ 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Got Spark field: StructField('updated_at', TimestampType(), True) + 2024-04-12 10:12:11,642 [INFO ] MainThread: |DBReader| Detected HWM type: 'ColumnDateTimeHWM' + 2024-04-12 10:12:11,643 [INFO ] MainThread: |IncrementalStrategy| Fetching HWM from HorizonHWMStore: + 2024-04-12 10:12:11,643 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb' + 2024-04-12 10:12:12,181 [INFO ] MainThread: |IncrementalStrategy| Fetched HWM: + 2024-04-12 10:12:12,182 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 10:12:12,182 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,182 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,182 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,184 [INFO ] MainThread: value = datetime.datetime(2024, 4, 11, 18, 10, 2, 120000), + 2024-04-12 10:12:12,184 [INFO ] MainThread: ) + 2024-04-12 10:12:12,184 [INFO ] MainThread: |MSSQL| -> |Spark| Reading DataFrame from source using parameters: + 2024-04-12 10:12:12,185 [INFO ] MainThread: source = 'source_schema.table' + 2024-04-12 10:12:12,185 [INFO ] MainThread: columns = [ + 2024-04-12 10:12:12,185 [INFO ] MainThread: 'id', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'new_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'old_value', + 2024-04-12 10:12:12,186 [INFO ] MainThread: 'updated_at', + 2024-04-12 10:12:12,186 [INFO ] MainThread: ] + 2024-04-12 10:12:12,187 [INFO ] MainThread: where = "field = 'some'" + 2024-04-12 10:12:12,187 [INFO ] MainThread: hwm = AutoDetectHWM( + 2024-04-12 10:12:12,187 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 10:12:12,187 [INFO ] MainThread: entity = 'source_schema.table', + 2024-04-12 10:12:12,187 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 10:12:12,188 [INFO ] MainThread: ) + 
2024-04-12 10:12:12,188 [INFO ] MainThread: options = { + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'fetchsize': 100000, + 2024-04-12 10:12:12,188 [INFO ] MainThread: 'numPartitions': 1, + 2024-04-12 10:12:12,189 [INFO ] MainThread: 'partitioningMode': 'range', + 2024-04-12 10:12:12,189 [INFO ] MainThread: } + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Checking connection availability... + 2024-04-12 10:12:12,189 [INFO ] MainThread: |MSSQL| Using connection parameters: + 2024-04-12 10:12:12,190 [INFO ] MainThread: user = 'db_user' + 2024-04-12 10:12:12,190 [INFO ] MainThread: password = SecretStr('**********') + 2024-04-12 10:12:12,190 [INFO ] MainThread: host = 'mssql.host' + 2024-04-12 10:12:12,190 [INFO ] MainThread: port = 1433 + 2024-04-12 10:12:12,191 [INFO ] MainThread: database = 'somedb' + 2024-04-12 10:12:12,191 [INFO ] MainThread: extra = {'applicationIntent': 'ReadOnly', 'trustServerCertificate': 'true'} + 2024-04-12 10:12:12,191 [INFO ] MainThread: jdbc_url = 'jdbc:sqlserver:/mssql.host:1433' + 2024-04-12 10:12:12,579 [INFO ] MainThread: |MSSQL| Connection is available. 
+ 2024-04-12 10:12:12,581 [INFO ] MainThread: |MSSQL| Executing SQL query (on driver): + 2024-04-12 10:12:12,581 [INFO ] MainThread: SELECT + 2024-04-12 10:12:12,581 [INFO ] MainThread: MIN(updated_at) AS "min", + 2024-04-12 10:12:12,582 [INFO ] MainThread: MAX(updated_at) AS "max" + 2024-04-12 10:12:12,582 [INFO ] MainThread: FROM + 2024-04-12 10:12:12,582 [INFO ] MainThread: source_schema.table + 2024-04-12 10:12:12,582 [INFO ] MainThread: WHERE + 2024-04-12 10:12:12,582 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:12:12,583 [INFO ] MainThread: AND + 2024-04-12 10:12:12,583 [INFO ] MainThread: (updated_at >= CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,537 [INFO ] MainThread: |MSSQL| Received values: + 2024-04-12 10:16:22,538 [INFO ] MainThread: MIN(updated_at) = datetime.datetime(2024, 4, 11, 21, 10, 7, 397000) + 2024-04-12 10:16:22,538 [INFO ] MainThread: MAX(updated_at) = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000) + 2024-04-12 10:16:22,540 [INFO ] MainThread: |MSSQL| Executing SQL query (on executor): + 2024-04-12 10:16:22,540 [INFO ] MainThread: SELECT + 2024-04-12 10:16:22,540 [INFO ] MainThread: id, + 2024-04-12 10:16:22,541 [INFO ] MainThread: new_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: old_value, + 2024-04-12 10:16:22,541 [INFO ] MainThread: updated_at + 2024-04-12 10:16:22,541 [INFO ] MainThread: FROM + 2024-04-12 10:16:22,541 [INFO ] MainThread: source_schema.table + 2024-04-12 10:16:22,542 [INFO ] MainThread: WHERE + 2024-04-12 10:16:22,542 [INFO ] MainThread: (field = 'some') + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at > CAST('2024-04-11T18:10:02.120000' AS datetime2)) + 2024-04-12 10:16:22,542 [INFO ] MainThread: AND + 2024-04-12 10:16:22,542 [INFO ] MainThread: (updated_at <= CAST('2024-04-12T13:12:02.123000' AS datetime2)) + 2024-04-12 10:16:22,892 [INFO ] MainThread: |Spark| DataFrame successfully created from SQL statement + 
2024-04-12 10:16:22,892 [INFO ] MainThread: ------------------------------------ DBReader.run() ends ------------------------------------ + 2024-04-12 10:40:42,409 [INFO ] MainThread: =================================== DBWriter.run() starts =================================== + 2024-04-12 10:40:42,409 [INFO ] MainThread: |Spark| -> |Hive| Writing DataFrame to target using parameters: + 2024-04-12 10:40:42,410 [INFO ] MainThread: target = 'target_source_schema.table' + 2024-04-12 10:40:42,410 [INFO ] MainThread: options = { + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'mode': 'append', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'format': 'orc', + 2024-04-12 10:40:42,410 [INFO ] MainThread: 'partitionBy': 'part_dt', + 2024-04-12 10:40:42,410 [INFO ] MainThread: } + 2024-04-12 10:40:42,411 [INFO ] MainThread: df_schema: + 2024-04-12 10:40:42,412 [INFO ] MainThread: root + 2024-04-12 10:40:42,412 [INFO ] MainThread: |-- id: integer (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- new_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- old_value: string (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- updated_at: timestamp (nullable = true) + 2024-04-12 10:40:42,413 [INFO ] MainThread: |-- part_dt: date (nullable = true) + 2024-04-12 10:40:42,414 [INFO ] MainThread: + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Checking connection availability... + 2024-04-12 10:40:42,421 [INFO ] MainThread: |Hive| Using connection parameters: + 2024-04-12 10:40:42,421 [INFO ] MainThread: cluster = 'dwh' + 2024-04-12 10:40:42,475 [INFO ] MainThread: |Hive| Connection is available. + 2024-04-12 10:40:42,476 [INFO ] MainThread: |Hive| Fetching schema of table 'target_source_schema.table' ... + 2024-04-12 10:40:43,518 [INFO ] MainThread: |Hive| Schema fetched. 
+ 2024-04-12 10:40:43,521 [INFO ] MainThread: |Hive| Table 'target_source_schema.table' already exists + 2024-04-12 10:40:43,521 [WARNING ] MainThread: |Hive| User-specified options {'partitionBy': 'part_dt'} are ignored while inserting into existing table. Using only table parameters from Hive metastore + 2024-04-12 10:40:43,782 [INFO ] MainThread: |Hive| Inserting data into existing table 'target_source_schema.table' ... + 2024-04-12 11:06:07,396 [INFO ] MainThread: |Hive| Data is successfully inserted into table 'target_source_schema.table'. + 2024-04-12 11:06:07,397 [INFO ] MainThread: ------------------------------------ DBWriter.run() ends ------------------------------------ + 2024-04-12 11:06:07,397 [INFO ] MainThread: |onETL| Exiting IncrementalStrategy + 2024-04-12 11:06:07,397 [INFO ] MainThread: |IncrementalStrategy| Saving HWM to 'HorizonHWMStore': + 2024-04-12 11:06:07,397 [INFO ] MainThread: hwm = ColumnDateTimeHWM( + 2024-04-12 11:06:07,397 [INFO ] MainThread: name = 'updated_at#source_schema.table@mssql:/mssql.host:1433/somedb', + 2024-04-12 11:06:07,397 [INFO ] MainThread: entity = 'source_source_schema.table', + 2024-04-12 11:06:07,397 [INFO ] MainThread: expression = 'updated_at', + 2024-04-12 11:06:07,397 [INFO ] MainThread: value = datetime.datetime(2024, 4, 12, 13, 12, 2, 123000), + 2024-04-12 11:06:07,397 [INFO ] MainThread: ) + 2024-04-12 11:06:07,495 [INFO ] MainThread: |IncrementalStrategy| HWM has been saved + ``` + +Each step performed by onETL is extensively logged, which should help with debugging. + +You can make logs even more verbose by changing level to `DEBUG`: + +```python +from onetl.log import setup_logging + +setup_logging(level="DEBUG", enable_clients=True) + +# rest of code +... +``` + +This also changes log level for all underlying Python libraries, e.g. showing each HTTP request being made, and so on. 
+ +::: onetl.log + options: + members: + - setup_logging + - setup_clients_logging + - set_default_logging_format diff --git a/mddocs/docs/nav.md b/mddocs/docs/nav.md new file mode 100644 index 000000000..e38662c11 --- /dev/null +++ b/mddocs/docs/nav.md @@ -0,0 +1,224 @@ +* [About onETL](index.md) +* [Concepts](concepts.md) +* [Contribution Guide](contributing.md) +* [QuickStart](quickstart.md) +* [Logging](logging.md) +* [Plugins](plugins.md) +* [Security](security.md) +* [Connections](connection/index.md) + * [DB connections](connection/db_connection/index.md) + * [Clickhouse](connection/db_connection/clickhouse/index.md) + * [Connection](connection/db_connection/clickhouse/connection.md) + * [Prerequisites](connection/db_connection/clickhouse/prerequisites.md) + * [Execute](connection/db_connection/clickhouse/execute.md) + * [Read](connection/db_connection/clickhouse/read.md) + * [Write](connection/db_connection/clickhouse/write.md) + * [SQL](connection/db_connection/clickhouse/sql.md) + * [Types](connection/db_connection/clickhouse/types.md) + * [Greenplum](connection/db_connection/greenplum/index.md) + * [Connection](connection/db_connection/greenplum/connection.md) + * [Prerequisites](connection/db_connection/greenplum/prerequisites.md) + * [Execute](connection/db_connection/greenplum/execute.md) + * [Read](connection/db_connection/greenplum/read.md) + * [Write](connection/db_connection/greenplum/write.md) + * [Types](connection/db_connection/greenplum/types.md) + * [Hive](connection/db_connection/hive/index.md) + * [Connection](connection/db_connection/hive/connection.md) + * [Prerequisites](connection/db_connection/hive/prerequisites.md) + * [Execute](connection/db_connection/hive/execute.md) + * [Read](connection/db_connection/hive/read.md) + * [Write](connection/db_connection/hive/write.md) + * [SQL](connection/db_connection/hive/sql.md) + * [Slots](connection/db_connection/hive/slots.md) + * [Iceberg](connection/db_connection/iceberg/index.md) + * 
[Connection](connection/db_connection/iceberg/connection.md) + * [Prerequisites](connection/db_connection/iceberg/prerequisites.md) + * [Execute](connection/db_connection/iceberg/execute.md) + * [Read](connection/db_connection/iceberg/read.md) + * [Write](connection/db_connection/iceberg/write.md) + * [SQL](connection/db_connection/iceberg/sql.md) + * [Basic Auth](connection/db_connection/iceberg/auth_basic.md) + * [Bearer Auth](connection/db_connection/iceberg/auth_bearer.md) + * [OAuth2 Client Credentials](connection/db_connection/iceberg/auth_oauth2_client_credentials.md) + * [Catalog REST](connection/db_connection/iceberg/catalog_rest.md) + * [Catalog Filesystem](connection/db_connection/iceberg/catalog_filesystem.md) + * [Warehouse S3](connection/db_connection/iceberg/warehouse_s3.md) + * [Warehouse Filesystem](connection/db_connection/iceberg/warehouse_filesystem.md) + * [Warehouse Delegated](connection/db_connection/iceberg/warehouse_delegated.md) + * [Kafka](connection/db_connection/kafka/index.md) + * [Connection](connection/db_connection/kafka/connection.md) + * [Prerequisites](connection/db_connection/kafka/prerequisites.md) + * [Read](connection/db_connection/kafka/read.md) + * [Write](connection/db_connection/kafka/write.md) + * [Auth](connection/db_connection/kafka/auth.md) + * [Basic Auth](connection/db_connection/kafka/basic_auth.md) + * [Kerberos Auth](connection/db_connection/kafka/kerberos_auth.md) + * [Scram Auth](connection/db_connection/kafka/scram_auth.md) + * [protocol](connection/db_connection/kafka/protocol.md) + * [Plaintext protocol](connection/db_connection/kafka/plaintext_protocol.md) + * [SSL protocol](connection/db_connection/kafka/ssl_protocol.md) + * [Slots](connection/db_connection/kafka/slots.md) + * [Troubleshooting](connection/db_connection/kafka/troubleshooting.md) + * [MongoDB](connection/db_connection/mongodb/index.md) + * [Connection](connection/db_connection/mongodb/connection.md) + * 
[Prerequisites](connection/db_connection/mongodb/prerequisites.md) + * [Read](connection/db_connection/mongodb/read.md) + * [Write](connection/db_connection/mongodb/write.md) + * [Pipelines](connection/db_connection/mongodb/pipeline.md) + * [Types](connection/db_connection/mongodb/types.md) + * [MSSQL](connection/db_connection/mssql/index.md) + * [Connection](connection/db_connection/mssql/connection.md) + * [Prerequisites](connection/db_connection/mssql/prerequisites.md) + * [Execute](connection/db_connection/mssql/execute.md) + * [Read](connection/db_connection/mssql/read.md) + * [Write](connection/db_connection/mssql/write.md) + * [SQL](connection/db_connection/mssql/sql.md) + * [Types](connection/db_connection/mssql/types.md) + * [MySQL](connection/db_connection/mysql/index.md) + * [Connection](connection/db_connection/mysql/connection.md) + * [Prerequisites](connection/db_connection/mysql/prerequisites.md) + * [Execute](connection/db_connection/mysql/execute.md) + * [Read](connection/db_connection/mysql/read.md) + * [Write](connection/db_connection/mysql/write.md) + * [SQL](connection/db_connection/mysql/sql.md) + * [Types](connection/db_connection/mysql/types.md) + * [Oracle](connection/db_connection/oracle/index.md) + * [Connection](connection/db_connection/oracle/connection.md) + * [Prerequisites](connection/db_connection/oracle/prerequisites.md) + * [Execute](connection/db_connection/oracle/execute.md) + * [Read](connection/db_connection/oracle/read.md) + * [Write](connection/db_connection/oracle/write.md) + * [SQL](connection/db_connection/oracle/sql.md) + * [Types](connection/db_connection/oracle/types.md) + * [Postgres](connection/db_connection/postgres/index.md) + * [Connection](connection/db_connection/postgres/connection.md) + * [Prerequisites](connection/db_connection/postgres/prerequisites.md) + * [Execute](connection/db_connection/postgres/execute.md) + * [Read](connection/db_connection/postgres/read.md) + * 
[Write](connection/db_connection/postgres/write.md) + * [SQL](connection/db_connection/postgres/sql.md) + * [Types](connection/db_connection/postgres/types.md) + * [File connection](connection/file_connection/index.md) + * [FTP](connection/file_connection/ftp.md) + * [SFTP](connection/file_connection/sftp.md) + * [FTPS](connection/file_connection/ftps.md) + * [S3](connection/file_connection/s3.md) + * [Samba](connection/file_connection/samba.md) + * [WebDAV](connection/file_connection/webdav.md) + * [HDFS](connection/file_connection/hdfs/index.md) + * [Connection](connection/file_connection/hdfs/connection.md) + * [Slots](connection/file_connection/hdfs/slots.md) + * [File as DataFrame](connection/file_df_connection/index.md) + * [Base](connection/file_df_connection/base.md) + * [Local files](connection/file_df_connection/spark_local_fs.md) + * [HDFS Files](connection/file_df_connection/spark_hdfs/index.md) + * [Connection](connection/file_df_connection/spark_hdfs/connection.md) + * [Prerequisites](connection/file_df_connection/spark_hdfs/prerequisites.md) + * [Slots](connection/file_df_connection/spark_hdfs/slots.md) + * [S3 Files](connection/file_df_connection/spark_s3/index.md) + * [Connection](connection/file_df_connection/spark_s3/connection.md) + * [Prerequisites](connection/file_df_connection/spark_s3/prerequisites.md) + * [Troubleshooting](connection/file_df_connection/spark_s3/troubleshooting.md) +* [DB](db/index.md) + * [DBReader](db/reader.md) + * [DBWriter](db/writer.md) +* [File](file/index.md) + * [File Downloader](file/file_downloader/index.md) + * [File Downloader](file/file_downloader/file_downloader.md) + * [Options](file/file_downloader/options.md) + * [Result](file/file_downloader/result.md) + * [File Uploader](file/file_uploader/index.md) + * [File Uploader](file/file_uploader/file_uploader.md) + * [Options](file/file_uploader/options.md) + * [Result](file/file_uploader/result.md) + * [File Mover](file/file_mover/index.md) + * [File 
Mover](file/file_mover/file_mover.md) + * [Options](file/file_mover/options.md) + * [Result](file/file_mover/result.md) + * [File Filters](file/file_filters/index.md) + * [Base](file/file_filters/base.md) + * [Glob](file/file_filters/glob.md) + * [Regexp](file/file_filters/regexp.md) + * [ExcludeDir](file/file_filters/exclude_dir.md) + * [FileSizeRange](file/file_filters/file_size_filter.md) + * [FileModifiedTime](file/file_filters/file_mtime_filter.md) + * [Match all filters](file/file_filters/match_all_filters.md) + * [File Filter (legacy)](file/file_filters/file_filter.md) + * [File Limits](file/file_limits/index.md) + * [Base interface](file/file_limits/base.md) + * [MaxFilesCount](file/file_limits/max_files_count.md) + * [TotalFilesSize](file/file_limits/total_files_size.md) + * [limits_stop_at](file/file_limits/limits_stop_at.md) + * [limits_reached](file/file_limits/limits_reached.md) + * [reset_limits](file/file_limits/reset_limits.md) + * [File Limit [legacy]](file/file_limits/file_limit.md) +* [File as DataFrame](file_df/index.md) + * [Reader](file_df/file_df_reader/index.md) + * [File DataFrame Reader](file_df/file_df_reader/file_df_reader.md) + * [Options](file_df/file_df_reader/options.md) + * [Writer](file_df/file_df_writer/index.md) + * [File DataFrame Writer](file_df/file_df_writer/file_df_writer.md) + * [Options](file_df/file_df_writer/options.md) + * [File Formats](file_df/file_formats/index.md) + * [Base](file_df/file_formats/base.md) + * [Avro](file_df/file_formats/avro.md) + * [CSV](file_df/file_formats/csv.md) + * [Excel](file_df/file_formats/excel.md) + * [JSON](file_df/file_formats/json.md) + * [JSONLines](file_df/file_formats/jsonline.md) + * [ORC](file_df/file_formats/orc.md) + * [Parquet](file_df/file_formats/parquet.md) + * [XML](file_df/file_formats/xml.md) +* [Hooks](hooks/index.md) + * [High level design](hooks/design.md) + * [@hook decorator](hooks/hook.md) + * [@slot decorator](hooks/slot.md) + * [@support_hooks 
decorator](hooks/support_hooks.md) + * [Hooks global state](hooks/global_state.md) +* [HWM](hwm_store/index.md) + * [YAML HWM Store](hwm_store/yaml_hwm_store.md) +* [Install](install/index.md) + * [Full](install/full.md) + * [Minimal install](install/minimal.md) + * [Files](install/files.md) + * [Spark](install/spark.md) + * [Kerberos](install/kerberos.md) +* [Strategy](strategy/index.md) + * [Snapshot Strategy](strategy/snapshot_strategy.md) + * [Incremental Strategy](strategy/incremental_strategy.md) + * [Snapshot Batch Strategy](strategy/snapshot_batch_strategy.md) + * [Incremental Batch Strategy](strategy/incremental_batch_strategy.md) +* [Troubleshooting](troubleshooting/index.md) + * [Spark](troubleshooting/spark.md) +* [Changelog](changelog/index.md) + * [0.15.0](changelog/0.15.0.md) + * [0.14.1](changelog/0.14.1.md) + * [0.14.0](changelog/0.14.0.md) + * [0.13.5](changelog/0.13.5.md) + * [0.13.4](changelog/0.13.4.md) + * [0.13.3](changelog/0.13.3.md) + * [0.13.1](changelog/0.13.1.md) + * [0.13.0](changelog/0.13.0.md) + * [0.12.5](changelog/0.12.5.md) + * [0.12.4](changelog/0.12.4.md) + * [0.12.3](changelog/0.12.3.md) + * [0.12.2](changelog/0.12.2.md) + * [0.12.1](changelog/0.12.1.md) + * [0.12.0](changelog/0.12.0.md) + * [0.11.2](changelog/0.11.2.md) + * [0.11.1](changelog/0.11.1.md) + * [0.11.0](changelog/0.11.0.md) + * [0.10.2](changelog/0.10.2.md) + * [0.10.1](changelog/0.10.1.md) + * [0.10.0](changelog/0.10.0.md) + * [0.9.5](changelog/0.9.5.md) + * [0.9.4](changelog/0.9.4.md) + * [0.9.3](changelog/0.9.3.md) + * [0.9.2](changelog/0.9.2.md) + * [0.9.1](changelog/0.9.1.md) + * [0.9.0](changelog/0.9.0.md) + * [0.8.1](changelog/0.8.1.md) + * [0.8.0](changelog/0.8.0.md) + * [0.7.2](changelog/0.7.2.md) + * [0.7.1](changelog/0.7.1.md) + * [0.7.0](changelog/0.7.0.md) diff --git a/mddocs/docs/plugins.md b/mddocs/docs/plugins.md new file mode 100644 index 000000000..e28f87b27 --- /dev/null +++ b/mddocs/docs/plugins.md @@ -0,0 +1,143 @@ +# Plugins { 
#DBR-onetl-plugins } + +:octicons-versions-16: **version added 0.6.0** + +## What are plugins? { #DBR-onetl-plugins-what-are-plugins } + +### Terms { #DBR-onetl-plugins-terms } + +- `Plugin` - some Python package which implements some extra functionality for onETL, like [hooks][DBR-onetl-hooks] +- `Plugin autoimport` - onETL behavior which allows onETL to automatically import this package if it contains proper metadata (`entry_points`) + +### Features { #DBR-onetl-plugins-features } + +The plugins mechanism allows you to: + +- Automatically register [hooks][DBR-onetl-hooks] which can alter onETL behavior +- Automatically register new classes, like HWM type, HWM stores and so on + +### Limitations { #DBR-onetl-plugins-limitations } + +Unlike other projects (like *Airflow 1.x*), plugins do not inject imported classes or functions into the `onetl.*` namespace. +Users should import classes from the plugin package **explicitly** to avoid name collisions. + +## How to implement plugin? { #DBR-onetl-plugins-how-to-implement-plugin } + +Create a Python package `some-plugin` with a file `some_plugin/setup.py`: + +```python +# some_plugin/setup.py +from setuptools import setup + +setup( + # if you want to import something from onETL, add it to requirements list + install_requires=["onetl"], + entry_points={ + # this key enables plugins autoimport functionality + "onetl.plugins": [ + "some-plugin-name=some_plugin.module", # automatically import all module content + "some-plugin-class=some_plugin.module.internals:MyClass", # import a specific class + "some-plugin-function=some_plugin.module.internals:my_function", # import a specific function + ], + }, +) +``` + +See [setuptools documentation for entry_points](https://setuptools.pypa.io/en/latest/userguide/entry_point.html) + +## How plugins are imported? 
{ #DBR-onetl-plugins-how-plugins-are-imported } + +- User should install a package implementing the plugin: + +```bash +pip install some-package +``` + +- Then user should import something from `onetl` module or its submodules: + +```python +import onetl +from onetl.connection import SomeConnection + +# and so on +``` + +- This import automatically executes something like: + +```python +import some_plugin.module +from some_plugin.module.internals import MyClass +from some_plugin.module.internals import my_function +``` + +If specific module/class/function uses some registration capabilities of onETL, +like [`@hook` decorator][DBR-onetl-hooks-hook-decorator], it will be executed during this import. + +## How to enable/disable plugins? { #DBR-onetl-plugins-how-to-enabledisable-plugins } + +:octicons-versions-16: **version added 0.7.0** + +### Disable/enable all plugins { #DBR-onetl-plugins-disableenable-all-plugins } + +By default plugins are enabled. + +To disable them, you can set environment variable `ONETL_PLUGINS_ENABLED` to `false` BEFORE +importing onETL. This will disable all plugins autoimport. + +But the user is still able to explicitly import `some_plugin.module`, executing +all decorators and registration capabilities of onETL. + +### Disable a specific plugin (blacklist) { #DBR-onetl-plugins-disable-a-specific-plugin-blacklist } + +If some plugin is failing during import, you can disable it by setting up environment variable +`ONETL_PLUGINS_BLACKLIST=some-failing-plugin`. Multiple plugin names could be passed with `,` as delimiter. + +Again, this environment variable should be set BEFORE importing onETL. + +### Disable all plugins except a specific one (whitelist) { #DBR-onetl-plugins-disable-all-plugins-except-a-specific-one-whitelist } + +You can also disable all plugins except a specific one by setting up environment variable +`ONETL_PLUGINS_WHITELIST=some-not-failing-plugin`. Multiple plugin names could be passed with `,` as delimiter.
+ +Again, this environment variable should be set BEFORE importing onETL. + +If both whitelist and blacklist environment variables are set, blacklist has a higher priority. + +## How to see logs of the plugins mechanism? { #DBR-onetl-plugins-how-to-see-logs-of-the-plugins-mechanism } + +Plugins registration emits logs with `INFO` level: + +```python +import logging + +logging.basicConfig(level=logging.INFO) +``` + +```text +INFO |onETL| Found 2 plugins +INFO |onETL| Loading plugin 'my-plugin' +INFO |onETL| Skipping plugin 'failing' because it is in a blacklist +``` + +More detailed logs are emitted with `DEBUG` level, to make output less verbose: + +```python +import logging + +logging.basicConfig(level=logging.DEBUG) +``` + +```text +DEBUG |onETL| Searching for plugins with group 'onetl.plugins' +DEBUG |Plugins| Plugins whitelist: [] +DEBUG |Plugins| Plugins blacklist: ['failing-plugin'] +INFO |Plugins| Found 2 plugins +INFO |onETL| Loading plugin (1/2): +DEBUG name: 'my-plugin' +DEBUG package: 'my-package' +DEBUG version: '0.1.0' +DEBUG importing: 'my_package.my_module:MyClass' +DEBUG |onETL| Successfully loaded plugin 'my-plugin' +DEBUG source: '/usr/lib/python3.11/site-packages/my_package/my_module/my_class.py' +INFO |onETL| Skipping plugin 'failing' because it is in a blacklist +``` diff --git a/mddocs/docs/quickstart.md b/mddocs/docs/quickstart.md new file mode 100644 index 000000000..9b7963b79 --- /dev/null +++ b/mddocs/docs/quickstart.md @@ -0,0 +1,538 @@ +# onETL { #DBR-onetl-quickstart-onetl } + +{{ repo_status_badge }} +{{ pypi_release_bage }} +{{ pypi_license_bage }} +{{ pypi_pyversion_bage }} +{{ pypi_downloads_bage }} + +{{ docs_status_badge }} +{{ ci_status_badge }} +{{ precommit_badge }} + + +{{ onetl_logo_wide }} + +----8<---- +docs/snippet_0.md +----8<---- + + + +## Documentation { #DBR-onetl-quickstart-documentation } + +See at [ReadTheDocs](https://onetl.readthedocs.io/en/latest/) + +## How to install { #DBR-onetl-quickstart-how-to-install } + + 
+ +### Minimal installation { #DBR-onetl-quickstart-minimal-installation } + + + +Base `onetl` package contains: + +- `DBReader`, `DBWriter` and related classes +- `FileDownloader`, `FileUploader`, `FileMover` and related classes, like file filters & limits +- `FileDFReader`, `FileDFWriter` and related classes, like file formats +- Read Strategies & HWM classes +- Plugins support + +It can be installed via: + +```bash +pip install onetl +``` + +!!! warning + + This method does NOT include any connections. + + This method is recommended for use in third-party libraries which require for `onetl` to be installed, + but do not use its connection classes. + +### With DB and FileDF connections { #DBR-onetl-quickstart-with-db-and-filedf-connections } + + + +All DB connection classes (`Clickhouse`, `Greenplum`, `Hive` and others) +and all FileDF connection classes (`SparkHDFS`, `SparkLocalFS`, `SparkS3`) +require Spark to be installed. + + + +Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: + +```bash +yum install java-1.8.0-openjdk-devel # CentOS 7 | Spark 2 +dnf install java-11-openjdk-devel # CentOS 8 | Spark 3 +apt-get install openjdk-11-jdk # Debian-based | Spark 3 +``` + + + +#### Compatibility matrix { #DBR-onetl-quickstart-compatibility-matrix } + +| Spark | Python | Java | Scala | +| --------------------------------------------------------- | ---------- | ---------- | ----- | +| [2.3.x](https://spark.apache.org/docs/2.3.1/#downloading) | 3.7 only | 8 only | 2.11 | +| [2.4.x](https://spark.apache.org/docs/2.4.8/#downloading) | 3.7 only | 8 only | 2.11 | +| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | +| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 
3.13 | 8u371 - 20 | 2.12 | + + + +Then you should install PySpark by passing `spark` to `extras`: + +```bash +pip install onetl[spark] # install latest PySpark +``` + +or install PySpark explicitly: + +```bash +pip install onetl pyspark==3.5.5 # install a specific PySpark version +``` + +or inject PySpark into `sys.path` in some other way BEFORE creating a class instance. +**Otherwise connection object cannot be created.** + +### With File connections { #DBR-onetl-quickstart-with-file-connections } + + + +All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on) require specific Python clients to be installed. + +Each client can be installed explicitly by passing connector name (in lowercase) to `extras`: + +```bash +pip install onetl[ftp] # specific connector +pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors +``` + +To install all file connectors at once you can pass `files` to `extras`: + +```bash +pip install onetl[files] +``` + +**Otherwise class import will fail.** + +### With Kerberos support { #DBR-onetl-quickstart-with-kerberos-support } + + + +Most Hadoop instances are set up with Kerberos support, +so some connections require additional setup to work properly. + +- `HDFS` + Uses [requests-kerberos](https://pypi.org/project/requests-kerberos/) and + [GSSApi](https://pypi.org/project/gssapi/) for authentication. + It also uses `kinit` executable to generate Kerberos ticket. +- `Hive` and `SparkHDFS` + require Kerberos ticket to exist before creating Spark session. 
+ +So you need to install OS packages with: + +- `krb5` libs +- Headers for `krb5` +- `gcc` or other compiler for C sources + +The exact installation instruction depends on your OS, here are some examples: + +```bash +apt install libkrb5-dev krb5-user gcc # Debian-based +dnf install krb5-devel krb5-libs krb5-workstation gcc # CentOS, OracleLinux +``` + +Also you should pass `kerberos` to `extras` to install required Python packages: + +```bash +pip install onetl[kerberos] +``` + +### Full bundle { #DBR-onetl-quickstart-full-bundle } + + + +To install all connectors and dependencies, you can pass `all` into `extras`: + +```bash +pip install onetl[all] + +# this is just the same as +pip install onetl[spark,files,kerberos] +``` + +!!! warning + + This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. + + + +## Quick start { #DBR-onetl-quickstart-quick-start } + +### MSSQL → Hive { #DBR-onetl-quickstart-mssql-hive } + +Read data from MSSQL, transform & write to Hive. 
+ +```bash +# install onETL and PySpark +pip install onetl[spark] +``` + +```python +# Import pyspark to initialize the SparkSession +from pyspark.sql import SparkSession + +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import MSSQL, Hive + +# Import onETL classes to read & write data +from onetl.db import DBReader, DBWriter + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize new SparkSession with MSSQL driver loaded +maven_packages = MSSQL.get_packages() +spark = ( + SparkSession.builder.appName("spark_app_onetl_demo") + .config("spark.jars.packages", ",".join(maven_packages)) + .enableHiveSupport() # for Hive + .getOrCreate() +) + +# Initialize MSSQL connection and check if database is accessible +mssql = MSSQL( + host="mssqldb.demo.com", + user="onetl", + password="onetl", + database="Telecom", + spark=spark, + # These options are passed to MSSQL JDBC Driver: + extra={"applicationIntent": "ReadOnly"}, +).check() + +# >>> INFO:|MSSQL| Connection is available + +# Initialize DBReader +reader = DBReader( + connection=mssql, + source="dbo.demo_table", + columns=["on", "etl"], + # Set some MSSQL read options: + options=MSSQL.ReadOptions(fetchsize=10000), +) + +# checks that there is data in the table, otherwise raises exception +reader.raise_if_no_data() + +# Read data to DataFrame +df = reader.run() +df.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) + +# Apply any PySpark transformations +from pyspark.sql.functions import lit + +df_to_write = df.withColumn("engine", lit("onetl")) +df_to_write.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string 
(nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) +# |-- engine: string (nullable = false) + +# Initialize Hive connection +hive = Hive(cluster="rnd-dwh", spark=spark) + +# Initialize DBWriter +db_writer = DBWriter( + connection=hive, + target="dl_sb.demo_table", + # Set some Hive write options: + options=Hive.WriteOptions(if_exists="replace_entire_table"), +) + +# Write data from DataFrame to Hive +db_writer.run(df_to_write) + +# Success! +``` + +### SFTP → HDFS { #DBR-onetl-quickstart-sftp-hdfs } + +Download files from SFTP & upload them to HDFS. + +```bash +# install onETL with SFTP and HDFS clients, and Kerberos support +pip install onetl[hdfs,sftp,kerberos] +``` + +```python +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import SFTP, HDFS + +# Import onETL classes to download & upload files +from onetl.file import FileDownloader, FileUploader + +# import filter & limit classes +from onetl.file.filter import Glob, ExcludeDir +from onetl.file.limit import MaxFilesCount + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize SFTP connection and check it +sftp = SFTP( + host="sftp.test.com", + user="someuser", + password="somepassword", +).check() + +# >>> INFO:|SFTP| Connection is available + +# Initialize downloader +file_downloader = FileDownloader( + connection=sftp, + source_path="/remote/tests/Report", # path on SFTP + local_path="/local/onetl/Report", # local fs path + filters=[ + # download only files matching the glob + Glob("*.csv"), + # exclude files from this directory + ExcludeDir("/remote/tests/Report/exclude_dir/"), + ], + limits=[ + # download max 1000 files per run + MaxFilesCount(1000), + ], + options=FileDownloader.Options( + # delete files from SFTP after 
successful download + delete_source=True, + # mark file as failed if it already exist in local_path + if_exists="error", + ), +) + +# Download files to local filesystem +download_result = file_downloader.run() + +# Method run returns a DownloadResult object, +# which contains collection of downloaded files, divided to 4 categories +download_result + +# DownloadResult( +# successful=[ +# LocalPath('/local/onetl/Report/file_1.json'), +# LocalPath('/local/onetl/Report/file_2.json'), +# ], +# failed=[FailedRemoteFile('/remote/onetl/Report/file_3.json')], +# ignored=[RemoteFile('/remote/onetl/Report/file_4.json')], +# missing=[], +# ) + +# Raise exception if there are failed files, or there were no files in the remote filesystem +download_result.raise_if_failed() or download_result.raise_if_empty() + +# Do any kind of magic with files: rename files, remove header for csv files, ... +renamed_files = my_rename_function(download_result.success) + +# function removed "_" from file names +# [ +# LocalPath('/home/onetl/Report/file1.json'), +# LocalPath('/home/onetl/Report/file2.json'), +# ] + +# Initialize HDFS connection +hdfs = HDFS( + host="my.name.node", + user="someuser", + password="somepassword", # or keytab +) + +# Initialize uploader +file_uploader = FileUploader( + connection=hdfs, + target_path="/user/onetl/Report/", # hdfs path +) + +# Upload files from local fs to HDFS +upload_result = file_uploader.run(renamed_files) + +# Method run returns a UploadResult object, +# which contains collection of uploaded files, divided to 4 categories +upload_result + +# UploadResult( +# successful=[RemoteFile('/user/onetl/Report/file1.json')], +# failed=[FailedLocalFile('/local/onetl/Report/file2.json')], +# ignored=[], +# missing=[], +# ) + +# Raise exception if there are failed files, or there were no files in the local filesystem, or some input file is missing +upload_result.raise_if_failed() or upload_result.raise_if_empty() or upload_result.raise_if_missing() + +# Success! 
+``` + +### S3 → Postgres { #DBR-onetl-quickstart-s3-postgres } + +Read files directly from S3 path, convert them to dataframe, transform it and then write to a database. + +```bash +# install onETL and PySpark +pip install onetl[spark] +``` + +```python +# Import pyspark to initialize the SparkSession +from pyspark.sql import SparkSession + +# import function to setup onETL logging +from onetl.log import setup_logging + +# Import required connections +from onetl.connection import Postgres, SparkS3 + +# Import onETL classes to read files +from onetl.file import FileDFReader +from onetl.file.format import CSV + +# Import onETL classes to write data +from onetl.db import DBWriter + +# change logging level to INFO, and set up default logging format and handler +setup_logging() + +# Initialize new SparkSession with Hadoop AWS libraries and Postgres driver loaded +maven_packages = SparkS3.get_packages(spark_version="3.5.5") + Postgres.get_packages() +exclude_packages = SparkS3.get_exclude_packages() +spark = ( + SparkSession.builder.appName("spark_app_onetl_demo") + .config("spark.jars.packages", ",".join(maven_packages)) + .config("spark.jars.excludes", ",".join(exclude_packages)) + .getOrCreate() +) + +# Initialize S3 connection and check it +spark_s3 = SparkS3( + host="s3.test.com", + protocol="https", + bucket="my-bucket", + access_key="somekey", + secret_key="somesecret", + # Access bucket as s3.test.com/my-bucket + extra={"path.style.access": True}, + spark=spark, +).check() + +# >>> INFO:|SparkS3| Connection is available + +# Describe file format and parsing options +csv = CSV( + delimiter=";", + header=True, + encoding="utf-8", +) + +# Describe DataFrame schema of files +from pyspark.sql.types import ( + DateType, + DoubleType, + IntegerType, + StringType, + StructField, + StructType, + TimestampType, +) + +df_schema = StructType( + [ + StructField("id", IntegerType()), + StructField("phone_number", StringType()), + StructField("region", StringType()), + 
StructField("birth_date", DateType()), + StructField("registered_at", TimestampType()), + StructField("account_balance", DoubleType()), + ], +) + +# Initialize file df reader +reader = FileDFReader( + connection=spark_s3, + source_path="/remote/tests/Report", # path on S3 there *.csv files are located + format=csv, # file format with specific parsing options + df_schema=df_schema, # columns & types +) + +# Read files directly from S3 as Spark DataFrame +df = reader.run() + +# Check that DataFrame schema is same as expected +df.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) + +# Apply any PySpark transformations +from pyspark.sql.functions import lit + +df_to_write = df.withColumn("engine", lit("onetl")) +df_to_write.printSchema() +# root +# |-- id: integer (nullable = true) +# |-- phone_number: string (nullable = true) +# |-- region: string (nullable = true) +# |-- birth_date: date (nullable = true) +# |-- registered_at: timestamp (nullable = true) +# |-- account_balance: double (nullable = true) +# |-- engine: string (nullable = false) + +# Initialize Postgres connection +postgres = Postgres( + host="192.169.11.23", + user="onetl", + password="somepassword", + database="mydb", + spark=spark, +) + +# Initialize DBWriter +db_writer = DBWriter( + connection=postgres, + # write to specific table + target="public.my_table", + # with some writing options + options=Postgres.WriteOptions(if_exists="append"), +) + +# Write DataFrame to Postgres table +db_writer.run(df_to_write) + +# Success! 
+``` diff --git a/mddocs/docs/security.md b/mddocs/docs/security.md new file mode 100644 index 000000000..1111f2ae1 --- /dev/null +++ b/mddocs/docs/security.md @@ -0,0 +1,25 @@ +# Security { #DBR-onetl-security } + +## Supported Python versions { #DBR-onetl-security-supported-python-versions } + +3.7 or above + +## Product development security recommendations { #DBR-onetl-security-product-development-security-recommendations } + +1. Update dependencies to last stable version +2. Build SBOM for the project +3. Perform SAST (Static Application Security Testing) where possible + +## Product development security requirements { #DBR-onetl-security-product-development-security-requirements } + +1. No binaries in repository +2. No passwords, keys, access tokens in source code +3. No “Critical” and/or “High” vulnerabilities in contributed source code + +## Vulnerability reports { #DBR-onetl-security-vulnerability-reports } + +Please, use email [mailto:onetools@mts.ru](mailto:onetools@mts.ru) for reporting security issues or anything that can cause any consequences for security. + +Please avoid any public disclosure (including registering issues) at least until it is fixed. + +Thank you in advance for understanding. diff --git a/mddocs/docs/snippet_0.md b/mddocs/docs/snippet_0.md new file mode 100644 index 000000000..7bfa97234 --- /dev/null +++ b/mddocs/docs/snippet_0.md @@ -0,0 +1,43 @@ +## What is onETL? { #DBR-onetl-snippet-0-what-is-onetl } + +Python ETL/ELT library powered by [Apache Spark](https://spark.apache.org/) & other open-source tools. + +## Goals { #DBR-onetl-snippet-0-goals } + +- Provide unified classes to extract data from (**E**) & load data to (**L**) various stores. +- Provides [Spark DataFrame API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) for performing transformations (**T**) in terms of *ETL*. 
+- Provide direct access to databases, allowing you to execute SQL queries, as well as DDL, DML, and call functions/procedures. This can be used for building up *ELT* pipelines. +- Support different [read strategies][DBR-onetl-strategy-read-strategies] for incremental and batch data fetching. +- Provide [hooks][DBR-onetl-hooks] & [plugins][DBR-onetl-plugins] mechanism for altering behavior of internal classes. + +## Non-goals { #DBR-onetl-snippet-0-non-goals } + +- onETL is not a Spark replacement. It just provides additional functionality that Spark does not have, and improves UX for end users. +- onETL is not a framework, as it does not have requirements for project structure, naming, the way of running ETL/ELT processes, configuration, etc. All of that should be implemented in some other tool. +- onETL is deliberately developed without any integration with scheduling software like Apache Airflow. All integrations should be implemented as separate tools. +- Only batch operations, no streaming. For streaming prefer [Apache Flink](https://flink.apache.org/). + +## Requirements { #DBR-onetl-snippet-0-requirements } + +- **Python** 3.7 - 3.13 +- PySpark 2.3.x - 3.5.x (depends on used connector) +- Java 8+ (required by Spark, see below) +- Kerberos libs & GCC (required by `Hive`, `HDFS` and `SparkHDFS` connectors) + +## Supported storages { #DBR-onetl-snippet-0-supported-storages } + +| Type | Storage | Powered by | |--------------------|--------------|-------------------------------------------------------------------------------------------------------------------------| | Database {: rowspan=5} | Clickhouse
MSSQL
MySQL
Postgres
Oracle
Teradata |

Apache Spark [JDBC Data Source](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html) | +| Hive | Apache Spark [Hive integration](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html) | +| Kafka | Apache Spark [Kafka integration](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html) | +| Greenplum | VMware [Greenplum Spark connector](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/index.html) | +| MongoDB | [MongoDB Spark connector](https://www.mongodb.com/docs/spark-connector/current) | +| File {: rowspan=6} | HDFS | [HDFS Python client](https://pypi.org/project/hdfs/) | +| S3 | [minio-py client](https://pypi.org/project/minio/) | +| SFTP | [Paramiko library](https://pypi.org/project/paramiko/) | +| FTP
FTPS | [FTPUtil library](https://pypi.org/project/ftputil/) | +| WebDAV | [WebdavClient3 library](https://pypi.org/project/webdavclient3/) | +| Samba | [pysmb library](https://pypi.org/project/pysmb/) | +| Files as DataFrame {: rowspan=2} | SparkLocalFS
SparkHDFS | Apache Spark [File Data Source](https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html) | +| SparkS3 | [Hadoop AWS](https://hadoop.apache.org/docs/current3/hadoop-aws/tools/hadoop-aws/index.html) library | diff --git a/mddocs/strategy/incremental_batch_strategy.md b/mddocs/docs/strategy/incremental_batch_strategy.md similarity index 78% rename from mddocs/strategy/incremental_batch_strategy.md rename to mddocs/docs/strategy/incremental_batch_strategy.md index 6cda393e6..09bad06f5 100644 --- a/mddocs/strategy/incremental_batch_strategy.md +++ b/mddocs/docs/strategy/incremental_batch_strategy.md @@ -1,6 +1,6 @@ -# Incremental Batch Strategy { #incremental-batch-strategy-0 } +# Incremental Batch Strategy { #DBR-onetl-strategy-incremental-batch-strategy } - ::: onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions diff --git a/mddocs/docs/connection/db_connection/greenplum/connection.md b/mddocs/docs/connection/db_connection/greenplum/connection.md index 823855c9d..94c5bec79 100644 --- a/mddocs/docs/connection/db_connection/greenplum/connection.md +++ b/mddocs/docs/connection/db_connection/greenplum/connection.md @@ -1,6 +1,6 @@ # Greenplum connection { #DBR-onetl-connection-db-connection-greenplum-connection-0 } -> E : CHECK IF TABLE EXISTS gp_table E -->> A : TABLE EXISTS A ->> E : SHOW SCHEMA FOR gp_table E -->> A : (id bigint, col1 int, col2 text, ...) - + note over A,H: == DBReader.run() == - + A ->> B: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 A ->> C: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 A ->> D: START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N - + note right of A : This is done in parallel,
executors are independent
|
|
|
V B ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...)
USING address=executor1_host:executor1_port
INSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 note right of E : Each white vertical line here is a opened connection to master.
Usually, **N+1** connections are created from Spark to Greenplum master activate E E -->> F: SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 note right of F : No direct requests between Greenplum segments & Spark driver.
Data transfer is always initiated by Greenplum segments. - + C ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port
INSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 activate E E -->> G: SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 - + D ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...)
USING address=executorN_host:executorN_port
INSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N activate E E -->> H: SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN - + F -xB: INITIALIZE CONNECTION TO Spark executor1
PUSH DATA TO Spark executor1 note left of B : Circle is an open GPFDIST port,
listened by executor - + G -xC: INITIALIZE CONNECTION TO Spark executor2
PUSH DATA TO Spark executor2 H -xD: INITIALIZE CONNECTION TO Spark executorN
PUSH DATA TO Spark executorN - + note over A,H: == Spark.stop() == - + B -->> E : DROP TABLE spark_executor1 deactivate E C -->> E : DROP TABLE spark_executor2 deactivate E D -->> E : DROP TABLE spark_executorN deactivate E - + B -->> A: DONE C -->> A: DONE D -->> A: DONE - + A -->> E : CLOSE CONNECTION deactivate E deactivate A @@ -422,7 +422,7 @@ You should use [UNLOGGED](https://docs.vmware.com/en/VMware-Greenplum/7/greenplu ## Options { #DBR-onetl-connection-db-connection-greenplum-read-options } -> F: SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 activate F - + note right of F : No direct requests between Greenplum segments & Spark.
Data transfer is always initiated by Greenplum segments. C ->> E: CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...)
USING address=executor2_host:executor2_port
INSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 @@ -176,7 +176,7 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection E -->> H: SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN activate H - + F -xB: INITIALIZE CONNECTION TO Spark executor1
PUSH DATA TO Spark executor1 deactivate F note left of B : Circle is an open GPFDIST port,
listened by executor @@ -202,7 +202,7 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection D -->> A: DONE deactivate D - + A -->> E: CLOSE CONNECTION deactivate E deactivate A @@ -210,7 +210,7 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection ## Options { #DBR-onetl-connection-db-connection-greenplum-write-options } - ::: onetl.connection.db_connection.hive.options.HiveWriteOptions diff --git a/mddocs/docs/connection/db_connection/kafka/auth.md b/mddocs/docs/connection/db_connection/kafka/auth.md index ce2bd7d77..412ef14b5 100644 --- a/mddocs/docs/connection/db_connection/kafka/auth.md +++ b/mddocs/docs/connection/db_connection/kafka/auth.md @@ -1,6 +1,6 @@ # Kafka Auth { #DBR-onetl-connection-db-connection-kafka-auth } - ::: onetl.connection.db_connection.kafka.connection.Kafka diff --git a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md index 7034fbc45..7d4ebc3be 100644 --- a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md +++ b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md @@ -1,6 +1,6 @@ # Kafka KerberosAuth { #DBR-onetl-connection-db-connection-kafka-kerberos-auth-kafka-kerberosauth } - ::: onetl.connection.db_connection.kafka.kafka_plaintext_protocol.KafkaPlaintextProtocol diff --git a/mddocs/docs/connection/db_connection/kafka/protocol.md b/mddocs/docs/connection/db_connection/kafka/protocol.md index df7186094..c08c1cc70 100644 --- a/mddocs/docs/connection/db_connection/kafka/protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/protocol.md @@ -1,6 +1,6 @@ # Kafka Protocol { #DBR-onetl-connection-db-connection-kafka-protocol } - ::: onetl.connection.db_connection.mssql.connection.MSSQL diff --git a/mddocs/docs/connection/db_connection/mssql/execute.md b/mddocs/docs/connection/db_connection/mssql/execute.md index 858354c73..646e7e2b7 100644 --- 
a/mddocs/docs/connection/db_connection/mssql/execute.md +++ b/mddocs/docs/connection/db_connection/mssql/execute.md @@ -89,7 +89,7 @@ This method supports **any** query syntax supported by MSSQL, like: ## Options { #DBR-onetl-connection-db-connection-mssql-execute-options } - ::: onetl.connection.file_connection.ftp.FTP diff --git a/mddocs/docs/connection/file_connection/ftps.md b/mddocs/docs/connection/file_connection/ftps.md index a1129c2b6..ff71c80ea 100644 --- a/mddocs/docs/connection/file_connection/ftps.md +++ b/mddocs/docs/connection/file_connection/ftps.md @@ -1,6 +1,6 @@ # FTPS connection { #DBR-onetl-connection-file-connection-ftps-connection } - ::: onetl.connection.file_connection.hdfs.connection.HDFS diff --git a/mddocs/docs/connection/file_connection/hdfs/slots.md b/mddocs/docs/connection/file_connection/hdfs/slots.md index 33b501a1a..21a9a5269 100644 --- a/mddocs/docs/connection/file_connection/hdfs/slots.md +++ b/mddocs/docs/connection/file_connection/hdfs/slots.md @@ -1,6 +1,6 @@ # HDFS Slots { #DBR-onetl-connection-file-connection-hdfs-slots } - ::: onetl.connection.file_connection.hdfs.slots.HDFSSlots diff --git a/mddocs/docs/connection/file_connection/s3.md b/mddocs/docs/connection/file_connection/s3.md index 9d9682d11..527fcb646 100644 --- a/mddocs/docs/connection/file_connection/s3.md +++ b/mddocs/docs/connection/file_connection/s3.md @@ -1,6 +1,6 @@ # S3 connection { #DBR-onetl-connection-file-connection-s3-connection } - ::: onetl.connection.file_connection.samba.Samba diff --git a/mddocs/docs/connection/file_connection/sftp.md b/mddocs/docs/connection/file_connection/sftp.md index 05d42ae7b..d7c9ca687 100644 --- a/mddocs/docs/connection/file_connection/sftp.md +++ b/mddocs/docs/connection/file_connection/sftp.md @@ -1,6 +1,6 @@ # SFTP connection { #DBR-onetl-connection-file-connection-sftp-connection } - ::: onetl.connection.file_connection.sftp.SFTP diff --git a/mddocs/docs/connection/file_connection/webdav.md 
b/mddocs/docs/connection/file_connection/webdav.md index 39d974278..1c939aadd 100644 --- a/mddocs/docs/connection/file_connection/webdav.md +++ b/mddocs/docs/connection/file_connection/webdav.md @@ -1,6 +1,6 @@ # WebDAV connection { #DBR-onetl-connection-file-connection-webdav-connection } - ::: onetl.connection.file_connection.webdav.WebDAV diff --git a/mddocs/docs/connection/file_df_connection/base.md b/mddocs/docs/connection/file_df_connection/base.md index d0692d67f..2f20b0da8 100644 --- a/mddocs/docs/connection/file_df_connection/base.md +++ b/mddocs/docs/connection/file_df_connection/base.md @@ -1,6 +1,6 @@ # Base interface { #DBR-onetl-connection-file-df-connection-base-interface } - ::: onetl.db.db_reader.db_reader.DBReader diff --git a/mddocs/docs/db/writer.md b/mddocs/docs/db/writer.md index 7015eeb22..9a0b83c22 100644 --- a/mddocs/docs/db/writer.md +++ b/mddocs/docs/db/writer.md @@ -1,6 +1,6 @@ # DB Writer { #DBR-onetl-db-writer } - ::: onetl.db.db_writer.db_writer.DBWriter diff --git a/mddocs/docs/file/file_downloader/file_downloader.md b/mddocs/docs/file/file_downloader/file_downloader.md index cd38edefc..5adc98f65 100644 --- a/mddocs/docs/file/file_downloader/file_downloader.md +++ b/mddocs/docs/file/file_downloader/file_downloader.md @@ -1,6 +1,6 @@ # File Downloader { #DBR-onetl-file-downloader-0 } - ::: onetl.file.file_downloader.file_downloader.FileDownloader diff --git a/mddocs/docs/file/file_downloader/result.md b/mddocs/docs/file/file_downloader/result.md index 9fe3c714e..de660371f 100644 --- a/mddocs/docs/file/file_downloader/result.md +++ b/mddocs/docs/file/file_downloader/result.md @@ -1,6 +1,6 @@ # File Downloader Result { #DBR-onetl-file-downloader-result } - ::: onetl.file.file_downloader.result.DownloadResult diff --git a/mddocs/docs/file/file_filters/base.md b/mddocs/docs/file/file_filters/base.md index 02e39c13d..f10c994bc 100644 --- a/mddocs/docs/file/file_filters/base.md +++ b/mddocs/docs/file/file_filters/base.md @@ -1,6 +1,6 @@ # 
Base interface { #DBR-onetl-file-filters-base-interface } - ::: onetl.base.base_file_filter.BaseFileFilter diff --git a/mddocs/docs/file/file_filters/exclude_dir.md b/mddocs/docs/file/file_filters/exclude_dir.md index d642862a5..978ba1748 100644 --- a/mddocs/docs/file/file_filters/exclude_dir.md +++ b/mddocs/docs/file/file_filters/exclude_dir.md @@ -1,6 +1,6 @@ # ExcludeDir { #DBR-onetl-file-filters-exclude-dir-excludedir } - ::: onetl.file.filter.exclude_dir.ExcludeDir diff --git a/mddocs/docs/file/file_filters/file_filter.md b/mddocs/docs/file/file_filters/file_filter.md index 337e9bef2..9eae6dba6 100644 --- a/mddocs/docs/file/file_filters/file_filter.md +++ b/mddocs/docs/file/file_filters/file_filter.md @@ -1,6 +1,6 @@ # File Filter (legacy) { #DBR-onetl-file-filters-file-filter-legacy } - ::: onetl.core.file_filter.file_filter.FileFilter diff --git a/mddocs/docs/file/file_filters/file_mtime_filter.md b/mddocs/docs/file/file_filters/file_mtime_filter.md index 50db64095..4d71e97af 100644 --- a/mddocs/docs/file/file_filters/file_mtime_filter.md +++ b/mddocs/docs/file/file_filters/file_mtime_filter.md @@ -1,6 +1,6 @@ # FileModifiedTime { #DBR-onetl-file-filters-file-mtime-filter-filemodifiedtime } - ::: onetl.file.filter.file_mtime.FileModifiedTime diff --git a/mddocs/docs/file/file_filters/file_size_filter.md b/mddocs/docs/file/file_filters/file_size_filter.md index 13c102927..b57a15583 100644 --- a/mddocs/docs/file/file_filters/file_size_filter.md +++ b/mddocs/docs/file/file_filters/file_size_filter.md @@ -1,6 +1,6 @@ # FileSizeRange { #DBR-onetl-file-filters-file-size-filter-filesizerange } - ::: onetl.file.filter.file_size.FileSizeRange diff --git a/mddocs/docs/file/file_filters/glob.md b/mddocs/docs/file/file_filters/glob.md index 797557607..e98d1d6a0 100644 --- a/mddocs/docs/file/file_filters/glob.md +++ b/mddocs/docs/file/file_filters/glob.md @@ -1,6 +1,6 @@ # Glob { #DBR-onetl-file-filters-glob } - ::: onetl.file.filter.glob.Glob diff --git 
a/mddocs/docs/file/file_filters/match_all_filters.md b/mddocs/docs/file/file_filters/match_all_filters.md index 98c3f32cf..8d9a2f457 100644 --- a/mddocs/docs/file/file_filters/match_all_filters.md +++ b/mddocs/docs/file/file_filters/match_all_filters.md @@ -1,13 +1,13 @@ # match_all_filters { #DBR-onetl-file-filters-match-all-filters } - ::: onetl.file.filter.match_all_filters diff --git a/mddocs/docs/file/file_filters/regexp.md b/mddocs/docs/file/file_filters/regexp.md index e0e47d479..f91490404 100644 --- a/mddocs/docs/file/file_filters/regexp.md +++ b/mddocs/docs/file/file_filters/regexp.md @@ -1,6 +1,6 @@ # Regexp { #DBR-onetl-file-filters-regexp } - diff --git a/mddocs/docs/file/file_limits/base.md b/mddocs/docs/file/file_limits/base.md index eadf374e1..02f155336 100644 --- a/mddocs/docs/file/file_limits/base.md +++ b/mddocs/docs/file/file_limits/base.md @@ -1,6 +1,6 @@ # Base interface { #DBR-onetl-file-limits-base-interface } - ::: onetl.base.base_file_limit.BaseFileLimit diff --git a/mddocs/docs/file/file_limits/file_limit.md b/mddocs/docs/file/file_limits/file_limit.md index ecb1fa5e5..cefdcdc26 100644 --- a/mddocs/docs/file/file_limits/file_limit.md +++ b/mddocs/docs/file/file_limits/file_limit.md @@ -1,6 +1,6 @@ # File Limit (legacy) { #DBR-onetl-file-limits-file-limit-legacy } - ::: onetl.core.file_limit.file_limit.FileLimit diff --git a/mddocs/docs/file/file_limits/limits_reached.md b/mddocs/docs/file/file_limits/limits_reached.md index e0785db31..2a292d67b 100644 --- a/mddocs/docs/file/file_limits/limits_reached.md +++ b/mddocs/docs/file/file_limits/limits_reached.md @@ -1,6 +1,6 @@ # limits_reached { #DBR-onetl-file-limits-limits-reached } - ::: onetl.file.limit.limits_stop_at diff --git a/mddocs/docs/file/file_limits/max_files_count.md b/mddocs/docs/file/file_limits/max_files_count.md index 513664da2..5c59cb177 100644 --- a/mddocs/docs/file/file_limits/max_files_count.md +++ b/mddocs/docs/file/file_limits/max_files_count.md @@ -1,6 +1,6 @@ # 
MaxFilesCount { #DBR-onetl-file-limits-max-files-count-maxfilescount } - ::: onetl.file.limit.max_files_count.MaxFilesCount diff --git a/mddocs/docs/file/file_limits/reset_limits.md b/mddocs/docs/file/file_limits/reset_limits.md index e3e3a495c..d3766c6d9 100644 --- a/mddocs/docs/file/file_limits/reset_limits.md +++ b/mddocs/docs/file/file_limits/reset_limits.md @@ -1,13 +1,13 @@ # reset_limits { #DBR-onetl-file-limits-reset-limits } - ::: onetl.file.limit.reset_limits diff --git a/mddocs/docs/file/file_limits/total_files_size.md b/mddocs/docs/file/file_limits/total_files_size.md index f641dc406..3b91b6043 100644 --- a/mddocs/docs/file/file_limits/total_files_size.md +++ b/mddocs/docs/file/file_limits/total_files_size.md @@ -1,6 +1,6 @@ # TotalFilesSize { #DBR-onetl-file-limits-total-files-size-totalfilessize } - ::: onetl.file.file_uploader.options.FileUploaderOptions diff --git a/mddocs/docs/file/file_uploader/result.md b/mddocs/docs/file/file_uploader/result.md index 785cd590d..59933d8e7 100644 --- a/mddocs/docs/file/file_uploader/result.md +++ b/mddocs/docs/file/file_uploader/result.md @@ -1,6 +1,6 @@ # File Uploader Result { #DBR-onetl-file-uploader-result } - ::: onetl.file.file_uploader.result.UploadResult diff --git a/mddocs/docs/file_df/file_df_reader/file_df_reader.md b/mddocs/docs/file_df/file_df_reader/file_df_reader.md index e9f3494fe..068d32c19 100644 --- a/mddocs/docs/file_df/file_df_reader/file_df_reader.md +++ b/mddocs/docs/file_df/file_df_reader/file_df_reader.md @@ -1,6 +1,6 @@ # FileDF Reader { #DBR-onetl-file-df-reader-filedf-reader-0 } - ::: onetl.file.file_df_reader.file_df_reader.FileDFReader diff --git a/mddocs/docs/file_df/file_df_reader/options.md b/mddocs/docs/file_df/file_df_reader/options.md index ba85c39b7..fd672a766 100644 --- a/mddocs/docs/file_df/file_df_reader/options.md +++ b/mddocs/docs/file_df/file_df_reader/options.md @@ -1,6 +1,6 @@ # Options { #DBR-onetl-file-df-reader-options } - ::: 
onetl.file.file_df_writer.options.FileDFWriterOptions diff --git a/mddocs/docs/file_df/file_formats/avro.md b/mddocs/docs/file_df/file_formats/avro.md index 8c2c0fdec..92657a064 100644 --- a/mddocs/docs/file_df/file_formats/avro.md +++ b/mddocs/docs/file_df/file_formats/avro.md @@ -1,6 +1,6 @@ # Avro { #DBR-onetl-file-df-file-formats-avro } - ::: onetl.file.format.parquet.Parquet diff --git a/mddocs/docs/file_df/file_formats/xml.md b/mddocs/docs/file_df/file_formats/xml.md index de63d5455..6785e3709 100644 --- a/mddocs/docs/file_df/file_formats/xml.md +++ b/mddocs/docs/file_df/file_formats/xml.md @@ -1,6 +1,6 @@ # XML { #DBR-onetl-file-df-file-formats-xml } - ::: onetl.hooks.hook.hook diff --git a/mddocs/docs/hooks/slot.md b/mddocs/docs/hooks/slot.md index e404abf56..2a2c3ad38 100644 --- a/mddocs/docs/hooks/slot.md +++ b/mddocs/docs/hooks/slot.md @@ -1,6 +1,6 @@ # `@slot` decorator { #DBR-onetl-hooks-slot-decorator } - - ::: onetl.hooks.support_hooks diff --git a/mddocs/docs/hwm_store/yaml_hwm_store.md b/mddocs/docs/hwm_store/yaml_hwm_store.md index 04fe9d938..4f44e323e 100644 --- a/mddocs/docs/hwm_store/yaml_hwm_store.md +++ b/mddocs/docs/hwm_store/yaml_hwm_store.md @@ -1,6 +1,6 @@ # YAML HWM Store { #DBR-onetl-hwm-store-yaml-hwm-store } - From 168cc3126e11b34ca501e94d2a526ec73a64e045 Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: Tue, 7 Apr 2026 17:41:15 +0300 Subject: [PATCH 14/28] remove json and dict methods from docstring include in result of file_uploader doc --- mddocs/docs/file/file_uploader/result.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/mddocs/docs/file/file_uploader/result.md b/mddocs/docs/file/file_uploader/result.md index 59933d8e7..cb8f07302 100644 --- a/mddocs/docs/file/file_uploader/result.md +++ b/mddocs/docs/file/file_uploader/result.md @@ -35,5 +35,3 @@ - raise_if_contains_zero_size - details - summary - - dict - - json From 0f768cc3a7a8ca8304158672dd7f6669ffa17425 Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: 
Tue, 7 Apr 2026 17:48:21 +0300 Subject: [PATCH 15/28] fix members in options of file_uploader docs --- mddocs/docs/file/file_uploader/options.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mddocs/docs/file/file_uploader/options.md b/mddocs/docs/file/file_uploader/options.md index 56ae6d754..6ccf57f4c 100644 --- a/mddocs/docs/file/file_uploader/options.md +++ b/mddocs/docs/file/file_uploader/options.md @@ -16,5 +16,6 @@ ::: onetl.file.file_uploader.options.FileUploaderOptions options: members: - - source_dir - - target_dir + - if_exists + - delete_local + - workers From 2c26e33bbae3a76027e0f61aaf93ba731dceb7e8 Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: Tue, 7 Apr 2026 18:05:05 +0300 Subject: [PATCH 16/28] fix markup in iceberg catalog docs --- .../connection/db_connection/iceberg/catalog_rest.md | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md b/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md index c410962d0..71d2904ff 100644 --- a/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md +++ b/mddocs/docs/connection/db_connection/iceberg/catalog_rest.md @@ -5,13 +5,9 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} - ## Authentication { #DBR-onetl-connection-db-connection-iceberg-catalog-rest-authentication } - +- [Basic Authentication][DBR-onetl-connection-db-connection-iceberg-auth-basic-basic-authentication] +- [Bearer Token Authentication][DBR-onetl-connection-db-connection-iceberg-auth-bearer-bearer-token-authentication] +- [OAuth2 Client Credentials Flow][DBR-onetl-connection-db-connection-iceberg-auth-oauth2-client-credentials-oauth2-client-credentials-flow] From 8f816ebd6b070d93927f6c80077f3a818cb7886d Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: Tue, 7 Apr 2026 18:13:20 +0300 Subject: [PATCH 17/28] remove plantuml 
diagrams --- .../db_connection/greenplum/execute.md | 62 +++++----------- .../db_connection/greenplum/read.md | 70 ------------------ .../db_connection/greenplum/write.md | 74 ------------------- mddocs/mkdocs.yml | 6 +- 4 files changed, 20 insertions(+), 192 deletions(-) diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md index 087d22cf9..62d43c10b 100644 --- a/mddocs/docs/connection/db_connection/greenplum/execute.md +++ b/mddocs/docs/connection/db_connection/greenplum/execute.md @@ -99,59 +99,31 @@ The only port used while interacting with Greenplum in this case is `5432` (Gree ??? note "Spark <-> Greenplum interaction during Greenplum.execute()/Greenplum.fetch()" - ```plantuml - @startuml - title Greenplum master <-> Spark driver - box Spark - participant "Spark driver" - end box - - box "Greenplum" - participant "Greenplum master" - end box - - == Greenplum.check() == - - activate "Spark driver" - "Spark driver" -> "Greenplum master" ++ : CONNECT - - == Greenplum.execute(statement) == - "Spark driver" --> "Greenplum master" : EXECUTE statement - "Greenplum master" -> "Spark driver" : RETURN result - - == Greenplum.close() == - "Spark driver" --> "Greenplum master" : CLOSE CONNECTION - - deactivate "Greenplum master" - deactivate "Spark driver" - @enduml - ``` - ```mermaid - --- - title: Greenplum master <—> Spark driver - --- + --- + title: Greenplum master <—> Spark driver + --- - sequenceDiagram - box Spark - participant A as Spark driver - end - box Greenplum - participant B as Greenplum master - end + sequenceDiagram + box Spark + participant A as Spark driver + end + box Greenplum + participant B as Greenplum master + end - Note over A,B: == Greenplum.check() == + Note over A,B: == Greenplum.check() == - A->>B: CONNECT + A->>B: CONNECT - Note over A,B: == Greenplum.execute(statement) == + Note over A,B: == Greenplum.execute(statement) == - A-->>B: EXECUTE statement - B-->> A: 
RETURN result + A-->>B: EXECUTE statement + B-->> A: RETURN result - Note over A,B: == Greenplum.close() == + Note over A,B: == Greenplum.close() == - A ->> B: CLOSE CONNECTION + A ->> B: CLOSE CONNECTION ``` ## Options { #DBR-onetl-connection-db-connection-greenplum-execute-options } diff --git a/mddocs/docs/connection/db_connection/greenplum/read.md b/mddocs/docs/connection/db_connection/greenplum/read.md index 231dd5ee5..4c77f867f 100644 --- a/mddocs/docs/connection/db_connection/greenplum/read.md +++ b/mddocs/docs/connection/db_connection/greenplum/read.md @@ -106,76 +106,6 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection ??? note "Spark <-> Greenplum interaction during DBReader.run()" - ```plantuml - @startuml - title Greenplum master <-> Spark driver - box "Spark" - participant "Spark driver" - participant "Spark executor1" - participant "Spark executor2" - participant "Spark executorN" - end box - - box "Greenplum" - participant "Greenplum master" - participant "Greenplum segment1" - participant "Greenplum segment2" - participant "Greenplum segmentN" - end box - - == Greenplum.check() == - - activate "Spark driver" - "Spark driver" -> "Greenplum master" ++ : CONNECT - - "Spark driver" --> "Greenplum master" : CHECK IF TABLE EXISTS gp_table - "Greenplum master" --> "Spark driver" : TABLE EXISTS - "Spark driver" -> "Greenplum master" : SHOW SCHEMA FOR gp_table - "Greenplum master" --> "Spark driver" : (id bigint, col1 int, col2 text, ...) - - == DBReader.run() == - - "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 - "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 - "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) 
PARTITION N - - note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV - "Spark executor1" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO EXTERNAL TABLE spark_executor1 FROM gp_table WHERE gp_segment_id = 1 - note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master - "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM gp_table_data_on_segment1 TO spark_executor1 - note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark driver.\nData transfer is always initiated by Greenplum segments. - - "Spark executor2" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) USING address=executor2_host:executor2_port;\nINSERT INTO EXTERNAL TABLE spark_executor2 FROM gp_table WHERE gp_segment_id = 2 - "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM gp_table_data_on_segment2 TO spark_executor2 - - "Spark executorN" -> "Greenplum master" ++ : CREATE WRITABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) 
USING address=executorN_host:executorN_port;\nINSERT INTO EXTERNAL TABLE spark_executorN FROM gp_table WHERE gp_segment_id = N - "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM gp_table_data_on_segmentN TO spark_executorN - - "Greenplum segment1" ->o "Spark executor1" -- : INITIALIZE CONNECTION TO Spark executor1\nPUSH DATA TO Spark executor1 - note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor - - "Greenplum segment2" ->o "Spark executor2" -- : INITIALIZE CONNECTION TO Spark executor2\nPUSH DATA TO Spark executor2 - "Greenplum segmentN" ->o "Spark executorN" -- : INITIALIZE CONNECTION TO Spark executorN\nPUSH DATA TO Spark executorN - - == Spark.stop() == - - "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 - deactivate "Greenplum master" - "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 - deactivate "Greenplum master" - "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN - deactivate "Greenplum master" - - "Spark executor1" --> "Spark driver" -- : DONE - "Spark executor2" --> "Spark driver" -- : DONE - "Spark executorN" --> "Spark driver" -- : DONE - - "Spark driver" --> "Greenplum master" : CLOSE CONNECTION - deactivate "Greenplum master" - deactivate "Spark driver" - @enduml - ``` - ```mermaid --- title: Greenplum master <-> Spark driver diff --git a/mddocs/docs/connection/db_connection/greenplum/write.md b/mddocs/docs/connection/db_connection/greenplum/write.md index 068b05b4b..a090447a0 100644 --- a/mddocs/docs/connection/db_connection/greenplum/write.md +++ b/mddocs/docs/connection/db_connection/greenplum/write.md @@ -43,80 +43,6 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection ??? 
note "Spark <-> Greenplum interaction during DBWriter.run()" - ```plantuml - @startuml - title Greenplum master <-> Spark driver - box "Spark" - participant "Spark driver" - participant "Spark executor1" - participant "Spark executor2" - participant "Spark executorN" - end box - - box "Greenplum" - participant "Greenplum master" - participant "Greenplum segment1" - participant "Greenplum segment2" - participant "Greenplum segmentN" - end box - - == Greenplum.check() == - - activate "Spark driver" - "Spark driver" -> "Greenplum master" ++ : CONNECT - "Spark driver" --> "Greenplum master" ++ : CHECK IF TABLE EXISTS gp_table - "Greenplum master" --> "Spark driver" : TABLE NOT EXISTS - - == DBWriter.run(df) == - - "Spark driver" -> "Spark executor1" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 1 - "Spark driver" -> "Spark executor2" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION 2 - "Spark driver" -> "Spark executorN" ++ : START EXECUTOR FOR df(id bigint, col1 int, col2 text, ...) PARTITION N - - note right of "Spark driver" : This is done in parallel,\nexecutors are independent\n|\n|\n|\nV - "Spark executor1" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor1 (id bigint, col1 int, col2 text, ...) USING address=executor1_host:executor1_port;\nINSERT INTO gp_table FROM spark_executor1 - note right of "Greenplum master" : Each white vertical line here is a opened connection to master.\nUsually, **N+1** connections are created from Spark to Greenplum master - "Greenplum master" --> "Greenplum segment1" ++ : SELECT DATA FROM spark_executor1 TO gp_table_data_on_segment1 - note right of "Greenplum segment1" : No direct requests between Greenplum segments & Spark.\nData transfer is always initiated by Greenplum segments. - - "Spark executor2" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executor2 (id bigint, col1 int, col2 text, ...) 
USING address=executor2_host:executor2_port;\nINSERT INTO gp_table FROM spark_executor2 - "Greenplum master" --> "Greenplum segment2" ++ : SELECT DATA FROM spark_executor2 TO gp_table_data_on_segment2 - - "Spark executorN" -> "Greenplum master" ++ : CREATE READABLE EXTERNAL TABLE spark_executorN (id bigint, col1 int, col2 text, ...) USING address=executorN_host:executorN_port;\nINSERT INTO gp_table FROM spark_executorN - "Greenplum master" --> "Greenplum segmentN" ++ : SELECT DATA FROM spark_executorN TO gp_table_data_on_segmentN - - "Greenplum segment1" -->o "Spark executor1" : INITIALIZE CONNECTION TO Spark executor1 - "Spark executor1" -> "Greenplum segment1" : READ DATA FROM Spark executor1 - note left of "Spark executor1" : Circle is an open GPFDIST port,\nlistened by executor - deactivate "Greenplum segment1" - - "Greenplum segment2" -->o "Spark executor2" : INITIALIZE CONNECTION TO Spark executor2 - "Spark executor2" -> "Greenplum segment2" : READ DATA FROM Spark executor2 - deactivate "Greenplum segment2" - - "Greenplum segmentN" -->o "Spark executorN" : INITIALIZE CONNECTION TO Spark executorN - "Spark executorN" -> "Greenplum segmentN" : READ DATA FROM Spark executorN - deactivate "Greenplum segmentN" - - == Finished == - - "Spark executor1" --> "Greenplum master" : DROP TABLE spark_executor1 - deactivate "Greenplum master" - "Spark executor2" --> "Greenplum master" : DROP TABLE spark_executor2 - deactivate "Greenplum master" - "Spark executorN" --> "Greenplum master" : DROP TABLE spark_executorN - deactivate "Greenplum master" - - "Spark executor1" --> "Spark driver" -- : DONE - "Spark executor2" --> "Spark driver" -- : DONE - "Spark executorN" --> "Spark driver" -- : DONE - - "Spark driver" --> "Greenplum master" : CLOSE CONNECTION - deactivate "Greenplum master" - deactivate "Spark driver" - @enduml - ``` - ```mermaid --- title: Greenplum master <-> Spark driver diff --git a/mddocs/mkdocs.yml b/mddocs/mkdocs.yml index 19a99fb3b..720e0bf04 100644 --- 
a/mddocs/mkdocs.yml +++ b/mddocs/mkdocs.yml @@ -60,9 +60,9 @@ plugins: # - griffe_sphinx # - griffe_pydantic: {schema: false} - macros - - plantuml: - puml_url: https://www.plantuml.com/plantuml/ - puml_keyword: plantuml + # - plantuml: + # puml_url: https://www.plantuml.com/plantuml/ + # puml_keyword: plantuml # - i18n: # docs_structure: folder # languages: From edaf597a4220b34d022aa0dd4eec92193c5ec540 Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: Tue, 7 Apr 2026 18:18:26 +0300 Subject: [PATCH 18/28] remove json & dict methods from docs --- mddocs/docs/file/file_downloader/result.md | 2 -- mddocs/docs/file/file_mover/result.md | 2 -- 2 files changed, 4 deletions(-) diff --git a/mddocs/docs/file/file_downloader/result.md b/mddocs/docs/file/file_downloader/result.md index de660371f..42d2b45d3 100644 --- a/mddocs/docs/file/file_downloader/result.md +++ b/mddocs/docs/file/file_downloader/result.md @@ -36,5 +36,3 @@ - raise_if_contains_zero_size - details - summary - - dict - - json diff --git a/mddocs/docs/file/file_mover/result.md b/mddocs/docs/file/file_mover/result.md index caae35556..0b7a667dc 100644 --- a/mddocs/docs/file/file_mover/result.md +++ b/mddocs/docs/file/file_mover/result.md @@ -35,5 +35,3 @@ - raise_if_contains_zero_size - details - summary - - dict - - json From c21684cdd4a2115a2857540dc096019c436233b2 Mon Sep 17 00:00:00 2001 From: Sattar Gyulmamedov Date: Tue, 7 Apr 2026 18:30:31 +0300 Subject: [PATCH 19/28] remove griffe options --- mddocs/docs/connection/db_connection/kafka/auth.md | 4 ---- mddocs/docs/connection/db_connection/kafka/basic_auth.md | 3 --- mddocs/docs/connection/db_connection/kafka/connection.md | 4 ---- mddocs/docs/connection/db_connection/kafka/kerberos_auth.md | 3 --- .../docs/connection/db_connection/kafka/plaintext_protocol.md | 3 --- mddocs/docs/connection/db_connection/kafka/protocol.md | 4 ---- mddocs/docs/connection/db_connection/kafka/read.md | 3 --- mddocs/docs/connection/db_connection/kafka/scram_auth.md | 3 
--- mddocs/docs/connection/db_connection/kafka/slots.md | 4 ---- mddocs/docs/connection/db_connection/kafka/ssl_protocol.md | 3 --- mddocs/docs/connection/db_connection/kafka/write.md | 3 --- 11 files changed, 37 deletions(-) diff --git a/mddocs/docs/connection/db_connection/kafka/auth.md b/mddocs/docs/connection/db_connection/kafka/auth.md index 412ef14b5..2d08e1a88 100644 --- a/mddocs/docs/connection/db_connection/kafka/auth.md +++ b/mddocs/docs/connection/db_connection/kafka/auth.md @@ -13,7 +13,3 @@ --> ::: onetl.connection.db_connection.kafka.kafka_auth.KafkaAuth - options: - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/basic_auth.md b/mddocs/docs/connection/db_connection/kafka/basic_auth.md index 10d9c0c28..d1350be95 100644 --- a/mddocs/docs/connection/db_connection/kafka/basic_auth.md +++ b/mddocs/docs/connection/db_connection/kafka/basic_auth.md @@ -18,6 +18,3 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/connection.md b/mddocs/docs/connection/db_connection/kafka/connection.md index 854dd1c02..c6d2f21fa 100644 --- a/mddocs/docs/connection/db_connection/kafka/connection.md +++ b/mddocs/docs/connection/db_connection/kafka/connection.md @@ -12,7 +12,3 @@ --> ::: onetl.connection.db_connection.kafka.connection.Kafka - options: - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md index 7d4ebc3be..ad33b5f5d 100644 --- a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md +++ b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md @@ -18,6 +18,3 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} 
diff --git a/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md index 3eb33ce56..eb6a67e98 100644 --- a/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md @@ -18,6 +18,3 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/protocol.md b/mddocs/docs/connection/db_connection/kafka/protocol.md index c08c1cc70..08e0bac95 100644 --- a/mddocs/docs/connection/db_connection/kafka/protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/protocol.md @@ -13,7 +13,3 @@ --> ::: onetl.connection.db_connection.kafka.kafka_protocol.KafkaProtocol - options: - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/read.md b/mddocs/docs/connection/db_connection/kafka/read.md index 98b484b9d..ce1701301 100644 --- a/mddocs/docs/connection/db_connection/kafka/read.md +++ b/mddocs/docs/connection/db_connection/kafka/read.md @@ -140,6 +140,3 @@ deserialized_df = read_df.select( inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/scram_auth.md b/mddocs/docs/connection/db_connection/kafka/scram_auth.md index a3255bc2d..0539ee95e 100644 --- a/mddocs/docs/connection/db_connection/kafka/scram_auth.md +++ b/mddocs/docs/connection/db_connection/kafka/scram_auth.md @@ -19,6 +19,3 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/slots.md b/mddocs/docs/connection/db_connection/kafka/slots.md index a87a76112..d7c2f2313 100644 --- 
a/mddocs/docs/connection/db_connection/kafka/slots.md +++ b/mddocs/docs/connection/db_connection/kafka/slots.md @@ -13,7 +13,3 @@ --> ::: onetl.connection.db_connection.kafka.slots.KafkaSlots - options: - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md index 6f7e0c9f0..8203401a4 100644 --- a/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md @@ -18,6 +18,3 @@ inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} diff --git a/mddocs/docs/connection/db_connection/kafka/write.md b/mddocs/docs/connection/db_connection/kafka/write.md index e95ad346d..883e92a5d 100644 --- a/mddocs/docs/connection/db_connection/kafka/write.md +++ b/mddocs/docs/connection/db_connection/kafka/write.md @@ -80,6 +80,3 @@ writer.run(write_df) inherited_members: true heading_level: 3 show_root_heading: true - extensions: - - griffe_sphinx - - griffe_pydantic: {schema: false} From 9b918f0e3c89d5c96b7ee4044e6706e1d9cb7109 Mon Sep 17 00:00:00 2001 From: sga Date: Tue, 7 Apr 2026 20:27:59 +0300 Subject: [PATCH 20/28] remove rst anchors --- mddocs/docs/quickstart.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mddocs/docs/quickstart.md b/mddocs/docs/quickstart.md index 0eeb1fb52..a8b4ab1ab 100644 --- a/mddocs/docs/quickstart.md +++ b/mddocs/docs/quickstart.md @@ -17,7 +17,7 @@ docs/include_0.md ----8<---- - + ## Documentation { #DBR-onetl-quickstart-documentation } @@ -25,11 +25,11 @@ See at [ReadTheDocs](https://onetl.readthedocs.io/en/latest/) ## How to install { #DBR-onetl-quickstart-how-to-install } - + ### Minimal installation { #DBR-onetl-quickstart-minimal-installation } - + Base `onetl` package contains: @@ -54,13 +54,13 @@ pip install onetl ### 
With DB and FileDF connections { #DBR-onetl-quickstart-with-db-and-filedf-connections } - + All DB connection classes (`Clickhouse`, `Greenplum`, `Hive` and others) and all FileDF connection classes (`SparkHDFS`, `SparkLocalFS`, `SparkS3`) require Spark to be installed. - + Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: @@ -70,7 +70,7 @@ dnf install java-11-openjdk-devel # CentOS 8 | Spark 3 apt-get install openjdk-11-jdk # Debian-based | Spark 3 ``` - + #### Compatibility matrix { #DBR-onetl-quickstart-compatibility-matrix } @@ -83,7 +83,7 @@ apt-get install openjdk-11-jdk # Debian-based | Spark 3 | [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | | [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | - + Then you should install PySpark via passing `spark` to `extras`: @@ -102,7 +102,7 @@ or inject PySpark to `sys.path` in some other way BEFORE creating a class instan ### With File connections { #DBR-onetl-quickstart-with-file-connections } - + All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on) requires specific Python clients to be installed. @@ -123,7 +123,7 @@ pip install onetl[files] ### With Kerberos support { #DBR-onetl-quickstart-with-kerberos-support } - + Most of Hadoop instances set up with Kerberos support, so some connections require additional setup to work properly. @@ -156,7 +156,7 @@ pip install onetl[kerberos] ### Full bundle { #DBR-onetl-quickstart-full-bundle } - + To install all connectors and dependencies, you can pass `all` into `extras`: @@ -171,7 +171,7 @@ pip install onetl[spark,files,kerberos] This method consumes a lot of disk space, and requires for Java & Kerberos libraries to be installed into your OS. 
- + ## Quick start { #DBR-onetl-quickstart-quick-start } From d71a8ab0dff414d04051d867bb75991af4a7170d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 17:28:20 +0000 Subject: [PATCH 21/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mddocs/docs/quickstart.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mddocs/docs/quickstart.md b/mddocs/docs/quickstart.md index a8b4ab1ab..a65a8c2d7 100644 --- a/mddocs/docs/quickstart.md +++ b/mddocs/docs/quickstart.md @@ -17,7 +17,7 @@ docs/include_0.md ----8<---- - + ## Documentation { #DBR-onetl-quickstart-documentation } From 6f2d87a4d8353a94d9a9e26e010ae1ee892fc737 Mon Sep 17 00:00:00 2001 From: sga Date: Tue, 7 Apr 2026 22:21:46 +0300 Subject: [PATCH 22/28] remove comments with rst-code fragments --- .../db_connection/clickhouse/connection.md | 10 ----- .../db_connection/clickhouse/execute.md | 22 ----------- .../db_connection/clickhouse/read.md | 13 ------- .../db_connection/clickhouse/sql.md | 13 ------- .../db_connection/clickhouse/write.md | 13 ------- .../db_connection/greenplum/connection.md | 10 ----- .../db_connection/greenplum/execute.md | 21 ---------- .../db_connection/greenplum/read.md | 12 ------ .../db_connection/greenplum/write.md | 12 ------ .../db_connection/hive/connection.md | 11 ------ .../connection/db_connection/hive/execute.md | 9 ----- .../connection/db_connection/hive/slots.md | 11 ------ .../docs/connection/db_connection/hive/sql.md | 9 ----- .../connection/db_connection/hive/write.md | 12 ------ .../connection/db_connection/kafka/auth.md | 11 ------ .../db_connection/kafka/basic_auth.md | 12 ------ .../db_connection/kafka/connection.md | 10 ----- .../db_connection/kafka/kerberos_auth.md | 12 ------ .../db_connection/kafka/plaintext_protocol.md | 12 ------ .../db_connection/kafka/protocol.md | 11 ------ 
.../connection/db_connection/kafka/read.md | 12 ------ .../db_connection/kafka/scram_auth.md | 13 ------- .../connection/db_connection/kafka/slots.md | 11 ------ .../db_connection/kafka/ssl_protocol.md | 12 ------ .../connection/db_connection/kafka/write.md | 12 ------ .../db_connection/mongodb/connection.md | 11 ------ .../db_connection/mongodb/pipeline.md | 20 ---------- .../connection/db_connection/mongodb/read.md | 12 ------ .../connection/db_connection/mongodb/write.md | 14 ------- .../db_connection/mssql/connection.md | 10 ----- .../connection/db_connection/mssql/execute.md | 21 ---------- .../connection/db_connection/mssql/read.md | 13 ------- .../connection/db_connection/mssql/sql.md | 13 ------- .../connection/db_connection/mssql/write.md | 13 ------- .../db_connection/mysql/connection.md | 10 ----- .../connection/db_connection/mysql/execute.md | 18 --------- .../connection/db_connection/mysql/read.md | 11 ------ .../connection/db_connection/mysql/sql.md | 13 ------- .../connection/db_connection/mysql/write.md | 13 ------- .../db_connection/oracle/connection.md | 10 ----- .../db_connection/oracle/execute.md | 18 --------- .../connection/db_connection/oracle/read.md | 11 ------ .../connection/db_connection/oracle/sql.md | 13 ------- .../connection/db_connection/oracle/write.md | 13 ------- .../db_connection/postgres/connection.md | 10 ----- .../db_connection/postgres/execute.md | 18 --------- .../connection/db_connection/postgres/read.md | 11 ------ .../connection/db_connection/postgres/sql.md | 13 ------- .../db_connection/postgres/write.md | 13 ------- mddocs/docs/connection/file_connection/ftp.md | 10 ----- .../docs/connection/file_connection/ftps.md | 10 ----- .../file_connection/hdfs/connection.md | 10 ----- .../connection/file_connection/hdfs/slots.md | 11 ------ mddocs/docs/connection/file_connection/s3.md | 10 ----- .../docs/connection/file_connection/samba.md | 10 ----- .../docs/connection/file_connection/sftp.md | 10 ----- 
.../docs/connection/file_connection/webdav.md | 10 ----- .../connection/file_df_connection/base.md | 10 ----- .../spark_hdfs/connection.md | 10 ----- .../file_df_connection/spark_hdfs/slots.md | 11 ------ .../file_df_connection/spark_local_fs.md | 10 ----- .../file_df_connection/spark_s3/connection.md | 10 ----- mddocs/docs/db/reader.md | 10 ----- mddocs/docs/db/writer.md | 17 --------- .../file/file_downloader/file_downloader.md | 19 ---------- mddocs/docs/file/file_downloader/result.md | 11 ------ mddocs/docs/file/file_filters/base.md | 17 --------- mddocs/docs/file/file_filters/exclude_dir.md | 10 ----- mddocs/docs/file/file_filters/file_filter.md | 10 ----- .../file/file_filters/file_mtime_filter.md | 10 ----- .../file/file_filters/file_size_filter.md | 10 ----- mddocs/docs/file/file_filters/glob.md | 10 ----- .../file/file_filters/match_all_filters.md | 9 ----- mddocs/docs/file/file_filters/regexp.md | 11 ------ mddocs/docs/file/file_limits/base.md | 19 ---------- mddocs/docs/file/file_limits/file_limit.md | 10 ----- .../docs/file/file_limits/limits_reached.md | 9 ----- .../docs/file/file_limits/limits_stop_at.md | 9 ----- .../docs/file/file_limits/max_files_count.md | 10 ----- mddocs/docs/file/file_limits/reset_limits.md | 9 ----- .../docs/file/file_limits/total_files_size.md | 10 ----- mddocs/docs/file/file_mover/file_mover.md | 19 ---------- mddocs/docs/file/file_mover/options.md | 12 ------ mddocs/docs/file/file_mover/result.md | 10 ----- .../docs/file/file_uploader/file_uploader.md | 19 ---------- mddocs/docs/file/file_uploader/options.md | 12 ------ mddocs/docs/file/file_uploader/result.md | 10 ----- .../file_df/file_df_reader/file_df_reader.md | 11 ------ mddocs/docs/file_df/file_df_reader/options.md | 11 ------ .../file_df/file_df_writer/file_df_writer.md | 11 ------ mddocs/docs/file_df/file_df_writer/options.md | 11 ------ mddocs/docs/file_df/file_formats/avro.md | 11 ------ mddocs/docs/file_df/file_formats/base.md | 17 --------- 
mddocs/docs/file_df/file_formats/csv.md | 11 ------ mddocs/docs/file_df/file_formats/excel.md | 11 ------ mddocs/docs/file_df/file_formats/json.md | 11 ------ mddocs/docs/file_df/file_formats/jsonline.md | 11 ------ mddocs/docs/file_df/file_formats/orc.md | 11 ------ mddocs/docs/file_df/file_formats/parquet.md | 11 ------ mddocs/docs/file_df/file_formats/xml.md | 11 ------ mddocs/docs/hooks/global_state.md | 25 ------------ mddocs/docs/hooks/hook.md | 32 ---------------- mddocs/docs/hooks/slot.md | 24 ------------ mddocs/docs/hooks/support_hooks.md | 38 ------------------- mddocs/docs/hwm_store/yaml_hwm_store.md | 10 ----- mddocs/docs/index.md | 2 +- mddocs/docs/install/spark.md | 21 ---------- mddocs/docs/quickstart.md | 2 +- .../strategy/incremental_batch_strategy.md | 10 ----- mddocs/docs/strategy/incremental_strategy.md | 10 ----- .../docs/strategy/snapshot_batch_strategy.md | 10 ----- mddocs/docs/strategy/snapshot_strategy.md | 10 ----- 112 files changed, 2 insertions(+), 1403 deletions(-) diff --git a/mddocs/docs/connection/db_connection/clickhouse/connection.md b/mddocs/docs/connection/db_connection/clickhouse/connection.md index 74afb2e4c..225b088a4 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/connection.md +++ b/mddocs/docs/connection/db_connection/clickhouse/connection.md @@ -1,15 +1,5 @@ # Clickhouse connection { #DBR-onetl-connection-db-connection-clickhouse-connection-0 } - ::: onetl.connection.db_connection.clickhouse.connection.Clickhouse options: diff --git a/mddocs/docs/connection/db_connection/clickhouse/execute.md b/mddocs/docs/connection/db_connection/clickhouse/execute.md index 77d18a756..08ef695f3 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/execute.md +++ b/mddocs/docs/connection/db_connection/clickhouse/execute.md @@ -97,28 +97,6 @@ So it should **NOT** be used to read large amounts of data. 
Use [DBReader][DBR-o ## Options { #DBR-onetl-connection-db-connection-clickhouse-execute-options } - ::: onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/clickhouse/read.md b/mddocs/docs/connection/db_connection/clickhouse/read.md index dc35fcb09..ad82d8453 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/read.md +++ b/mddocs/docs/connection/db_connection/clickhouse/read.md @@ -77,19 +77,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-clickhouse-read-options } - ::: onetl.connection.db_connection.clickhouse.options.ClickhouseReadOptions options: diff --git a/mddocs/docs/connection/db_connection/clickhouse/sql.md b/mddocs/docs/connection/db_connection/clickhouse/sql.md index fc1659f14..c893b5fed 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/sql.md +++ b/mddocs/docs/connection/db_connection/clickhouse/sql.md @@ -61,19 +61,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-clickhouse-sql-options } - ::: onetl.connection.db_connection.clickhouse.options.ClickhouseSQLOptions options: diff --git a/mddocs/docs/connection/db_connection/clickhouse/write.md b/mddocs/docs/connection/db_connection/clickhouse/write.md index 6d5938b41..7a86f9b90 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/write.md +++ b/mddocs/docs/connection/db_connection/clickhouse/write.md @@ -42,19 +42,6 @@ writer.run(df) Method above accepts [Clickhouse.WriteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions] - ::: onetl.connection.db_connection.clickhouse.options.ClickhouseWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/greenplum/connection.md b/mddocs/docs/connection/db_connection/greenplum/connection.md index 94c5bec79..8f6af6b43 100644 --- 
a/mddocs/docs/connection/db_connection/greenplum/connection.md +++ b/mddocs/docs/connection/db_connection/greenplum/connection.md @@ -1,15 +1,5 @@ # Greenplum connection { #DBR-onetl-connection-db-connection-greenplum-connection-0 } - ::: onetl.connection.db_connection.greenplum.connection.Greenplum options: diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md index 62d43c10b..dba466228 100644 --- a/mddocs/docs/connection/db_connection/greenplum/execute.md +++ b/mddocs/docs/connection/db_connection/greenplum/execute.md @@ -128,27 +128,6 @@ The only port used while interacting with Greenplum in this case is `5432` (Gree ## Options { #DBR-onetl-connection-db-connection-greenplum-execute-options } - ::: onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/greenplum/read.md b/mddocs/docs/connection/db_connection/greenplum/read.md index 4c77f867f..70ec1f7be 100644 --- a/mddocs/docs/connection/db_connection/greenplum/read.md +++ b/mddocs/docs/connection/db_connection/greenplum/read.md @@ -352,18 +352,6 @@ You should use [UNLOGGED](https://docs.vmware.com/en/VMware-Greenplum/7/greenplu ## Options { #DBR-onetl-connection-db-connection-greenplum-read-options } - ::: onetl.connection.db_connection.greenplum.options.GreenplumReadOptions options: diff --git a/mddocs/docs/connection/db_connection/greenplum/write.md b/mddocs/docs/connection/db_connection/greenplum/write.md index a090447a0..a59992e57 100644 --- a/mddocs/docs/connection/db_connection/greenplum/write.md +++ b/mddocs/docs/connection/db_connection/greenplum/write.md @@ -136,18 +136,6 @@ High-level schema is described in [Greenplum prerequisites][DBR-onetl-connection ## Options { #DBR-onetl-connection-db-connection-greenplum-write-options } - ::: onetl.connection.db_connection.greenplum.options.GreenplumWriteOptions options: diff --git 
a/mddocs/docs/connection/db_connection/hive/connection.md b/mddocs/docs/connection/db_connection/hive/connection.md index d858abb29..38fc9233c 100644 --- a/mddocs/docs/connection/db_connection/hive/connection.md +++ b/mddocs/docs/connection/db_connection/hive/connection.md @@ -1,16 +1,5 @@ # Hive Connection { #DBR-onetl-connection-db-connection-hive-connection-0 } - ::: onetl.connection.db_connection.hive.connection.Hive options: diff --git a/mddocs/docs/connection/db_connection/hive/execute.md b/mddocs/docs/connection/db_connection/hive/execute.md index 34d773ce4..e924201f8 100644 --- a/mddocs/docs/connection/db_connection/hive/execute.md +++ b/mddocs/docs/connection/db_connection/hive/execute.md @@ -42,15 +42,6 @@ This method supports **any** query syntax supported by Hive, like: ### Details { #DBR-onetl-connection-db-connection-hive-execute-details } - ::: onetl.connection.db_connection.hive.connection.Hive.execute options: diff --git a/mddocs/docs/connection/db_connection/hive/slots.md b/mddocs/docs/connection/db_connection/hive/slots.md index c10913d13..57ab47f69 100644 --- a/mddocs/docs/connection/db_connection/hive/slots.md +++ b/mddocs/docs/connection/db_connection/hive/slots.md @@ -1,16 +1,5 @@ # Hive Slots { #DBR-onetl-connection-db-connection-hive-slots } - ::: onetl.connection.db_connection.hive.slots.HiveSlots options: diff --git a/mddocs/docs/connection/db_connection/hive/sql.md b/mddocs/docs/connection/db_connection/hive/sql.md index 97f55c68a..296574345 100644 --- a/mddocs/docs/connection/db_connection/hive/sql.md +++ b/mddocs/docs/connection/db_connection/hive/sql.md @@ -65,15 +65,6 @@ Supported operators are: `=`, `>`, `<` and `BETWEEN`, and only against some **st ## Details { #DBR-onetl-connection-db-connection-hive-sql-details } - ::: onetl.connection.db_connection.hive.connection.Hive.sql options: diff --git a/mddocs/docs/connection/db_connection/hive/write.md b/mddocs/docs/connection/db_connection/hive/write.md index 9904be826..5772037bd 
100644 --- a/mddocs/docs/connection/db_connection/hive/write.md +++ b/mddocs/docs/connection/db_connection/hive/write.md @@ -166,18 +166,6 @@ then `sort_columns` should start with `repartition_columns` or be equal to it. ## Options { #DBR-onetl-connection-db-connection-hive-write-options } - ::: onetl.connection.db_connection.hive.options.HiveWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/kafka/auth.md b/mddocs/docs/connection/db_connection/kafka/auth.md index 2d08e1a88..a330fecaa 100644 --- a/mddocs/docs/connection/db_connection/kafka/auth.md +++ b/mddocs/docs/connection/db_connection/kafka/auth.md @@ -1,15 +1,4 @@ # Kafka Auth { #DBR-onetl-connection-db-connection-kafka-auth } - ::: onetl.connection.db_connection.kafka.kafka_auth.KafkaAuth diff --git a/mddocs/docs/connection/db_connection/kafka/basic_auth.md b/mddocs/docs/connection/db_connection/kafka/basic_auth.md index d1350be95..f7e22a00f 100644 --- a/mddocs/docs/connection/db_connection/kafka/basic_auth.md +++ b/mddocs/docs/connection/db_connection/kafka/basic_auth.md @@ -1,17 +1,5 @@ # Kafka BasicAuth { #DBR-onetl-connection-db-connection-kafka-basic-auth-kafka-basicauth } - ::: onetl.connection.db_connection.kafka.kafka_basic_auth.KafkaBasicAuth options: diff --git a/mddocs/docs/connection/db_connection/kafka/connection.md b/mddocs/docs/connection/db_connection/kafka/connection.md index c6d2f21fa..681aa39ab 100644 --- a/mddocs/docs/connection/db_connection/kafka/connection.md +++ b/mddocs/docs/connection/db_connection/kafka/connection.md @@ -1,14 +1,4 @@ # Kafka Connection { #DBR-onetl-connection-db-connection-kafka-connection-0 } - ::: onetl.connection.db_connection.kafka.connection.Kafka diff --git a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md index ad33b5f5d..92b8e72f0 100644 --- a/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md +++ 
b/mddocs/docs/connection/db_connection/kafka/kerberos_auth.md @@ -1,17 +1,5 @@ # Kafka KerberosAuth { #DBR-onetl-connection-db-connection-kafka-kerberos-auth-kafka-kerberosauth } - ::: onetl.connection.db_connection.kafka.kafka_kerberos_auth.KafkaKerberosAuth options: diff --git a/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md index eb6a67e98..f8caab1cd 100644 --- a/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/plaintext_protocol.md @@ -1,17 +1,5 @@ # Kafka PlaintextProtocol { #DBR-onetl-connection-db-connection-kafka-plaintext-protocol-kafka-plaintextprotocol } - ::: onetl.connection.db_connection.kafka.kafka_plaintext_protocol.KafkaPlaintextProtocol options: diff --git a/mddocs/docs/connection/db_connection/kafka/protocol.md b/mddocs/docs/connection/db_connection/kafka/protocol.md index 08e0bac95..a0cc0f8fe 100644 --- a/mddocs/docs/connection/db_connection/kafka/protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/protocol.md @@ -1,15 +1,4 @@ # Kafka Protocol { #DBR-onetl-connection-db-connection-kafka-protocol } - ::: onetl.connection.db_connection.kafka.kafka_protocol.KafkaProtocol diff --git a/mddocs/docs/connection/db_connection/kafka/read.md b/mddocs/docs/connection/db_connection/kafka/read.md index ce1701301..6e6aa39f0 100644 --- a/mddocs/docs/connection/db_connection/kafka/read.md +++ b/mddocs/docs/connection/db_connection/kafka/read.md @@ -122,18 +122,6 @@ deserialized_df = read_df.select( ## Options { #DBR-onetl-connection-db-connection-kafka-read-options } - ::: onetl.connection.db_connection.kafka.options.KafkaReadOptions options: diff --git a/mddocs/docs/connection/db_connection/kafka/scram_auth.md b/mddocs/docs/connection/db_connection/kafka/scram_auth.md index 0539ee95e..a7542af3f 100644 --- a/mddocs/docs/connection/db_connection/kafka/scram_auth.md +++ 
b/mddocs/docs/connection/db_connection/kafka/scram_auth.md @@ -1,18 +1,5 @@ # Kafka ScramAuth { #DBR-onetl-connection-db-connection-kafka-scram-auth-kafka-scramauth } - ::: onetl.connection.db_connection.kafka.kafka_scram_auth.KafkaScramAuth options: diff --git a/mddocs/docs/connection/db_connection/kafka/slots.md b/mddocs/docs/connection/db_connection/kafka/slots.md index d7c2f2313..a8c83288e 100644 --- a/mddocs/docs/connection/db_connection/kafka/slots.md +++ b/mddocs/docs/connection/db_connection/kafka/slots.md @@ -1,15 +1,4 @@ # Kafka Slots { #DBR-onetl-connection-db-connection-kafka-slots } - ::: onetl.connection.db_connection.kafka.slots.KafkaSlots diff --git a/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md index 8203401a4..f302dd7b2 100644 --- a/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md +++ b/mddocs/docs/connection/db_connection/kafka/ssl_protocol.md @@ -1,17 +1,5 @@ # Kafka SSLProtocol { #DBR-onetl-connection-db-connection-kafka-ssl-protocol-kafka-sslprotocol } - ::: onetl.connection.db_connection.kafka.kafka_ssl_protocol.KafkaSSLProtocol options: diff --git a/mddocs/docs/connection/db_connection/kafka/write.md b/mddocs/docs/connection/db_connection/kafka/write.md index 883e92a5d..e8b39dc63 100644 --- a/mddocs/docs/connection/db_connection/kafka/write.md +++ b/mddocs/docs/connection/db_connection/kafka/write.md @@ -62,18 +62,6 @@ writer.run(write_df) ## Options { #DBR-onetl-connection-db-connection-kafka-write-options } - ::: onetl.connection.db_connection.kafka.options.KafkaWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/mongodb/connection.md b/mddocs/docs/connection/db_connection/mongodb/connection.md index e02b7e670..252f0fc4d 100644 --- a/mddocs/docs/connection/db_connection/mongodb/connection.md +++ b/mddocs/docs/connection/db_connection/mongodb/connection.md @@ -1,16 +1,5 @@ # MongoDB Connection { 
#DBR-onetl-connection-db-connection-mongodb-connection-0 } - ::: onetl.connection.db_connection.mongodb.connection.MongoDB options: diff --git a/mddocs/docs/connection/db_connection/mongodb/pipeline.md b/mddocs/docs/connection/db_connection/mongodb/pipeline.md index 735813d75..79a6efdc8 100644 --- a/mddocs/docs/connection/db_connection/mongodb/pipeline.md +++ b/mddocs/docs/connection/db_connection/mongodb/pipeline.md @@ -16,26 +16,6 @@ Especially if there are indexes for columns used in `pipeline` value. ## References { #DBR-onetl-connection-db-connection-mongodb-pipeline-references } - ::: onetl.connection.db_connection.mongodb.connection.MongoDB.pipeline options: diff --git a/mddocs/docs/connection/db_connection/mongodb/read.md b/mddocs/docs/connection/db_connection/mongodb/read.md index 38d7698d9..fba78334b 100644 --- a/mddocs/docs/connection/db_connection/mongodb/read.md +++ b/mddocs/docs/connection/db_connection/mongodb/read.md @@ -124,18 +124,6 @@ Especially if there are indexes for columns used in `where` clause. ## Read options { #DBR-onetl-connection-db-connection-mongodb-read-options } - ::: onetl.connection.db_connection.mongodb.options.MongoDBReadOptions options: diff --git a/mddocs/docs/connection/db_connection/mongodb/write.md b/mddocs/docs/connection/db_connection/mongodb/write.md index 267df750e..8bdf8ac10 100644 --- a/mddocs/docs/connection/db_connection/mongodb/write.md +++ b/mddocs/docs/connection/db_connection/mongodb/write.md @@ -31,20 +31,6 @@ For writing data to MongoDB, use [DBWriter][DBR-onetl-db-writer]. 
Method above accepts [MongoDB.WriteOptions][onetl.connection.db_connection.mongodb.options.MongoDBWriteOptions] - ::: onetl.connection.db_connection.mongodb.options.MongoDBWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/mssql/connection.md b/mddocs/docs/connection/db_connection/mssql/connection.md index 7eb54d7af..b2be8a984 100644 --- a/mddocs/docs/connection/db_connection/mssql/connection.md +++ b/mddocs/docs/connection/db_connection/mssql/connection.md @@ -1,15 +1,5 @@ # MSSQL connection { #DBR-onetl-connection-db-connection-mssql-connection-0 } - ::: onetl.connection.db_connection.mssql.connection.MSSQL options: diff --git a/mddocs/docs/connection/db_connection/mssql/execute.md b/mddocs/docs/connection/db_connection/mssql/execute.md index 646e7e2b7..f31a8f930 100644 --- a/mddocs/docs/connection/db_connection/mssql/execute.md +++ b/mddocs/docs/connection/db_connection/mssql/execute.md @@ -89,27 +89,6 @@ This method supports **any** query syntax supported by MSSQL, like: ## Options { #DBR-onetl-connection-db-connection-mssql-execute-options } - ::: onetl.connection.db_connection.mssql.options.MSSQLFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/mssql/read.md b/mddocs/docs/connection/db_connection/mssql/read.md index 82ba086dc..022c00746 100644 --- a/mddocs/docs/connection/db_connection/mssql/read.md +++ b/mddocs/docs/connection/db_connection/mssql/read.md @@ -77,19 +77,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-mssql-read-options } - ::: onetl.connection.db_connection.mssql.options.MSSQLReadOptions options: diff --git a/mddocs/docs/connection/db_connection/mssql/sql.md b/mddocs/docs/connection/db_connection/mssql/sql.md index 973ecd127..9776a59d1 100644 --- a/mddocs/docs/connection/db_connection/mssql/sql.md +++ b/mddocs/docs/connection/db_connection/mssql/sql.md @@ -61,19 +61,6 @@ Especially if there are indexes or partitions 
for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-mssql-sql-options } - ::: onetl.connection.db_connection.mssql.options.MSSQLSQLOptions options: diff --git a/mddocs/docs/connection/db_connection/mssql/write.md b/mddocs/docs/connection/db_connection/mssql/write.md index e16b1ec42..deaf554c8 100644 --- a/mddocs/docs/connection/db_connection/mssql/write.md +++ b/mddocs/docs/connection/db_connection/mssql/write.md @@ -37,19 +37,6 @@ For writing data to MSSQL, use [DBWriter][DBR-onetl-db-writer]. Method above accepts [MSSQL.WriteOptions][onetl.connection.db_connection.mssql.options.MSSQLWriteOptions] - ::: onetl.connection.db_connection.mssql.options.MSSQLWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/mysql/connection.md b/mddocs/docs/connection/db_connection/mysql/connection.md index 366a732e2..c97d7124d 100644 --- a/mddocs/docs/connection/db_connection/mysql/connection.md +++ b/mddocs/docs/connection/db_connection/mysql/connection.md @@ -1,15 +1,5 @@ # MySQL connection { #DBR-onetl-connection-db-connection-mysql-connection-0 } - ::: onetl.connection.db_connection.mysql.connection.MySQL options: diff --git a/mddocs/docs/connection/db_connection/mysql/execute.md b/mddocs/docs/connection/db_connection/mysql/execute.md index e87b87186..663dbdeb0 100644 --- a/mddocs/docs/connection/db_connection/mysql/execute.md +++ b/mddocs/docs/connection/db_connection/mysql/execute.md @@ -90,24 +90,6 @@ This method supports **any** query syntax supported by MySQL, like: ## Options { #DBR-onetl-connection-db-connection-mysql-execute-options } - ::: onetl.connection.db_connection.mysql.options.MySQLFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/mysql/read.md b/mddocs/docs/connection/db_connection/mysql/read.md index c7a693696..ac5aca318 100644 --- a/mddocs/docs/connection/db_connection/mysql/read.md +++ b/mddocs/docs/connection/db_connection/mysql/read.md @@ -77,17 +77,6 @@ Especially if there are indexes 
for columns used in `where` clause. ## Options { #DBR-onetl-connection-db-connection-mysql-read-options } - ::: onetl.connection.db_connection.mysql.options.MySQLReadOptions options: diff --git a/mddocs/docs/connection/db_connection/mysql/sql.md b/mddocs/docs/connection/db_connection/mysql/sql.md index 5024d2b7d..81baff3e4 100644 --- a/mddocs/docs/connection/db_connection/mysql/sql.md +++ b/mddocs/docs/connection/db_connection/mysql/sql.md @@ -61,19 +61,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-mysql-sql-options } - ::: onetl.connection.db_connection.mysql.options.MySQLSQLOptions options: diff --git a/mddocs/docs/connection/db_connection/mysql/write.md b/mddocs/docs/connection/db_connection/mysql/write.md index 144650c0d..07d767f4e 100644 --- a/mddocs/docs/connection/db_connection/mysql/write.md +++ b/mddocs/docs/connection/db_connection/mysql/write.md @@ -39,19 +39,6 @@ For writing data to MySQL, use [DBWriter][DBR-onetl-db-writer]. 
Method above accepts [MySQL.WriteOptions][onetl.connection.db_connection.mysql.options.MySQLWriteOptions] - ::: onetl.connection.db_connection.mysql.options.MySQLWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/oracle/connection.md b/mddocs/docs/connection/db_connection/oracle/connection.md index b743b3199..ff1f1723f 100644 --- a/mddocs/docs/connection/db_connection/oracle/connection.md +++ b/mddocs/docs/connection/db_connection/oracle/connection.md @@ -1,15 +1,5 @@ # Oracle connection { #DBR-onetl-connection-db-connection-oracle-connection-0 } - ::: onetl.connection.db_connection.oracle.connection.Oracle options: diff --git a/mddocs/docs/connection/db_connection/oracle/execute.md b/mddocs/docs/connection/db_connection/oracle/execute.md index bf9466eff..e871a7f92 100644 --- a/mddocs/docs/connection/db_connection/oracle/execute.md +++ b/mddocs/docs/connection/db_connection/oracle/execute.md @@ -91,24 +91,6 @@ This method supports **any** query syntax supported by Oracle, like: ## Options { #DBR-onetl-connection-db-connection-oracle-execute-options } - ::: onetl.connection.db_connection.oracle.options.OracleFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/oracle/read.md b/mddocs/docs/connection/db_connection/oracle/read.md index 2115f468b..410a33c4e 100644 --- a/mddocs/docs/connection/db_connection/oracle/read.md +++ b/mddocs/docs/connection/db_connection/oracle/read.md @@ -77,17 +77,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-oracle-read-options } - ::: onetl.connection.db_connection.oracle.options.OracleReadOptions options: diff --git a/mddocs/docs/connection/db_connection/oracle/sql.md b/mddocs/docs/connection/db_connection/oracle/sql.md index 540d34dc4..8335a145c 100644 --- a/mddocs/docs/connection/db_connection/oracle/sql.md +++ b/mddocs/docs/connection/db_connection/oracle/sql.md @@ -61,19 +61,6 @@ Especially if there are 
indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-oracle-sql-options } - ::: onetl.connection.db_connection.oracle.options.OracleSQLOptions options: diff --git a/mddocs/docs/connection/db_connection/oracle/write.md b/mddocs/docs/connection/db_connection/oracle/write.md index fe67be2de..fd2a25d86 100644 --- a/mddocs/docs/connection/db_connection/oracle/write.md +++ b/mddocs/docs/connection/db_connection/oracle/write.md @@ -35,19 +35,6 @@ For writing data to Oracle, use [DBWriter][DBR-onetl-db-writer]. Method above accepts [OracleWriteOptions][onetl.connection.db_connection.oracle.options.OracleWriteOptions] - ::: onetl.connection.db_connection.oracle.options.OracleWriteOptions options: diff --git a/mddocs/docs/connection/db_connection/postgres/connection.md b/mddocs/docs/connection/db_connection/postgres/connection.md index 09bacf1cd..d89ba76dc 100644 --- a/mddocs/docs/connection/db_connection/postgres/connection.md +++ b/mddocs/docs/connection/db_connection/postgres/connection.md @@ -1,15 +1,5 @@ # Postgres connection { #DBR-onetl-connection-db-connection-postgres-connection-0 } - ::: onetl.connection.db_connection.postgres.connection.Postgres options: diff --git a/mddocs/docs/connection/db_connection/postgres/execute.md b/mddocs/docs/connection/db_connection/postgres/execute.md index 2b4e73d9c..54d91972a 100644 --- a/mddocs/docs/connection/db_connection/postgres/execute.md +++ b/mddocs/docs/connection/db_connection/postgres/execute.md @@ -88,24 +88,6 @@ This method supports **any** query syntax supported by Postgres, like: ## Options { #DBR-onetl-connection-db-connection-postgres-execute-options } - ::: onetl.connection.db_connection.postgres.options.PostgresFetchOptions options: diff --git a/mddocs/docs/connection/db_connection/postgres/read.md b/mddocs/docs/connection/db_connection/postgres/read.md index 169f301a3..eaad0252b 100644 --- a/mddocs/docs/connection/db_connection/postgres/read.md +++ 
b/mddocs/docs/connection/db_connection/postgres/read.md @@ -75,17 +75,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-postgres-read-options } - ::: onetl.connection.db_connection.postgres.options.PostgresReadOptions options: diff --git a/mddocs/docs/connection/db_connection/postgres/sql.md b/mddocs/docs/connection/db_connection/postgres/sql.md index a018d5e52..d03f0a924 100644 --- a/mddocs/docs/connection/db_connection/postgres/sql.md +++ b/mddocs/docs/connection/db_connection/postgres/sql.md @@ -60,19 +60,6 @@ Especially if there are indexes or partitions for columns used in `where` clause ## Options { #DBR-onetl-connection-db-connection-postgres-sql-options } - ::: onetl.connection.db_connection.postgres.options.PostgresSQLOptions options: diff --git a/mddocs/docs/connection/db_connection/postgres/write.md b/mddocs/docs/connection/db_connection/postgres/write.md index 59abbdcaa..8489d8ad3 100644 --- a/mddocs/docs/connection/db_connection/postgres/write.md +++ b/mddocs/docs/connection/db_connection/postgres/write.md @@ -37,19 +37,6 @@ For writing data to Postgres, use [DBWriter][DBR-onetl-db-writer]. 
Method above accepts [Postgres.WriteOptions][onetl.connection.db_connection.postgres.options.PostgresWriteOptions] - ::: onetl.connection.db_connection.postgres.options.PostgresWriteOptions options: diff --git a/mddocs/docs/connection/file_connection/ftp.md b/mddocs/docs/connection/file_connection/ftp.md index 9e68d418c..220697e80 100644 --- a/mddocs/docs/connection/file_connection/ftp.md +++ b/mddocs/docs/connection/file_connection/ftp.md @@ -1,15 +1,5 @@ # FTP connection { #DBR-onetl-connection-file-connection-ftp-connection } - ::: onetl.connection.file_connection.ftp.FTP options: diff --git a/mddocs/docs/connection/file_connection/ftps.md b/mddocs/docs/connection/file_connection/ftps.md index ff71c80ea..2182543c0 100644 --- a/mddocs/docs/connection/file_connection/ftps.md +++ b/mddocs/docs/connection/file_connection/ftps.md @@ -1,15 +1,5 @@ # FTPS connection { #DBR-onetl-connection-file-connection-ftps-connection } - ::: onetl.connection.file_connection.ftps.FTPS options: diff --git a/mddocs/docs/connection/file_connection/hdfs/connection.md b/mddocs/docs/connection/file_connection/hdfs/connection.md index 5c7437a08..3ceeb224f 100644 --- a/mddocs/docs/connection/file_connection/hdfs/connection.md +++ b/mddocs/docs/connection/file_connection/hdfs/connection.md @@ -1,15 +1,5 @@ # HDFS connection { #DBR-onetl-connection-file-connection-hdfs-connection-0 } - ::: onetl.connection.file_connection.hdfs.connection.HDFS options: diff --git a/mddocs/docs/connection/file_connection/hdfs/slots.md b/mddocs/docs/connection/file_connection/hdfs/slots.md index 21a9a5269..a2be17a68 100644 --- a/mddocs/docs/connection/file_connection/hdfs/slots.md +++ b/mddocs/docs/connection/file_connection/hdfs/slots.md @@ -1,16 +1,5 @@ # HDFS Slots { #DBR-onetl-connection-file-connection-hdfs-slots } - ::: onetl.connection.file_connection.hdfs.slots.HDFSSlots options: diff --git a/mddocs/docs/connection/file_connection/s3.md b/mddocs/docs/connection/file_connection/s3.md index 
527fcb646..3d5ce75c0 100644 --- a/mddocs/docs/connection/file_connection/s3.md +++ b/mddocs/docs/connection/file_connection/s3.md @@ -1,15 +1,5 @@ # S3 connection { #DBR-onetl-connection-file-connection-s3-connection } - ::: onetl.connection.file_connection.s3.S3 options: diff --git a/mddocs/docs/connection/file_connection/samba.md b/mddocs/docs/connection/file_connection/samba.md index 4c186ca2a..0f802a3bc 100644 --- a/mddocs/docs/connection/file_connection/samba.md +++ b/mddocs/docs/connection/file_connection/samba.md @@ -1,15 +1,5 @@ # Samba connection { #DBR-onetl-connection-file-connection-samba-connection } - ::: onetl.connection.file_connection.samba.Samba options: diff --git a/mddocs/docs/connection/file_connection/sftp.md b/mddocs/docs/connection/file_connection/sftp.md index d7c9ca687..400edbd96 100644 --- a/mddocs/docs/connection/file_connection/sftp.md +++ b/mddocs/docs/connection/file_connection/sftp.md @@ -1,15 +1,5 @@ # SFTP connection { #DBR-onetl-connection-file-connection-sftp-connection } - ::: onetl.connection.file_connection.sftp.SFTP options: diff --git a/mddocs/docs/connection/file_connection/webdav.md b/mddocs/docs/connection/file_connection/webdav.md index 1c939aadd..32a540115 100644 --- a/mddocs/docs/connection/file_connection/webdav.md +++ b/mddocs/docs/connection/file_connection/webdav.md @@ -1,15 +1,5 @@ # WebDAV connection { #DBR-onetl-connection-file-connection-webdav-connection } - ::: onetl.connection.file_connection.webdav.WebDAV options: diff --git a/mddocs/docs/connection/file_df_connection/base.md b/mddocs/docs/connection/file_df_connection/base.md index 2f20b0da8..bed5fcb3b 100644 --- a/mddocs/docs/connection/file_df_connection/base.md +++ b/mddocs/docs/connection/file_df_connection/base.md @@ -1,15 +1,5 @@ # Base interface { #DBR-onetl-connection-file-df-connection-base-interface } - ::: onetl.base.base_file_df_connection.BaseFileDFConnection options: diff --git 
a/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md index 6c1450b3d..f60ad0571 100644 --- a/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/connection.md @@ -1,15 +1,5 @@ # Spark HDFS Connection { #DBR-onetl-connection-file-df-connection-spark-hdfs-connection } - ::: onetl.connection.file_df_connection.spark_hdfs.connection.SparkHDFS options: diff --git a/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md index 4ab8af827..97d72f589 100644 --- a/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/slots.md @@ -1,16 +1,5 @@ # Spark HDFS Slots { #DBR-onetl-connection-file-df-connection-spark-hdfs-slots } - ::: onetl.connection.file_df_connection.spark_hdfs.slots.SparkHDFSSlots options: diff --git a/mddocs/docs/connection/file_df_connection/spark_local_fs.md b/mddocs/docs/connection/file_df_connection/spark_local_fs.md index 15b389366..9e2b67e62 100644 --- a/mddocs/docs/connection/file_df_connection/spark_local_fs.md +++ b/mddocs/docs/connection/file_df_connection/spark_local_fs.md @@ -1,15 +1,5 @@ # Spark LocalFS { #DBR-onetl-connection-file-df-connection-spark-local-fs-spark-localfs } - ::: onetl.connection.file_df_connection.spark_local_fs.SparkLocalFS options: diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/connection.md b/mddocs/docs/connection/file_df_connection/spark_s3/connection.md index d5e0dae0b..d62f5a3e0 100644 --- a/mddocs/docs/connection/file_df_connection/spark_s3/connection.md +++ b/mddocs/docs/connection/file_df_connection/spark_s3/connection.md @@ -1,15 +1,5 @@ # Spark S3 Connection { #DBR-onetl-connection-file-df-connection-spark-s3-connection } - ::: onetl.connection.file_df_connection.spark_s3.connection.SparkS3 options: diff 
--git a/mddocs/docs/db/reader.md b/mddocs/docs/db/reader.md index 81d2985de..8ad69cfe2 100644 --- a/mddocs/docs/db/reader.md +++ b/mddocs/docs/db/reader.md @@ -1,15 +1,5 @@ # DB Reader { #DBR-onetl-db-reader } - ::: onetl.db.db_reader.db_reader.DBReader options: diff --git a/mddocs/docs/db/writer.md b/mddocs/docs/db/writer.md index 9a0b83c22..d1e6bebad 100644 --- a/mddocs/docs/db/writer.md +++ b/mddocs/docs/db/writer.md @@ -1,22 +1,5 @@ # DB Writer { #DBR-onetl-db-writer } - ::: onetl.db.db_writer.db_writer.DBWriter options: diff --git a/mddocs/docs/file/file_downloader/file_downloader.md b/mddocs/docs/file/file_downloader/file_downloader.md index 2e5e385ae..5a79e8340 100644 --- a/mddocs/docs/file/file_downloader/file_downloader.md +++ b/mddocs/docs/file/file_downloader/file_downloader.md @@ -1,24 +1,5 @@ # File Downloader { #DBR-onetl-file-downloader-0 } - ::: onetl.file.file_downloader.file_downloader.FileDownloader options: diff --git a/mddocs/docs/file/file_downloader/result.md b/mddocs/docs/file/file_downloader/result.md index 42d2b45d3..c1f75fa67 100644 --- a/mddocs/docs/file/file_downloader/result.md +++ b/mddocs/docs/file/file_downloader/result.md @@ -1,16 +1,5 @@ # File Downloader Result { #DBR-onetl-file-downloader-result } - ::: onetl.file.file_downloader.result.DownloadResult options: diff --git a/mddocs/docs/file/file_filters/base.md b/mddocs/docs/file/file_filters/base.md index f10c994bc..21da91782 100644 --- a/mddocs/docs/file/file_filters/base.md +++ b/mddocs/docs/file/file_filters/base.md @@ -1,22 +1,5 @@ # Base interface { #DBR-onetl-file-filters-base-interface } - ::: onetl.base.base_file_filter.BaseFileFilter options: diff --git a/mddocs/docs/file/file_filters/exclude_dir.md b/mddocs/docs/file/file_filters/exclude_dir.md index 978ba1748..37bdb585d 100644 --- a/mddocs/docs/file/file_filters/exclude_dir.md +++ b/mddocs/docs/file/file_filters/exclude_dir.md @@ -1,15 +1,5 @@ # ExcludeDir { #DBR-onetl-file-filters-exclude-dir-excludedir } - ::: 
onetl.file.filter.exclude_dir.ExcludeDir options: diff --git a/mddocs/docs/file/file_filters/file_filter.md b/mddocs/docs/file/file_filters/file_filter.md index 9eae6dba6..616d6e770 100644 --- a/mddocs/docs/file/file_filters/file_filter.md +++ b/mddocs/docs/file/file_filters/file_filter.md @@ -1,15 +1,5 @@ # File Filter (legacy) { #DBR-onetl-file-filters-file-filter-legacy } - ::: onetl.core.file_filter.file_filter.FileFilter options: diff --git a/mddocs/docs/file/file_filters/file_mtime_filter.md b/mddocs/docs/file/file_filters/file_mtime_filter.md index 4d71e97af..3b76ea407 100644 --- a/mddocs/docs/file/file_filters/file_mtime_filter.md +++ b/mddocs/docs/file/file_filters/file_mtime_filter.md @@ -1,15 +1,5 @@ # FileModifiedTime { #DBR-onetl-file-filters-file-mtime-filter-filemodifiedtime } - ::: onetl.file.filter.file_mtime.FileModifiedTime options: diff --git a/mddocs/docs/file/file_filters/file_size_filter.md b/mddocs/docs/file/file_filters/file_size_filter.md index b57a15583..a257b6dfc 100644 --- a/mddocs/docs/file/file_filters/file_size_filter.md +++ b/mddocs/docs/file/file_filters/file_size_filter.md @@ -1,15 +1,5 @@ # FileSizeRange { #DBR-onetl-file-filters-file-size-filter-filesizerange } - ::: onetl.file.filter.file_size.FileSizeRange options: diff --git a/mddocs/docs/file/file_filters/glob.md b/mddocs/docs/file/file_filters/glob.md index e98d1d6a0..9e00024a3 100644 --- a/mddocs/docs/file/file_filters/glob.md +++ b/mddocs/docs/file/file_filters/glob.md @@ -1,15 +1,5 @@ # Glob { #DBR-onetl-file-filters-glob } - ::: onetl.file.filter.glob.Glob options: diff --git a/mddocs/docs/file/file_filters/match_all_filters.md b/mddocs/docs/file/file_filters/match_all_filters.md index 8d9a2f457..150df6f8a 100644 --- a/mddocs/docs/file/file_filters/match_all_filters.md +++ b/mddocs/docs/file/file_filters/match_all_filters.md @@ -1,13 +1,4 @@ # match_all_filters { #DBR-onetl-file-filters-match-all-filters } - ::: onetl.file.filter.match_all_filters diff --git 
a/mddocs/docs/file/file_filters/regexp.md b/mddocs/docs/file/file_filters/regexp.md index f91490404..e1c173cf5 100644 --- a/mddocs/docs/file/file_filters/regexp.md +++ b/mddocs/docs/file/file_filters/regexp.md @@ -1,16 +1,5 @@ # Regexp { #DBR-onetl-file-filters-regexp } - - ::: onetl.file.filter.regexp.Regexp options: diff --git a/mddocs/docs/file/file_limits/base.md b/mddocs/docs/file/file_limits/base.md index 02f155336..2e0bdd187 100644 --- a/mddocs/docs/file/file_limits/base.md +++ b/mddocs/docs/file/file_limits/base.md @@ -1,24 +1,5 @@ # Base interface { #DBR-onetl-file-limits-base-interface } - ::: onetl.base.base_file_limit.BaseFileLimit options: diff --git a/mddocs/docs/file/file_limits/file_limit.md b/mddocs/docs/file/file_limits/file_limit.md index cefdcdc26..c4886c74c 100644 --- a/mddocs/docs/file/file_limits/file_limit.md +++ b/mddocs/docs/file/file_limits/file_limit.md @@ -1,15 +1,5 @@ # File Limit (legacy) { #DBR-onetl-file-limits-file-limit-legacy } - ::: onetl.core.file_limit.file_limit.FileLimit options: diff --git a/mddocs/docs/file/file_limits/limits_reached.md b/mddocs/docs/file/file_limits/limits_reached.md index 2a292d67b..7dfd41ed6 100644 --- a/mddocs/docs/file/file_limits/limits_reached.md +++ b/mddocs/docs/file/file_limits/limits_reached.md @@ -1,13 +1,4 @@ # limits_reached { #DBR-onetl-file-limits-limits-reached } - ::: onetl.file.limit.limits_reached diff --git a/mddocs/docs/file/file_limits/limits_stop_at.md b/mddocs/docs/file/file_limits/limits_stop_at.md index e7f133491..3780e7df6 100644 --- a/mddocs/docs/file/file_limits/limits_stop_at.md +++ b/mddocs/docs/file/file_limits/limits_stop_at.md @@ -1,13 +1,4 @@ # limits_stop_at { #DBR-onetl-file-limits-limits-stop-at } - ::: onetl.file.limit.limits_stop_at diff --git a/mddocs/docs/file/file_limits/max_files_count.md b/mddocs/docs/file/file_limits/max_files_count.md index 5c59cb177..9f4190fa8 100644 --- a/mddocs/docs/file/file_limits/max_files_count.md +++ 
b/mddocs/docs/file/file_limits/max_files_count.md @@ -1,15 +1,5 @@ # MaxFilesCount { #DBR-onetl-file-limits-max-files-count-maxfilescount } - ::: onetl.file.limit.max_files_count.MaxFilesCount options: diff --git a/mddocs/docs/file/file_limits/reset_limits.md b/mddocs/docs/file/file_limits/reset_limits.md index d3766c6d9..15c3b4ecd 100644 --- a/mddocs/docs/file/file_limits/reset_limits.md +++ b/mddocs/docs/file/file_limits/reset_limits.md @@ -1,13 +1,4 @@ # reset_limits { #DBR-onetl-file-limits-reset-limits } - ::: onetl.file.limit.reset_limits diff --git a/mddocs/docs/file/file_limits/total_files_size.md b/mddocs/docs/file/file_limits/total_files_size.md index 3b91b6043..e0402db4a 100644 --- a/mddocs/docs/file/file_limits/total_files_size.md +++ b/mddocs/docs/file/file_limits/total_files_size.md @@ -1,15 +1,5 @@ # TotalFilesSize { #DBR-onetl-file-limits-total-files-size-totalfilessize } - ::: onetl.file.limit.total_files_size.TotalFilesSize options: diff --git a/mddocs/docs/file/file_mover/file_mover.md b/mddocs/docs/file/file_mover/file_mover.md index ccbc939a3..56f42e438 100644 --- a/mddocs/docs/file/file_mover/file_mover.md +++ b/mddocs/docs/file/file_mover/file_mover.md @@ -1,24 +1,5 @@ # File Mover { #DBR-onetl-file-mover-0 } - ::: onetl.file.file_mover.file_mover.FileMover options: diff --git a/mddocs/docs/file/file_mover/options.md b/mddocs/docs/file/file_mover/options.md index a6661336d..312fc6918 100644 --- a/mddocs/docs/file/file_mover/options.md +++ b/mddocs/docs/file/file_mover/options.md @@ -1,16 +1,4 @@ # File Mover Options { #DBR-onetl-file-mover-options } - ::: onetl.file.file_mover.options.FileMoverOptions diff --git a/mddocs/docs/file/file_mover/result.md b/mddocs/docs/file/file_mover/result.md index 0b7a667dc..63c4ad963 100644 --- a/mddocs/docs/file/file_mover/result.md +++ b/mddocs/docs/file/file_mover/result.md @@ -1,15 +1,5 @@ # File Mover Result { #DBR-onetl-file-mover-result } - ::: onetl.file.file_mover.result.MoveResult options: diff 
--git a/mddocs/docs/file/file_uploader/file_uploader.md b/mddocs/docs/file/file_uploader/file_uploader.md index 88f283d7b..17a88a164 100644 --- a/mddocs/docs/file/file_uploader/file_uploader.md +++ b/mddocs/docs/file/file_uploader/file_uploader.md @@ -1,24 +1,5 @@ # File Uploader { #DBR-onetl-file-uploader-0 } - ::: onetl.file.file_uploader.file_uploader.FileUploader options: diff --git a/mddocs/docs/file/file_uploader/options.md b/mddocs/docs/file/file_uploader/options.md index 6ccf57f4c..a3ff165f1 100644 --- a/mddocs/docs/file/file_uploader/options.md +++ b/mddocs/docs/file/file_uploader/options.md @@ -1,17 +1,5 @@ # File Uploader Options { #DBR-onetl-file-uploader-options } - ::: onetl.file.file_uploader.options.FileUploaderOptions options: diff --git a/mddocs/docs/file/file_uploader/result.md b/mddocs/docs/file/file_uploader/result.md index cb8f07302..a8a9cdfe2 100644 --- a/mddocs/docs/file/file_uploader/result.md +++ b/mddocs/docs/file/file_uploader/result.md @@ -1,15 +1,5 @@ # File Uploader Result { #DBR-onetl-file-uploader-result } - ::: onetl.file.file_uploader.result.UploadResult options: diff --git a/mddocs/docs/file_df/file_df_reader/file_df_reader.md b/mddocs/docs/file_df/file_df_reader/file_df_reader.md index 0212129fb..7f8f4def2 100644 --- a/mddocs/docs/file_df/file_df_reader/file_df_reader.md +++ b/mddocs/docs/file_df/file_df_reader/file_df_reader.md @@ -1,16 +1,5 @@ # FileDF Reader { #DBR-onetl-file-df-reader-filedf-reader-0 } - ::: onetl.file.file_df_reader.file_df_reader.FileDFReader options: diff --git a/mddocs/docs/file_df/file_df_reader/options.md b/mddocs/docs/file_df/file_df_reader/options.md index fd672a766..deff22262 100644 --- a/mddocs/docs/file_df/file_df_reader/options.md +++ b/mddocs/docs/file_df/file_df_reader/options.md @@ -1,15 +1,4 @@ # Options { #DBR-onetl-file-df-reader-options } - ::: onetl.file.file_df_reader.options.FileDFReaderOptions diff --git a/mddocs/docs/file_df/file_df_writer/file_df_writer.md 
b/mddocs/docs/file_df/file_df_writer/file_df_writer.md index 5e69f1515..0ad444c1b 100644 --- a/mddocs/docs/file_df/file_df_writer/file_df_writer.md +++ b/mddocs/docs/file_df/file_df_writer/file_df_writer.md @@ -1,16 +1,5 @@ # FileDF Writer { #DBR-onetl-file-df-writer-filedf-writer-0 } - ::: onetl.file.file_df_writer.file_df_writer.FileDFWriter options: diff --git a/mddocs/docs/file_df/file_df_writer/options.md b/mddocs/docs/file_df/file_df_writer/options.md index 2da895707..254acf2f4 100644 --- a/mddocs/docs/file_df/file_df_writer/options.md +++ b/mddocs/docs/file_df/file_df_writer/options.md @@ -1,15 +1,4 @@ # Options { #DBR-onetl-file-df-writer-options } - ::: onetl.file.file_df_writer.options.FileDFWriterOptions diff --git a/mddocs/docs/file_df/file_formats/avro.md b/mddocs/docs/file_df/file_formats/avro.md index 92657a064..8738ddd20 100644 --- a/mddocs/docs/file_df/file_formats/avro.md +++ b/mddocs/docs/file_df/file_formats/avro.md @@ -1,16 +1,5 @@ # Avro { #DBR-onetl-file-df-file-formats-avro } - ::: onetl.file.format.avro.Avro options: diff --git a/mddocs/docs/file_df/file_formats/base.md b/mddocs/docs/file_df/file_formats/base.md index 656353938..7c86d22c3 100644 --- a/mddocs/docs/file_df/file_formats/base.md +++ b/mddocs/docs/file_df/file_formats/base.md @@ -1,22 +1,5 @@ # Base interface { #DBR-onetl-file-df-file-formats-base-interface } - ::: onetl.base.base_file_format.BaseReadableFileFormat options: diff --git a/mddocs/docs/file_df/file_formats/csv.md b/mddocs/docs/file_df/file_formats/csv.md index 68572970c..24902c15e 100644 --- a/mddocs/docs/file_df/file_formats/csv.md +++ b/mddocs/docs/file_df/file_formats/csv.md @@ -1,16 +1,5 @@ # CSV { #DBR-onetl-file-df-file-formats-csv } - ::: onetl.file.format.csv.CSV options: diff --git a/mddocs/docs/file_df/file_formats/excel.md b/mddocs/docs/file_df/file_formats/excel.md index 1c84025ce..67dc99443 100644 --- a/mddocs/docs/file_df/file_formats/excel.md +++ b/mddocs/docs/file_df/file_formats/excel.md @@ -1,16 
+1,5 @@ # Excel { #DBR-onetl-file-df-file-formats-excel } - ::: onetl.file.format.excel.Excel options: diff --git a/mddocs/docs/file_df/file_formats/json.md b/mddocs/docs/file_df/file_formats/json.md index 36a8fb538..1c062e113 100644 --- a/mddocs/docs/file_df/file_formats/json.md +++ b/mddocs/docs/file_df/file_formats/json.md @@ -1,16 +1,5 @@ # JSON { #DBR-onetl-file-df-file-formats-json } - ::: onetl.file.format.json.JSON options: diff --git a/mddocs/docs/file_df/file_formats/jsonline.md b/mddocs/docs/file_df/file_formats/jsonline.md index d20076fb4..2335729d7 100644 --- a/mddocs/docs/file_df/file_formats/jsonline.md +++ b/mddocs/docs/file_df/file_formats/jsonline.md @@ -1,16 +1,5 @@ # JSONLine { #DBR-onetl-file-df-file-formats-jsonline } - ::: onetl.file.format.jsonline.JSONLine options: diff --git a/mddocs/docs/file_df/file_formats/orc.md b/mddocs/docs/file_df/file_formats/orc.md index 262fbebd9..917be305e 100644 --- a/mddocs/docs/file_df/file_formats/orc.md +++ b/mddocs/docs/file_df/file_formats/orc.md @@ -1,16 +1,5 @@ # ORC { #DBR-onetl-file-df-file-formats-orc } - ::: onetl.file.format.orc.ORC options: diff --git a/mddocs/docs/file_df/file_formats/parquet.md b/mddocs/docs/file_df/file_formats/parquet.md index d6499ad2d..6e36d69ad 100644 --- a/mddocs/docs/file_df/file_formats/parquet.md +++ b/mddocs/docs/file_df/file_formats/parquet.md @@ -1,16 +1,5 @@ # Parquet { #DBR-onetl-file-df-file-formats-parquet } - ::: onetl.file.format.parquet.Parquet options: diff --git a/mddocs/docs/file_df/file_formats/xml.md b/mddocs/docs/file_df/file_formats/xml.md index 6785e3709..90c5d45be 100644 --- a/mddocs/docs/file_df/file_formats/xml.md +++ b/mddocs/docs/file_df/file_formats/xml.md @@ -1,16 +1,5 @@ # XML { #DBR-onetl-file-df-file-formats-xml } - ::: onetl.file.format.xml.XML options: diff --git a/mddocs/docs/hooks/global_state.md b/mddocs/docs/hooks/global_state.md index 672c222e6..8ef108286 100644 --- a/mddocs/docs/hooks/global_state.md +++ 
b/mddocs/docs/hooks/global_state.md @@ -1,30 +1,5 @@ # Hooks global state { #DBR-onetl-hooks-global-state } - ::: onetl.hooks.hooks_state options: diff --git a/mddocs/docs/hooks/hook.md b/mddocs/docs/hooks/hook.md index ae2db3d89..1b71313b1 100644 --- a/mddocs/docs/hooks/hook.md +++ b/mddocs/docs/hooks/hook.md @@ -1,37 +1,5 @@ # `@hook` decorator { #DBR-onetl-hooks-hook-decorator } - ::: onetl.hooks.hook.hook diff --git a/mddocs/docs/hooks/slot.md b/mddocs/docs/hooks/slot.md index 2a2c3ad38..382e60e22 100644 --- a/mddocs/docs/hooks/slot.md +++ b/mddocs/docs/hooks/slot.md @@ -1,29 +1,5 @@ # `@slot` decorator { #DBR-onetl-hooks-slot-decorator } - ::: onetl.hooks.slot.slot diff --git a/mddocs/docs/hooks/support_hooks.md b/mddocs/docs/hooks/support_hooks.md index 467c7ff40..c51547f77 100644 --- a/mddocs/docs/hooks/support_hooks.md +++ b/mddocs/docs/hooks/support_hooks.md @@ -1,43 +1,5 @@ # `@support_hooks` decorator { #DBR-onetl-hooks-support-hooks-decorator } - - ::: onetl.hooks.support_hooks options: diff --git a/mddocs/docs/hwm_store/yaml_hwm_store.md b/mddocs/docs/hwm_store/yaml_hwm_store.md index eb04f3384..c30271d60 100644 --- a/mddocs/docs/hwm_store/yaml_hwm_store.md +++ b/mddocs/docs/hwm_store/yaml_hwm_store.md @@ -1,15 +1,5 @@ # YAML HWM Store { #DBR-onetl-hwm-store-yaml-hwm-store } - ::: onetl.hwm.store.yaml_hwm_store.YAMLHWMStore options: diff --git a/mddocs/docs/index.md b/mddocs/docs/index.md index b29808d70..29808fbf4 100644 --- a/mddocs/docs/index.md +++ b/mddocs/docs/index.md @@ -9,7 +9,7 @@ {{ docs_status_badge }} {{ ci_status_badge }} {{ precommit_badge }} - +{{ test_cov_badge }} {{ onetl_logo_wide }} diff --git a/mddocs/docs/install/spark.md b/mddocs/docs/install/spark.md index 9499bc23c..fbd2a43f6 100644 --- a/mddocs/docs/install/spark.md +++ b/mddocs/docs/install/spark.md @@ -1,24 +1,10 @@ # Spark { #DBR-onetl-install-spark } - All DB connection classes (`Clickhouse`, `Greenplum`, `Hive` and others) and all FileDF connection classes (`SparkHDFS`, 
`SparkLocalFS`, `SparkS3`) require Spark to be installed. ## Installing Java { #DBR-onetl-install-spark-installing-java } - Firstly, you should install JDK. The exact installation instruction depends on your OS, here are some examples: @@ -41,13 +27,6 @@ apt-get install openjdk-11-jdk # Debian-based + Spark 3 ## Installing PySpark { #DBR-onetl-install-spark-installing-pyspark } - Then you should install PySpark via passing `spark` to `extras`: diff --git a/mddocs/docs/quickstart.md b/mddocs/docs/quickstart.md index a65a8c2d7..a9ad7ab6e 100644 --- a/mddocs/docs/quickstart.md +++ b/mddocs/docs/quickstart.md @@ -9,7 +9,7 @@ {{ docs_status_badge }} {{ ci_status_badge }} {{ precommit_badge }} - +{{ test_cov_badge }} {{ onetl_logo_wide }} diff --git a/mddocs/docs/strategy/incremental_batch_strategy.md b/mddocs/docs/strategy/incremental_batch_strategy.md index cd9ac0ac5..e9fd48545 100644 --- a/mddocs/docs/strategy/incremental_batch_strategy.md +++ b/mddocs/docs/strategy/incremental_batch_strategy.md @@ -1,15 +1,5 @@ # Incremental Batch Strategy { #DBR-onetl-strategy-incremental-batch-strategy } - ::: onetl.strategy.incremental_strategy.IncrementalBatchStrategy options: diff --git a/mddocs/docs/strategy/incremental_strategy.md b/mddocs/docs/strategy/incremental_strategy.md index b54b8c1ba..5f774c765 100644 --- a/mddocs/docs/strategy/incremental_strategy.md +++ b/mddocs/docs/strategy/incremental_strategy.md @@ -1,15 +1,5 @@ # Incremental Strategy { #DBR-onetl-strategy-incremental-strategy } - ::: onetl.strategy.incremental_strategy.IncrementalStrategy options: diff --git a/mddocs/docs/strategy/snapshot_batch_strategy.md b/mddocs/docs/strategy/snapshot_batch_strategy.md index 3ba1a6805..908929207 100644 --- a/mddocs/docs/strategy/snapshot_batch_strategy.md +++ b/mddocs/docs/strategy/snapshot_batch_strategy.md @@ -1,15 +1,5 @@ # Snapshot Batch Strategy { #DBR-onetl-strategy-snapshot-batch-strategy } - ::: onetl.strategy.snapshot_strategy.SnapshotBatchStrategy options: diff 
--git a/mddocs/docs/strategy/snapshot_strategy.md b/mddocs/docs/strategy/snapshot_strategy.md index 75025edaa..3775e4a6a 100644 --- a/mddocs/docs/strategy/snapshot_strategy.md +++ b/mddocs/docs/strategy/snapshot_strategy.md @@ -1,15 +1,5 @@ # Snapshot Strategy { #DBR-onetl-strategy-snapshot-strategy } - ::: onetl.strategy.snapshot_strategy.SnapshotStrategy options: From 2c5c3f721e7e3ce5276c77cd14e9f920462292db Mon Sep 17 00:00:00 2001 From: sga Date: Wed, 8 Apr 2026 00:32:14 +0300 Subject: [PATCH 23/28] fix code blocks indents --- .../_static/stylesheets/autodoc_pydantic.css | 11 + .../db_connection/clickhouse/execute.md | 36 +-- .../db_connection/clickhouse/read.md | 68 ++--- .../db_connection/clickhouse/sql.md | 48 ++-- .../db_connection/greenplum/execute.md | 54 ++-- .../db_connection/greenplum/prerequisites.md | 154 +++++------ .../db_connection/greenplum/read.md | 212 +++++++-------- .../db_connection/greenplum/write.md | 42 +-- .../connection/db_connection/hive/execute.md | 36 +-- .../db_connection/hive/prerequisites.md | 20 +- .../connection/db_connection/hive/read.md | 54 ++-- .../docs/connection/db_connection/hive/sql.md | 36 +-- .../db_connection/mongodb/prerequisites.md | 26 +- .../connection/db_connection/mongodb/read.md | 168 ++++++------ .../connection/db_connection/mongodb/types.md | 86 +++--- .../connection/db_connection/mongodb/write.md | 28 +- .../connection/db_connection/mssql/execute.md | 54 ++-- .../connection/db_connection/mssql/read.md | 70 +++-- .../connection/db_connection/mssql/sql.md | 48 ++-- .../connection/db_connection/mssql/types.md | 114 ++++---- .../connection/db_connection/mssql/write.md | 24 +- .../connection/db_connection/mysql/execute.md | 56 ++-- .../connection/db_connection/mysql/read.md | 72 ++--- .../connection/db_connection/mysql/sql.md | 48 ++-- .../connection/db_connection/mysql/types.md | 136 +++++----- .../connection/db_connection/mysql/write.md | 40 +-- .../db_connection/oracle/execute.md | 54 ++-- 
.../db_connection/oracle/prerequisites.md | 20 +- .../connection/db_connection/oracle/read.md | 72 ++--- .../connection/db_connection/oracle/sql.md | 48 ++-- .../connection/db_connection/oracle/types.md | 146 +++++------ .../connection/db_connection/oracle/write.md | 24 +- .../db_connection/postgres/execute.md | 54 ++-- .../connection/db_connection/postgres/read.md | 68 ++--- .../connection/db_connection/postgres/sql.md | 48 ++-- .../db_connection/postgres/types.md | 204 +++++++------- .../db_connection/postgres/write.md | 24 +- .../spark_s3/troubleshooting.md | 248 +++++++++--------- 38 files changed, 1380 insertions(+), 1371 deletions(-) create mode 100644 mddocs/docs/_static/stylesheets/autodoc_pydantic.css diff --git a/mddocs/docs/_static/stylesheets/autodoc_pydantic.css b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css new file mode 100644 index 000000000..994a3e548 --- /dev/null +++ b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css @@ -0,0 +1,11 @@ +.autodoc_pydantic_validator_arrow { + padding-left: 8px; + } + +.autodoc_pydantic_collapsable_json { + cursor: pointer; + } + +.autodoc_pydantic_collapsable_erd { + cursor: pointer; + } \ No newline at end of file diff --git a/mddocs/docs/connection/db_connection/clickhouse/execute.md b/mddocs/docs/connection/db_connection/clickhouse/execute.md index 08ef695f3..482426437 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/execute.md +++ b/mddocs/docs/connection/db_connection/clickhouse/execute.md @@ -69,25 +69,25 @@ This method supports **any** query syntax supported by Clickhouse, like: #### Examples for `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-examples-for-clickhouse-execute } - ```python - from onetl.connection import Clickhouse - - clickhouse = Clickhouse(...) 
- - clickhouse.execute("DROP TABLE schema.table") - clickhouse.execute( - """ - CREATE TABLE schema.table ( - id UInt8, - key String, - value Float32 - ) - ENGINE = MergeTree() - ORDER BY id - """, - options=Clickhouse.ExecuteOptions(queryTimeout=10), +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) + +clickhouse.execute("DROP TABLE schema.table") +clickhouse.execute( + """ + CREATE TABLE schema.table ( + id UInt8, + key String, + value Float32 ) - ``` + ENGINE = MergeTree() + ORDER BY id + """, + options=Clickhouse.ExecuteOptions(queryTimeout=10), +) +``` ## Notes { #DBR-onetl-connection-db-connection-clickhouse-execute-notes } diff --git a/mddocs/docs/connection/db_connection/clickhouse/read.md b/mddocs/docs/connection/db_connection/clickhouse/read.md index ad82d8453..f42f94b8a 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/read.md +++ b/mddocs/docs/connection/db_connection/clickhouse/read.md @@ -24,44 +24,44 @@ but does not support custom queries, like `JOIN`. ### Snapshot strategy { #DBR-onetl-connection-db-connection-clickhouse-read-snapshot-strategy } - ```python - from onetl.connection import Clickhouse - from onetl.db import DBReader - - clickhouse = Clickhouse(...) - - reader = DBReader( - connection=clickhouse, - source="schema.table", - columns=["id", "key", "CAST(value AS String) value", "updated_dt"], - where="key = 'something'", - options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), - ) - df = reader.run() +```python +from onetl.connection import Clickhouse +from onetl.db import DBReader + +clickhouse = Clickhouse(...) 
- ``` +reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() + +``` ### Incremental strategy { #DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy } - ```python - from onetl.connection import Clickhouse - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - clickhouse = Clickhouse(...) - - reader = DBReader( - connection=clickhouse, - source="schema.table", - columns=["id", "key", "CAST(value AS String) value", "updated_dt"], - where="key = 'something'", - hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"), - options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` +```python +from onetl.connection import Clickhouse +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +clickhouse = Clickhouse(...) 
+ +reader = DBReader( + connection=clickhouse, + source="schema.table", + columns=["id", "key", "CAST(value AS String) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="clickhouse_hwm", expression="updated_dt"), + options=Clickhouse.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/clickhouse/sql.md b/mddocs/docs/connection/db_connection/clickhouse/sql.md index c893b5fed..3145a42c0 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/sql.md +++ b/mddocs/docs/connection/db_connection/clickhouse/sql.md @@ -21,30 +21,30 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-clickhouse-sql-examples } - ```python - from onetl.connection import Clickhouse - - clickhouse = Clickhouse(...) - df = clickhouse.sql( - """ - SELECT - id, - key, - CAST(value AS String) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """, - options=Clickhouse.SQLOptions( - partitionColumn="id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - ), - ) - ``` +```python +from onetl.connection import Clickhouse + +clickhouse = Clickhouse(...) 
+df = clickhouse.sql( + """ + SELECT + id, + key, + CAST(value AS String) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Clickhouse.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-clickhouse-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md index dba466228..c7cda310f 100644 --- a/mddocs/docs/connection/db_connection/greenplum/execute.md +++ b/mddocs/docs/connection/db_connection/greenplum/execute.md @@ -34,19 +34,19 @@ This method supports **any** query syntax supported by Greenplum, like: #### Examples for `Greenplum.fetch` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-fetch } - ```python - from onetl.connection import Greenplum +```python +from onetl.connection import Greenplum - greenplum = Greenplum(...) +greenplum = Greenplum(...) - df = greenplum.fetch( - "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Greenplum.FetchOptions(queryTimeout=10), - ) - greenplum.close() - value = df.collect()[0][0] # get value from first row and first column +df = greenplum.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Greenplum.FetchOptions(queryTimeout=10), +) +greenplum.close() +value = df.collect()[0][0] # get value from first row and first column - ``` +``` ### Use `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-use-greenplum-execute } @@ -71,24 +71,24 @@ This method supports **any** query syntax supported by Greenplum, like: #### Examples for `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-examples-for-greenplum-execute } - ```python - from onetl.connection import Greenplum - - greenplum = Greenplum(...) 
- - greenplum.execute("DROP TABLE schema.table") - greenplum.execute( - """ - CREATE TABLE schema.table ( - id int, - key text, - value real - ) - DISTRIBUTED BY id - """, - options=Greenplum.ExecuteOptions(queryTimeout=10), +```python +from onetl.connection import Greenplum + +greenplum = Greenplum(...) + +greenplum.execute("DROP TABLE schema.table") +greenplum.execute( + """ + CREATE TABLE schema.table ( + id int, + key text, + value real ) - ``` + DISTRIBUTED BY id + """, + options=Greenplum.ExecuteOptions(queryTimeout=10), +) +``` ## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-execute-interaction-schema } diff --git a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md index dfd6aa3ee..63ea99a79 100644 --- a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md +++ b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md @@ -115,18 +115,18 @@ Number of connections can be limited by 2 ways: - By limiting connection pool size user by Spark (**only** for Spark with `master=local`): - ```python - spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate() - - # No matter how many executors are started and how many cores they have, - # number of connections cannot exceed pool size: - Greenplum( - ..., - extra={ - "pool.maxSize": 10, - }, - ) - ``` +```python +spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate() + +# No matter how many executors are started and how many cores they have, +# number of connections cannot exceed pool size: +Greenplum( + ..., + extra={ + "pool.maxSize": 10, + }, +) +``` See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool) documentation. 
@@ -153,24 +153,24 @@ To read data from Greenplum using Spark, following ports should be opened in fir - Spark driver and all Spark executors -> port `5432` on Greenplum master node. - This port number should be set while connecting to Greenplum: + This port number should be set while connecting to Greenplum: - ```python - greenplum = Greenplum(host="master.host", port=5432, ...) - ``` + ```python + greenplum = Greenplum(host="master.host", port=5432, ...) + ``` - Greenplum segments -> some port range (e.g. `41000-42000`) **listened by Spark executors**. - This range should be set in `extra` option: + This range should be set in `extra` option: - ```python - greenplum = Greenplum( - ..., - extra={ - "server.port": "41000-42000", - }, - ) - ``` + ```python + greenplum = Greenplum( + ..., + extra={ + "server.port": "41000-42000", + }, + ) + ``` Number of ports in this range is `number of parallel running Spark sessions` * `number of parallel connections per session`. @@ -225,31 +225,31 @@ There are 2 ways to fix that: - Explicitly pass your host IP address to connector, like this - ```python - import os - - # pass here real host IP (accessible from GP segments) - os.environ["HOST_IP"] = "192.168.1.1" - - greenplum = Greenplum( - ..., - extra={ - # connector will read IP from this environment variable - "server.hostEnv": "env.HOST_IP", - }, - spark=spark, - ) - ``` + ```python + import os + + # pass here real host IP (accessible from GP segments) + os.environ["HOST_IP"] = "192.168.1.1" + + greenplum = Greenplum( + ..., + extra={ + # connector will read IP from this environment variable + "server.hostEnv": "env.HOST_IP", + }, + spark=spark, + ) + ``` More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.hostenv). 
- Update `/etc/hosts` file to include real host IP: - ```text - 127.0.0.1 localhost - # this IP should be accessible from GP segments - 192.168.1.1 driver-host-name - ``` + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 driver-host-name + ``` So Greenplum connector will properly resolve host IP. @@ -261,14 +261,14 @@ There are 3 ways to fix that: - Pass node hostname to `gpfdist` URL. So IP will be resolved on segment side: - ```python - greenplum = Greenplum( - ..., - extra={ - "server.useHostname": "true", - }, - ) - ``` + ```python + greenplum = Greenplum( + ..., + extra={ + "server.useHostname": "true", + }, + ) + ``` But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. @@ -276,30 +276,30 @@ There are 3 ways to fix that: - Set specific network interface to get IP address from: - ```python - greenplum = Greenplum( - ..., - extra={ - "server.nic": "eth0", - }, - ) - ``` + ```python + greenplum = Greenplum( + ..., + extra={ + "server.nic": "eth0", + }, + ) + ``` You can get list of network interfaces using this command. - !!! note +!!! note - This command should be executed on Hadoop cluster node, **not** Spark driver host! + This command should be executed on Hadoop cluster node, **not** Spark driver host! 
- ```bash - $ ip address - 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 - inet 127.0.0.1/8 scope host lo - valid_lft forever preferred_lft forever - 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 - inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 - valid_lft 83457sec preferred_lft 83457sec - ``` + ```bash + $ ip address + 1: lo: mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 + inet 127.0.0.1/8 scope host lo + valid_lft forever preferred_lft forever + 2: eth0: mtu 1500 qdisc fq_codel state UP group default qlen 1000 + inet 192.168.1.1/24 brd 192.168.1.255 scope global dynamic noprefixroute eth0 + valid_lft 83457sec preferred_lft 83457sec + ``` Note that in this case **each** Hadoop cluster node node should have network interface with name `eth0`. @@ -307,11 +307,11 @@ There are 3 ways to fix that: - Update `/etc/hosts` on each Hadoop cluster node to include real node IP: - ```text - 127.0.0.1 localhost - # this IP should be accessible from GP segments - 192.168.1.1 cluster-node-name - ``` + ```text + 127.0.0.1 localhost + # this IP should be accessible from GP segments + 192.168.1.1 cluster-node-name + ``` So Greenplum connector will properly resolve node IP. diff --git a/mddocs/docs/connection/db_connection/greenplum/read.md b/mddocs/docs/connection/db_connection/greenplum/read.md index 70ec1f7be..1c7501542 100644 --- a/mddocs/docs/connection/db_connection/greenplum/read.md +++ b/mddocs/docs/connection/db_connection/greenplum/read.md @@ -35,29 +35,29 @@ Data can be read from Greenplum to Spark using [DBReader][DBR-onetl-db-reader]. 
This is OK: ```python - DBReader( - columns=[ - "some_column", - # this cast is executed on Spark side - "CAST(another_column AS STRING)", - ], - # this predicate is parsed by Spark, and can be pushed down to Greenplum - where="some_column LIKE 'val1%'", - ) + DBReader( + columns=[ + "some_column", + # this cast is executed on Spark side + "CAST(another_column AS STRING)", + ], + # this predicate is parsed by Spark, and can be pushed down to Greenplum + where="some_column LIKE 'val1%'", + ) ``` This is will fail: ```python - DBReader( - columns=[ - "some_column", - # Spark does not have `text` type - "CAST(another_column AS text)", - ], - # Spark does not support ~ syntax for regexp matching - where="some_column ~ 'val1.*'", - ) + DBReader( + columns=[ + "some_column", + # Spark does not have `text` type + "CAST(another_column AS text)", + ], + # Spark does not support ~ syntax for regexp matching + where="some_column ~ 'val1.*'", + ) ``` ## Examples { #DBR-onetl-connection-db-connection-greenplum-read-examples } @@ -210,28 +210,28 @@ If view is used, it is recommended to include `gp_segment_id` column to this vie ??? note "Reading from view with gp_segment_id column" ```python - from onetl.connection import Greenplum - from onetl.db import DBReader - - greenplum = Greenplum(...) - - greenplum.execute( - """ - CREATE VIEW schema.view_with_gp_segment_id AS - SELECT - id, - some_column, - another_column, - gp_segment_id -- IMPORTANT - FROM schema.some_table - """, - ) - - reader = DBReader( - connection=greenplum, - source="schema.view_with_gp_segment_id", - ) - df = reader.run() + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) 
+ + greenplum.execute( + """ + CREATE VIEW schema.view_with_gp_segment_id AS + SELECT + id, + some_column, + another_column, + gp_segment_id -- IMPORTANT + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_gp_segment_id", + ) + df = reader.run() ``` #### Using custom `partition_column` { #DBR-onetl-connection-db-connection-greenplum-read-using-custom-partition-column } @@ -244,33 +244,33 @@ In this case, custom column can be used instead: ??? note "Reading from view with custom partition_column" ```python - from onetl.connection import Greenplum - from onetl.db import DBReader - - greenplum = Greenplum(...) - - greenplum.execute( - """ - CREATE VIEW schema.view_with_partition_column AS - SELECT - id, - some_column, - part_column -- correlated to greenplum segment ID - FROM schema.some_table - """, - ) - - reader = DBReader( - connection=greenplum, - source="schema.view_with_partition_column", - options=Greenplum.ReadOptions( - # parallelize data using specified column - partitionColumn="part_column", - # create 10 Spark tasks, each will read only part of table data - partitions=10, - ), - ) - df = reader.run() + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) 
+ + greenplum.execute( + """ + CREATE VIEW schema.view_with_partition_column AS + SELECT + id, + some_column, + part_column -- correlated to greenplum segment ID + FROM schema.some_table + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.view_with_partition_column", + options=Greenplum.ReadOptions( + # parallelize data using specified column + partitionColumn="part_column", + # create 10 Spark tasks, each will read only part of table data + partitions=10, + ), + ) + df = reader.run() ``` #### Reading `DISTRIBUTED REPLICATED` tables { #DBR-onetl-connection-db-connection-greenplum-read-reading-distributed-replicated-tables } @@ -290,42 +290,42 @@ Instead is recommended to run `JOIN` query on Greenplum side, save the result to ??? note "Reading from view using intermediate table" ```python - from onetl.connection import Greenplum - from onetl.db import DBReader - - greenplum = Greenplum(...) - - greenplum.execute( - """ - CREATE UNLOGGED TABLE schema.intermediate_table AS - SELECT - id, - tbl1.col1, - tbl1.data, - tbl2.another_data - FROM - schema.table1 as tbl1 - JOIN - schema.table2 as tbl2 - ON - tbl1.col1 = tbl2.col2 - WHERE ... - """, - ) - - reader = DBReader( - connection=greenplum, - source="schema.intermediate_table", - ) - df = reader.run() - - # write dataframe somethere - - greenplum.execute( - """ - DROP TABLE schema.intermediate_table - """, - ) + from onetl.connection import Greenplum + from onetl.db import DBReader + + greenplum = Greenplum(...) + + greenplum.execute( + """ + CREATE UNLOGGED TABLE schema.intermediate_table AS + SELECT + id, + tbl1.col1, + tbl1.data, + tbl2.another_data + FROM + schema.table1 as tbl1 + JOIN + schema.table2 as tbl2 + ON + tbl1.col1 = tbl2.col2 + WHERE ... + """, + ) + + reader = DBReader( + connection=greenplum, + source="schema.intermediate_table", + ) + df = reader.run() + + # write dataframe somethere + + greenplum.execute( + """ + DROP TABLE schema.intermediate_table + """, + ) ``` !!! 
warning @@ -333,10 +333,10 @@ Instead is recommended to run `JOIN` query on Greenplum side, save the result to **NEVER** do that: ```python - df1 = DBReader(connection=greenplum, target="public.table1", ...).run() - df2 = DBReader(connection=greenplum, target="public.table2", ...).run() + df1 = DBReader(connection=greenplum, target="public.table1", ...).run() + df2 = DBReader(connection=greenplum, target="public.table2", ...).run() - joined_df = df1.join(df2, on="col") + joined_df = df1.join(df2, on="col") ``` This will lead to sending all the data from both `table1` and `table2` to Spark executor memory, and then `JOIN` diff --git a/mddocs/docs/connection/db_connection/greenplum/write.md b/mddocs/docs/connection/db_connection/greenplum/write.md index a59992e57..c2536f4cd 100644 --- a/mddocs/docs/connection/db_connection/greenplum/write.md +++ b/mddocs/docs/connection/db_connection/greenplum/write.md @@ -15,27 +15,27 @@ For writing data to Greenplum, use [DBWriter][DBR-onetl-db-writer] with [Greenpl ## Examples { #DBR-onetl-connection-db-connection-greenplum-write-examples } - ```python - from onetl.connection import Greenplum - from onetl.db import DBWriter - - greenplum = Greenplum(...) - - df = ... # data is here - - writer = DBWriter( - connection=greenplum, - target="schema.table", - options=Greenplum.WriteOptions( - if_exists="append", - # by default distribution is random - distributedBy="id", - # partitionBy is not supported - ), - ) - - writer.run(df) - ``` +```python +from onetl.connection import Greenplum +from onetl.db import DBWriter + +greenplum = Greenplum(...) + +df = ... 
# data is here + +writer = DBWriter( + connection=greenplum, + target="schema.table", + options=Greenplum.WriteOptions( + if_exists="append", + # by default distribution is random + distributedBy="id", + # partitionBy is not supported + ), +) + +writer.run(df) +``` ## Interaction schema { #DBR-onetl-connection-db-connection-greenplum-write-interaction-schema } diff --git a/mddocs/docs/connection/db_connection/hive/execute.md b/mddocs/docs/connection/db_connection/hive/execute.md index e924201f8..d2b98c158 100644 --- a/mddocs/docs/connection/db_connection/hive/execute.md +++ b/mddocs/docs/connection/db_connection/hive/execute.md @@ -21,24 +21,24 @@ This method supports **any** query syntax supported by Hive, like: ## Examples { #DBR-onetl-connection-db-connection-hive-execute-examples } - ```python - from onetl.connection import Hive - - hive = Hive(...) - - hive.execute("DROP TABLE schema.table") - hive.execute( - """ - CREATE TABLE schema.table ( - id NUMBER, - key VARCHAR, - value DOUBLE - ) - PARTITION BY (business_date DATE) - STORED AS orc - """ - ) - ``` +```python +from onetl.connection import Hive + +hive = Hive(...) + +hive.execute("DROP TABLE schema.table") +hive.execute( + """ + CREATE TABLE schema.table ( + id NUMBER, + key VARCHAR, + value DOUBLE + ) + PARTITION BY (business_date DATE) + STORED AS orc + """ +) +``` ### Details { #DBR-onetl-connection-db-connection-hive-execute-details } diff --git a/mddocs/docs/connection/db_connection/hive/prerequisites.md b/mddocs/docs/connection/db_connection/hive/prerequisites.md index bd4ac2e4a..8af35b609 100644 --- a/mddocs/docs/connection/db_connection/hive/prerequisites.md +++ b/mddocs/docs/connection/db_connection/hive/prerequisites.md @@ -31,16 +31,16 @@ See [installation instruction][DBR-onetl-install-spark] for more details. 
Create `$SPARK_CONF_DIR/hive-site.xml` with Hive Metastore URL: - ```xml - - - - - hive.metastore.uris - thrift://metastore.host.name:9083 - - - ``` +```xml + + + + + hive.metastore.uris + thrift://metastore.host.name:9083 + + +``` Create `$SPARK_CONF_DIR/core-site.xml` with warehouse location ,e.g. HDFS IPC port of Hadoop namenode, or S3 bucket address & credentials: diff --git a/mddocs/docs/connection/db_connection/hive/read.md b/mddocs/docs/connection/db_connection/hive/read.md index b20fd4d63..0bff9278a 100644 --- a/mddocs/docs/connection/db_connection/hive/read.md +++ b/mddocs/docs/connection/db_connection/hive/read.md @@ -24,41 +24,41 @@ but does not support custom queries, like `JOIN`. Snapshot strategy: - ```python - from onetl.connection import Hive - from onetl.db import DBReader +```python +from onetl.connection import Hive +from onetl.db import DBReader - hive = Hive(...) +hive = Hive(...) - reader = DBReader( - connection=hive, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - ) - df = reader.run() - ``` +reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import Hive - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy +```python +from onetl.connection import Hive +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy - hive = Hive(...) +hive = Hive(...) 
- reader = DBReader( - connection=hive, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - hwm=DBReader.AutoDetectHWM(name="hive_hwm", expression="updated_dt"), - ) +reader = DBReader( + connection=hive, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="hive_hwm", expression="updated_dt"), +) - with IncrementalStrategy(): - df = reader.run() - ``` +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-hive-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/hive/sql.md b/mddocs/docs/connection/db_connection/hive/sql.md index 296574345..0bc0d2ca9 100644 --- a/mddocs/docs/connection/db_connection/hive/sql.md +++ b/mddocs/docs/connection/db_connection/hive/sql.md @@ -16,24 +16,24 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-hive-sql-examples } - ```python - from onetl.connection import Hive - - hive = Hive(...) - df = hive.sql( - """ - SELECT - id, - key, - CAST(value AS text) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """ - ) - ``` +```python +from onetl.connection import Hive + +hive = Hive(...) 
+df = hive.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """ +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-hive-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/mongodb/prerequisites.md b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md index 84a151a12..fb74cb9c2 100644 --- a/mddocs/docs/connection/db_connection/mongodb/prerequisites.md +++ b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md @@ -25,19 +25,19 @@ It is possible to connect to MongoDB host by using either DNS name of host or it It is also possible to connect to MongoDB shared cluster: - ```python - mongo = MongoDB( - host="master.host.or.ip", - user="user", - password="*****", - database="target_database", - spark=spark, - extra={ - # read data from secondary cluster node, switch to primary if not available - "readPreference": "secondaryPreferred", - }, - ) - ``` +```python +mongo = MongoDB( + host="master.host.or.ip", + user="user", + password="*****", + database="target_database", + spark=spark, + extra={ + # read data from secondary cluster node, switch to primary if not available + "readPreference": "secondaryPreferred", + }, +) +``` Supported `readPreference` values are described in [official documentation](https://www.mongodb.com/docs/manual/core/read-preference/). diff --git a/mddocs/docs/connection/db_connection/mongodb/read.md b/mddocs/docs/connection/db_connection/mongodb/read.md index fba78334b..9d5bb4df4 100644 --- a/mddocs/docs/connection/db_connection/mongodb/read.md +++ b/mddocs/docs/connection/db_connection/mongodb/read.md @@ -24,95 +24,95 @@ Snapshot strategy: - ```python - from onetl.connection import MongoDB - from onetl.db import DBReader - - from pyspark.sql.types import ( - StructType, - StructField, - IntegerType, - StringType, - TimestampType, - ) - - mongodb = MongoDB(...) 
- - # mandatory - df_schema = StructType( - [ - StructField("_id", StringType()), - StructField("some", StringType()), - StructField( - "field", - StructType( - [ - StructField("nested", IntegerType()), - ], - ), +```python +from onetl.connection import MongoDB +from onetl.db import DBReader + +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, +) + +mongodb = MongoDB(...) + +# mandatory +df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], ), - StructField("updated_dt", TimestampType()), - ] - ) - - reader = DBReader( - connection=mongodb, - source="some_collection", - df_schema=df_schema, - where={"field": {"$eq": 123}}, - hint={"field": 1}, - options=MongoDBReadOptions(batchSize=10000), - ) - df = reader.run() - ``` + ), + StructField("updated_dt", TimestampType()), + ] +) + +reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + options=MongoDBReadOptions(batchSize=10000), +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import MongoDB - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - from pyspark.sql.types import ( - StructType, - StructField, - IntegerType, - StringType, - TimestampType, - ) - - mongodb = MongoDB(...) - - # mandatory - df_schema = StructType( - [ - StructField("_id", StringType()), - StructField("some", StringType()), - StructField( - "field", - StructType( - [ - StructField("nested", IntegerType()), - ], - ), +```python +from onetl.connection import MongoDB +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, +) + +mongodb = MongoDB(...) 
+ +# mandatory +df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ], ), - StructField("updated_dt", TimestampType()), - ] - ) - - reader = DBReader( - connection=mongodb, - source="some_collection", - df_schema=df_schema, - where={"field": {"$eq": 123}}, - hint={"field": 1}, - hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"), - options=MongoDBReadOptions(batchSize=10000), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` + ), + StructField("updated_dt", TimestampType()), + ] +) + +reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + where={"field": {"$eq": 123}}, + hint={"field": 1}, + hwm=DBReader.AutoDetectHWM(name="mongodb_hwm", expression="updated_dt"), + options=MongoDBReadOptions(batchSize=10000), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-mongodb-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/mongodb/types.md b/mddocs/docs/connection/db_connection/mongodb/types.md index 92e43ffb8..7e5cd219f 100644 --- a/mddocs/docs/connection/db_connection/mongodb/types.md +++ b/mddocs/docs/connection/db_connection/mongodb/types.md @@ -15,54 +15,54 @@ MongoDB is, by design, \_\_schemaless\_\_. So there are 2 ways how this can be h ??? note "See example" ```python - from onetl.connection import MongoDB - from onetl.db import DBReader - - from pyspark.sql.types import ( - StructType, - StructField, - IntegerType, - StringType, - TimestampType, - ) - - mongodb = MongoDB(...) 
- - df_schema = StructType( - [ - StructField("_id", StringType()), - StructField("some", StringType()), - StructField( - "field", - StructType( - [ - StructField("nested", IntegerType()), - ] - ), + from onetl.connection import MongoDB + from onetl.db import DBReader + + from pyspark.sql.types import ( + StructType, + StructField, + IntegerType, + StringType, + TimestampType, + ) + + mongodb = MongoDB(...) + + df_schema = StructType( + [ + StructField("_id", StringType()), + StructField("some", StringType()), + StructField( + "field", + StructType( + [ + StructField("nested", IntegerType()), + ] ), - ] - ) - - reader = DBReader( - connection=mongodb, - source="some_collection", - df_schema=df_schema, - ) - df = reader.run() - - # or - - df = mongodb.pipeline( - collection="some_collection", - df_schema=df_schema, - ) + ), + ] + ) + + reader = DBReader( + connection=mongodb, + source="some_collection", + df_schema=df_schema, + ) + df = reader.run() + + # or + + df = mongodb.pipeline( + collection="some_collection", + df_schema=df_schema, + ) ``` - Rely on MongoDB connector schema infer: - ```python - df = mongodb.pipeline(collection="some_collection") - ``` + ```python + df = mongodb.pipeline(collection="some_collection") + ``` In this case MongoDB connector read a sample of collection documents, and build DataFrame schema based on document fields and values. diff --git a/mddocs/docs/connection/db_connection/mongodb/write.md b/mddocs/docs/connection/db_connection/mongodb/write.md index 8bdf8ac10..ae717ada6 100644 --- a/mddocs/docs/connection/db_connection/mongodb/write.md +++ b/mddocs/docs/connection/db_connection/mongodb/write.md @@ -8,24 +8,24 @@ For writing data to MongoDB, use [DBWriter][DBR-onetl-db-writer]. 
## Examples { #DBR-onetl-connection-db-connection-mongodb-write-examples } - ```python - from onetl.connection import MongoDB - from onetl.db import DBWriter +```python +from onetl.connection import MongoDB +from onetl.db import DBWriter - mongodb = MongoDB(...) +mongodb = MongoDB(...) - df = ... # data is here +df = ... # data is here - writer = DBWriter( - connection=mongodb, - target="schema.table", - options=MongoDB.WriteOptions( - if_exists="append", - ), - ) +writer = DBWriter( + connection=mongodb, + target="schema.table", + options=MongoDB.WriteOptions( + if_exists="append", + ), +) - writer.run(df) - ``` +writer.run(df) +``` ## Write options { #DBR-onetl-connection-db-connection-mongodb-write-options } diff --git a/mddocs/docs/connection/db_connection/mssql/execute.md b/mddocs/docs/connection/db_connection/mssql/execute.md index f31a8f930..c4b8c0ba2 100644 --- a/mddocs/docs/connection/db_connection/mssql/execute.md +++ b/mddocs/docs/connection/db_connection/mssql/execute.md @@ -33,18 +33,18 @@ This method supports **any** query syntax supported by MSSQL, like: #### Examples for `MSSQL.fetch` { #DBR-onetl-connection-db-connection-mssql-execute-examples-for-mssql-fetch } - ```python - from onetl.connection import MSSQL +```python +from onetl.connection import MSSQL - mssql = MSSQL(...) +mssql = MSSQL(...) 
- df = mssql.fetch( - "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MSSQL.FetchOptions(queryTimeout=10), - ) - mssql.close() - value = df.collect()[0][0] # get value from first row and first column - ``` +df = mssql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MSSQL.FetchOptions(queryTimeout=10), +) +mssql.close() +value = df.collect()[0][0] # get value from first row and first column +``` ### Use `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-use-mssql-execute } @@ -69,23 +69,23 @@ This method supports **any** query syntax supported by MSSQL, like: #### Examples for `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-examples-for-mssql-execute } - ```python - from onetl.connection import MSSQL - - mssql = MSSQL(...) - - mssql.execute("DROP TABLE schema.table") - mssql.execute( - """ - CREATE TABLE schema.table ( - id bigint GENERATED ALWAYS AS IDENTITY, - key VARCHAR2(4000), - value NUMBER - ) - """, - options=MSSQL.ExecuteOptions(queryTimeout=10), - ) - ``` +```python +from onetl.connection import MSSQL + +mssql = MSSQL(...) + +mssql.execute("DROP TABLE schema.table") +mssql.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=MSSQL.ExecuteOptions(queryTimeout=10), +) +``` ## Options { #DBR-onetl-connection-db-connection-mssql-execute-options } diff --git a/mddocs/docs/connection/db_connection/mssql/read.md b/mddocs/docs/connection/db_connection/mssql/read.md index 022c00746..6dd48d4aa 100644 --- a/mddocs/docs/connection/db_connection/mssql/read.md +++ b/mddocs/docs/connection/db_connection/mssql/read.md @@ -23,45 +23,43 @@ Snapshot strategy: - ```python - from onetl.connection import MSSQL - from onetl.db import DBReader - - mssql = MSSQL(...) 
- - reader = DBReader( - connection=mssql, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), - ) - df = reader.run() - - . - ``` +```python +from onetl.connection import MSSQL +from onetl.db import DBReader + +mssql = MSSQL(...) + +reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import MSSQL - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - mssql = MSSQL(...) - - reader = DBReader( - connection=mssql, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), - options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` +```python +from onetl.connection import MSSQL +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +mssql = MSSQL(...) 
+ +reader = DBReader( + connection=mssql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="mssql_hwm", expression="updated_dt"), + options=MSSQL.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-mssql-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/mssql/sql.md b/mddocs/docs/connection/db_connection/mssql/sql.md index 9776a59d1..37a07488e 100644 --- a/mddocs/docs/connection/db_connection/mssql/sql.md +++ b/mddocs/docs/connection/db_connection/mssql/sql.md @@ -21,30 +21,30 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-mssql-sql-examples } - ```python - from onetl.connection import MSSQL - - mssql = MSSQL(...) - df = mssql.sql( - """ - SELECT - id, - key, - CAST(value AS text) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """, - options=MSSQL.SQLOptions( - partitionColumn="id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - ), - ) - ``` +```python +from onetl.connection import MSSQL + +mssql = MSSQL(...) 
+df = mssql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MSSQL.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-mssql-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/mssql/types.md b/mddocs/docs/connection/db_connection/mssql/types.md index 62fd6441f..2c9040f65 100644 --- a/mddocs/docs/connection/db_connection/mssql/types.md +++ b/mddocs/docs/connection/db_connection/mssql/types.md @@ -184,77 +184,77 @@ It is possible to explicitly cast column type using `DBReader(columns=...)` synt For example, you can use `CAST(column AS text)` to convert data to string representation on MSSQL side, and so it will be read as Spark's `StringType()`: - ```python - from onetl.connection import MSSQL - from onetl.db import DBReader - - mssql = MSSQL(...) - - DBReader( - connection=mssql, - columns=[ - "id", - "supported_column", - "CAST(unsupported_column AS text) unsupported_column_str", - ], - ) - df = reader.run() - - # cast column content to proper Spark type - df = df.select( - df.id, - df.supported_column, - # explicit cast - df.unsupported_column_str.cast("integer").alias("parsed_integer"), - ) - ``` +```python +from onetl.connection import MSSQL +from onetl.db import DBReader + +mssql = MSSQL(...) 
+ +DBReader( + connection=mssql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + ], +) +df = reader.run() + +# cast column content to proper Spark type +df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), +) +``` ### `DBWriter` { #DBR-onetl-connection-db-connection-mssql-types-dbwriter } Convert dataframe column to JSON using [to_json](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.functions.to_json.html), and write it as `text` column in MSSQL: - ```python - mssql.execute( - """ - CREATE TABLE schema.target_tbl ( - id bigint, - struct_column_json text -- any string type, actually - ) - """, - ) +```python +mssql.execute( + """ + CREATE TABLE schema.target_tbl ( + id bigint, + struct_column_json text -- any string type, actually + ) + """, +) - from pyspark.sql.functions import to_json +from pyspark.sql.functions import to_json - df = df.select( - df.id, - to_json(df.struct_column).alias("struct_column_json"), - ) +df = df.select( + df.id, + to_json(df.struct_column).alias("struct_column_json"), +) - writer.run(df) - ``` +writer.run(df) +``` Then you can parse this column on MSSQL side - for example, by creating a view: - ```sql - SELECT - id, - JSON_VALUE(struct_column_json, "$.nested.field") AS nested_field - FROM target_tbl - ``` +```sql +SELECT + id, + JSON_VALUE(struct_column_json, "$.nested.field") AS nested_field +FROM target_tbl +``` Or by using [computed column](https://learn.microsoft.com/en-us/sql/relational-databases/tables/specify-computed-columns-in-a-table): - ```sql - CREATE TABLE schema.target_table ( - id bigint, - supported_column datetime2(6), - struct_column_json text, -- any string type, actually - -- computed column - nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) - -- or persisted column - -- nested_field AS (JSON_VALUE(struct_column_json, 
"$.nested.field")) PERSISTED - ) - ``` +```sql +CREATE TABLE schema.target_table ( + id bigint, + supported_column datetime2(6), + struct_column_json text, -- any string type, actually + -- computed column + nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) + -- or persisted column + -- nested_field AS (JSON_VALUE(struct_column_json, "$.nested.field")) PERSISTED +) +``` By default, column value is calculated on every table read. Column marked as `PERSISTED` is calculated during insert, but this require additional space. diff --git a/mddocs/docs/connection/db_connection/mssql/write.md b/mddocs/docs/connection/db_connection/mssql/write.md index deaf554c8..d38e1b621 100644 --- a/mddocs/docs/connection/db_connection/mssql/write.md +++ b/mddocs/docs/connection/db_connection/mssql/write.md @@ -16,22 +16,22 @@ For writing data to MSSQL, use [DBWriter][DBR-onetl-db-writer]. ## Examples { #DBR-onetl-connection-db-connection-mssql-write-examples } - ```python - from onetl.connection import MSSQL - from onetl.db import DBWriter +```python +from onetl.connection import MSSQL +from onetl.db import DBWriter - mssql = MSSQL(...) +mssql = MSSQL(...) - df = ... # data is here +df = ... 
# data is here - writer = DBWriter( - connection=mssql, - target="schema.table", - options=MSSQL.WriteOptions(if_exists="append"), - ) +writer = DBWriter( + connection=mssql, + target="schema.table", + options=MSSQL.WriteOptions(if_exists="append"), +) - writer.run(df) - ``` +writer.run(df) +``` ## Options { #DBR-onetl-connection-db-connection-mssql-write-options } diff --git a/mddocs/docs/connection/db_connection/mysql/execute.md b/mddocs/docs/connection/db_connection/mysql/execute.md index 663dbdeb0..b6d96169f 100644 --- a/mddocs/docs/connection/db_connection/mysql/execute.md +++ b/mddocs/docs/connection/db_connection/mysql/execute.md @@ -34,18 +34,18 @@ This method supports **any** query syntax supported by MySQL, like: #### Examples in `MySQL.fetch` { #DBR-onetl-connection-db-connection-mysql-execute-examples-in-mysql-fetch } - ```python - from onetl.connection import MySQL +```python +from onetl.connection import MySQL - mysql = MySQL(...) +mysql = MySQL(...) - df = mysql.fetch( - "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=MySQL.FetchOptions(queryTimeout=10), - ) - mysql.close() - value = df.collect()[0][0] # get value from first row and first column - ``` +df = mysql.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=MySQL.FetchOptions(queryTimeout=10), +) +mysql.close() +value = df.collect()[0][0] # get value from first row and first column +``` ### Use `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-use-mysql-execute } @@ -69,24 +69,24 @@ This method supports **any** query syntax supported by MySQL, like: #### Examples for `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-examples-for-mysql-execute } - ```python - from onetl.connection import MySQL - - mysql = MySQL(...) 
- - mysql.execute("DROP TABLE schema.table") - mysql.execute( - """ - CREATE TABLE schema.table ( - id bigint, - key text, - value float - ) - ENGINE = InnoDB - """, - options=MySQL.ExecuteOptions(queryTimeout=10), - ) - ``` +```python +from onetl.connection import MySQL + +mysql = MySQL(...) + +mysql.execute("DROP TABLE schema.table") +mysql.execute( + """ + CREATE TABLE schema.table ( + id bigint, + key text, + value float + ) + ENGINE = InnoDB + """, + options=MySQL.ExecuteOptions(queryTimeout=10), +) +``` ## Options { #DBR-onetl-connection-db-connection-mysql-execute-options } diff --git a/mddocs/docs/connection/db_connection/mysql/read.md b/mddocs/docs/connection/db_connection/mysql/read.md index ac5aca318..cc1d36b10 100644 --- a/mddocs/docs/connection/db_connection/mysql/read.md +++ b/mddocs/docs/connection/db_connection/mysql/read.md @@ -23,45 +23,45 @@ Snapshot strategy: - ```python - from onetl.connection import MySQL - from onetl.db import DBReader - - mysql = MySQL(...) - - reader = DBReader( - connection=mysql, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - hint="SKIP_SCAN(schema.table key_index)", - options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10), - ) - df = reader.run() - ``` +```python +from onetl.connection import MySQL +from onetl.db import DBReader + +mysql = MySQL(...) + +reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import MySQL - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - mysql = MySQL(...) 
- - reader = DBReader( - connection=mysql, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - hint="SKIP_SCAN(schema.table key_index)", - hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"), - options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` +```python +from onetl.connection import MySQL +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +mysql = MySQL(...) + +reader = DBReader( + connection=mysql, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hint="SKIP_SCAN(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="mysql_hwm", expression="updated_dt"), + options=MySQL.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-mysql-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/mysql/sql.md b/mddocs/docs/connection/db_connection/mysql/sql.md index 81baff3e4..beb6417b8 100644 --- a/mddocs/docs/connection/db_connection/mysql/sql.md +++ b/mddocs/docs/connection/db_connection/mysql/sql.md @@ -21,30 +21,30 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-mysql-sql-examples } - ```python - from onetl.connection import MySQL - - mysql = MySQL(...) - df = mysql.sql( - """ - SELECT - id, - key, - CAST(value AS text) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """, - options=MySQL.SQLOptions( - partitionColumn="id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - ), - ) - ``` +```python +from onetl.connection import MySQL + +mysql = MySQL(...) 
+df = mysql.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=MySQL.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-mysql-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/mysql/types.md b/mddocs/docs/connection/db_connection/mysql/types.md index a74e7def5..d5502caca 100644 --- a/mddocs/docs/connection/db_connection/mysql/types.md +++ b/mddocs/docs/connection/db_connection/mysql/types.md @@ -179,87 +179,87 @@ For example, you can use `CAST(column AS text)` to convert data to string repres It is also possible to use [JSON_OBJECT](https://dev.mysql.com/doc/refman/en/json.html) MySQL function and parse JSON columns in MySQL with the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. - ```python - from pyspark.sql.types import IntegerType, StructType, StructField - - from onetl.connection import MySQL - from onetl.db import DBReader - from onetl.file.format import JSON - - mysql = MySQL(...) - - DBReader( - connection=mysql, - columns=[ - "id", - "supported_column", - "CAST(unsupported_column AS text) unsupported_column_str", - # or - "JSON_OBJECT('key', value_column) json_column", - ], - ) - df = reader.run() - - json_scheme = StructType([StructField("key", IntegerType())]) - - df = df.select( - df.id, - df.supported_column, - # explicit cast - df.unsupported_column_str.cast("integer").alias("parsed_integer"), - JSON().parse_column("json_column", json_scheme).alias("struct_column"), - ) - ``` +```python +from pyspark.sql.types import IntegerType, StructType, StructField + +from onetl.connection import MySQL +from onetl.db import DBReader +from onetl.file.format import JSON + +mysql = MySQL(...) 
+ +DBReader( + connection=mysql, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "JSON_OBJECT('key', value_column) json_column", + ], +) +df = reader.run() + +json_scheme = StructType([StructField("key", IntegerType())]) + +df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("json_column", json_scheme).alias("struct_column"), +) +``` ### `DBWriter` { #DBR-onetl-connection-db-connection-mysql-types-dbwriter } To write JSON data to a `json` or `text` column in a MySQL table, use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. - ```python - from onetl.connection import MySQL - from onetl.db import DBWriter - from onetl.file.format import JSON - - mysql.execute( - """ - CREATE TABLE schema.target_tbl ( - id bigint, - array_column_json json -- any string type, actually - ) - ENGINE = InnoDB - """, - ) +```python +from onetl.connection import MySQL +from onetl.db import DBWriter +from onetl.file.format import JSON - df = df.select( - df.id, - JSON().serialize_column(df.array_column).alias("array_column_json"), - ) +mysql.execute( + """ + CREATE TABLE schema.target_tbl ( + id bigint, + array_column_json json -- any string type, actually + ) + ENGINE = InnoDB + """, +) - writer.run(df) - ``` +df = df.select( + df.id, + JSON().serialize_column(df.array_column).alias("array_column_json"), +) + +writer.run(df) +``` Then you can parse this column on MySQL side - for example, by creating a view: - ```sql - SELECT - id, - array_column_json->"$[0]" AS array_item - FROM target_tbl - ``` +```sql +SELECT + id, + array_column_json->"$[0]" AS array_item +FROM target_tbl +``` Or by using [GENERATED column](https://dev.mysql.com/doc/refman/en/create-table-generated-columns.html): - ```sql - CREATE TABLE schema.target_table ( - id bigint, - supported_column timestamp, - array_column_json 
json, -- any string type, actually - -- virtual column - array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) VIRTUAL - -- or stired column - -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) STORED - ) - ``` +```sql +CREATE TABLE schema.target_table ( + id bigint, + supported_column timestamp, + array_column_json json, -- any string type, actually + -- virtual column + array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) VIRTUAL + -- or stired column + -- array_item_0 GENERATED ALWAYS AS (array_column_json->"$[0]")) STORED +) +``` `VIRTUAL` column value is calculated on every table read. `STORED` column value is calculated during insert, but this require additional space. diff --git a/mddocs/docs/connection/db_connection/mysql/write.md b/mddocs/docs/connection/db_connection/mysql/write.md index 07d767f4e..9031356ad 100644 --- a/mddocs/docs/connection/db_connection/mysql/write.md +++ b/mddocs/docs/connection/db_connection/mysql/write.md @@ -14,26 +14,26 @@ For writing data to MySQL, use [DBWriter][DBR-onetl-db-writer]. ## Examples { #DBR-onetl-connection-db-connection-mysql-write-examples } - ```python - from onetl.connection import MySQL - from onetl.db import DBWriter - - mysql = MySQL(...) - - df = ... # data is here - - writer = DBWriter( - connection=mysql, - target="schema.table", - options=MySQL.WriteOptions( - if_exists="append", - # ENGINE is required by MySQL - createTableOptions="ENGINE = MergeTree() ORDER BY id", - ), - ) - - writer.run(df) - ``` +```python +from onetl.connection import MySQL +from onetl.db import DBWriter + +mysql = MySQL(...) + +df = ... 
# data is here + +writer = DBWriter( + connection=mysql, + target="schema.table", + options=MySQL.WriteOptions( + if_exists="append", + # ENGINE is required by MySQL + createTableOptions="ENGINE = MergeTree() ORDER BY id", + ), +) + +writer.run(df) +``` ## Options { #DBR-onetl-connection-db-connection-mysql-write-options } diff --git a/mddocs/docs/connection/db_connection/oracle/execute.md b/mddocs/docs/connection/db_connection/oracle/execute.md index e871a7f92..d2d8413c4 100644 --- a/mddocs/docs/connection/db_connection/oracle/execute.md +++ b/mddocs/docs/connection/db_connection/oracle/execute.md @@ -35,18 +35,18 @@ This method supports **any** query syntax supported by Oracle, like: #### Examples for `Oracle.fetch` { #DBR-onetl-connection-db-connection-oracle-execute-examples-for-oracle-fetch } - ```python - from onetl.connection import Oracle +```python +from onetl.connection import Oracle - oracle = Oracle(...) +oracle = Oracle(...) - df = oracle.fetch( - "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Oracle.FetchOptions(queryTimeout=10), - ) - oracle.close() - value = df.collect()[0][0] # get value from first row and first column - ``` +df = oracle.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Oracle.FetchOptions(queryTimeout=10), +) +oracle.close() +value = df.collect()[0][0] # get value from first row and first column +``` ### Use `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-use-oracle-execute } @@ -71,23 +71,23 @@ This method supports **any** query syntax supported by Oracle, like: #### Examples for `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-examples-for-oracle-execute } - ```python - from onetl.connection import Oracle - - oracle = Oracle(...) 
- - oracle.execute("DROP TABLE schema.table") - oracle.execute( - """ - CREATE TABLE schema.table ( - id bigint GENERATED ALWAYS AS IDENTITY, - key VARCHAR2(4000), - value NUMBER - ) - """, - options=Oracle.ExecuteOptions(queryTimeout=10), - ) - ``` +```python +from onetl.connection import Oracle + +oracle = Oracle(...) + +oracle.execute("DROP TABLE schema.table") +oracle.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key VARCHAR2(4000), + value NUMBER + ) + """, + options=Oracle.ExecuteOptions(queryTimeout=10), +) +``` ## Options { #DBR-onetl-connection-db-connection-oracle-execute-options } diff --git a/mddocs/docs/connection/db_connection/oracle/prerequisites.md b/mddocs/docs/connection/db_connection/oracle/prerequisites.md index 3b634fb31..dd712356d 100644 --- a/mddocs/docs/connection/db_connection/oracle/prerequisites.md +++ b/mddocs/docs/connection/db_connection/oracle/prerequisites.md @@ -37,19 +37,19 @@ It is possible to connect to database as another user without knowing this user This can be enabled by granting user a special `CONNECT THROUGH` permission: - ```sql - ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; - ``` +```sql +ALTER USER schema_owner GRANT CONNECT THROUGH proxy_user; +``` Then you can connect to Oracle using credentials of `proxy_user` but specify that you need permissions of `schema_owner`: - ```python - oracle = Oracle( - ..., - user="proxy_user[schema_owner]", - password="proxy_user password", - ) - ``` +```python +oracle = Oracle( + ..., + user="proxy_user[schema_owner]", + password="proxy_user password", +) +``` See [official documentation](https://oracle-base.com/articles/misc/proxy-users-and-connect-through). 
diff --git a/mddocs/docs/connection/db_connection/oracle/read.md b/mddocs/docs/connection/db_connection/oracle/read.md index 410a33c4e..ddcdfc9eb 100644 --- a/mddocs/docs/connection/db_connection/oracle/read.md +++ b/mddocs/docs/connection/db_connection/oracle/read.md @@ -23,45 +23,45 @@ Snapshot strategy: - ```python - from onetl.connection import Oracle - from onetl.db import DBReader - - oracle = Oracle(...) - - reader = DBReader( - connection=oracle, - source="schema.table", - columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], - where="key = 'something'", - hint="INDEX(schema.table key_index)", - options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), - ) - df = reader.run() - ``` +```python +from onetl.connection import Oracle +from onetl.db import DBReader + +oracle = Oracle(...) + +reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import Oracle - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - oracle = Oracle(...) - - reader = DBReader( - connection=oracle, - source="schema.table", - columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], - where="key = 'something'", - hint="INDEX(schema.table key_index)", - hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), - options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` +```python +from onetl.connection import Oracle +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +oracle = Oracle(...) 
+ +reader = DBReader( + connection=oracle, + source="schema.table", + columns=["id", "key", "CAST(value AS VARCHAR2(4000)) value", "updated_dt"], + where="key = 'something'", + hint="INDEX(schema.table key_index)", + hwm=DBReader.AutoDetectHWM(name="oracle_hwm", expression="updated_dt"), + options=Oracle.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-oracle-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/oracle/sql.md b/mddocs/docs/connection/db_connection/oracle/sql.md index 8335a145c..2ec4b7484 100644 --- a/mddocs/docs/connection/db_connection/oracle/sql.md +++ b/mddocs/docs/connection/db_connection/oracle/sql.md @@ -21,30 +21,30 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-oracle-sql-examples } - ```python - from onetl.connection import Oracle - - oracle = Oracle(...) - df = oracle.sql( - """ - SELECT - id, - key, - CAST(value AS VARCHAR2(4000)) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """, - options=Oracle.SQLOptions( - partitionColumn="id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - ), - ) - ``` +```python +from onetl.connection import Oracle + +oracle = Oracle(...) 
+df = oracle.sql( + """ + SELECT + id, + key, + CAST(value AS VARCHAR2(4000)) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Oracle.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-oracle-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/oracle/types.md b/mddocs/docs/connection/db_connection/oracle/types.md index 39440beee..171cae8e3 100644 --- a/mddocs/docs/connection/db_connection/oracle/types.md +++ b/mddocs/docs/connection/db_connection/oracle/types.md @@ -176,36 +176,36 @@ For example, you can use `CAST(column AS CLOB)` to convert data to string repres It is also possible to use [JSON_ARRAY](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_ARRAY.html) or [JSON_OBJECT](https://docs.oracle.com/en/database/oracle/oracle-database/23/sqlrf/JSON_OBJECT.html) Oracle functions to convert column of any type to string representation. Then this JSON string can then be effectively parsed using the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method. - ```python - from onetl.file.format import JSON - from pyspark.sql.types import IntegerType, StructType, StructField - - from onetl.connection import Oracle - from onetl.db import DBReader - - oracle = Oracle(...) 
- - DBReader( - connection=oracle, - columns=[ - "id", - "supported_column", - "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str", - # or - "JSON_ARRAY(array_column) array_column_json", - ], - ) - df = reader.run() - - json_scheme = StructType([StructField("key", IntegerType())]) - - df = df.select( - df.id, - df.supported_column, - df.unsupported_column_str.cast("integer").alias("parsed_integer"), - JSON().parse_column("array_column_json", json_scheme).alias("array_column"), - ) - ``` +```python +from onetl.file.format import JSON +from pyspark.sql.types import IntegerType, StructType, StructField + +from onetl.connection import Oracle +from onetl.db import DBReader + +oracle = Oracle(...) + +DBReader( + connection=oracle, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS VARCHAR2(4000)) unsupported_column_str", + # or + "JSON_ARRAY(array_column) array_column_json", + ], +) +df = reader.run() + +json_scheme = StructType([StructField("key", IntegerType())]) + +df = df.select( + df.id, + df.supported_column, + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("array_column_json", json_scheme).alias("array_column"), +) +``` ### `DBWriter` { #DBR-onetl-connection-db-connection-oracle-types-dbwriter } @@ -213,56 +213,56 @@ It is always possible to convert data on Spark side to string, and then write it To serialize and write JSON data to a `text` or `json` column in an Oracle table use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. - ```python - from onetl.connection import Oracle - from onetl.db import DBWriter - from onetl.file.format import JSON - - oracle = Oracle(...) 
- - oracle.execute( - """ - CREATE TABLE schema.target_table ( - id INTEGER, - supported_column TIMESTAMP, - array_column_json VARCHAR2(4000) -- any string type, actually - ) - """, - ) +```python +from onetl.connection import Oracle +from onetl.db import DBWriter +from onetl.file.format import JSON - write_df = df.select( - df.id, - df.supported_column, - JSON().serialize_column(df.unsupported_column).alias("array_column_json"), - ) +oracle = Oracle(...) - writer = DBWriter( - connection=oracle, - target="schema.target_table", - ) - writer.run(write_df) - ``` +oracle.execute( + """ + CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000) -- any string type, actually + ) + """, +) + +write_df = df.select( + df.id, + df.supported_column, + JSON().serialize_column(df.unsupported_column).alias("array_column_json"), +) + +writer = DBWriter( + connection=oracle, + target="schema.target_table", +) +writer.run(write_df) +``` Then you can parse this column on Oracle side - for example, by creating a view: - ```sql - SELECT - id, - supported_column, - JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0 - FROM - schema.target_table - ``` +```sql +SELECT + id, + supported_column, + JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER) AS array_item_0 +FROM + schema.target_table +``` Or by using [VIRTUAL column](https://oracle-base.com/articles/11g/virtual-columns-11gr1): - ```sql - CREATE TABLE schema.target_table ( - id INTEGER, - supported_column TIMESTAMP, - array_column_json VARCHAR2(4000), -- any string type, actually - array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL - ) - ``` +```sql +CREATE TABLE schema.target_table ( + id INTEGER, + supported_column TIMESTAMP, + array_column_json VARCHAR2(4000), -- any string type, actually + array_item_0 GENERATED ALWAYS AS (JSON_VALUE(array_column_json, '$[0]' RETURNING NUMBER)) VIRTUAL +) +``` But data 
will be parsed on each table read in any case, as Oracle does no support `GENERATED ALWAYS AS (...) STORED` columns. diff --git a/mddocs/docs/connection/db_connection/oracle/write.md b/mddocs/docs/connection/db_connection/oracle/write.md index fd2a25d86..fcf906127 100644 --- a/mddocs/docs/connection/db_connection/oracle/write.md +++ b/mddocs/docs/connection/db_connection/oracle/write.md @@ -14,22 +14,22 @@ For writing data to Oracle, use [DBWriter][DBR-onetl-db-writer]. ## Examples { #DBR-onetl-connection-db-connection-oracle-write-examples } - ```python - from onetl.connection import Oracle - from onetl.db import DBWriter +```python +from onetl.connection import Oracle +from onetl.db import DBWriter - oracle = Oracle(...) +oracle = Oracle(...) - df = ... # data is here +df = ... # data is here - writer = DBWriter( - connection=oracle, - target="schema.table", - options=Oracle.WriteOptions(if_exists="append"), - ) +writer = DBWriter( + connection=oracle, + target="schema.table", + options=Oracle.WriteOptions(if_exists="append"), +) - writer.run(df) - ``` +writer.run(df) +``` ## Options { #DBR-onetl-connection-db-connection-oracle-write-options } diff --git a/mddocs/docs/connection/db_connection/postgres/execute.md b/mddocs/docs/connection/db_connection/postgres/execute.md index 54d91972a..d7695b724 100644 --- a/mddocs/docs/connection/db_connection/postgres/execute.md +++ b/mddocs/docs/connection/db_connection/postgres/execute.md @@ -32,18 +32,18 @@ This method supports **any** query syntax supported by Postgres, like: #### Examples for `Postgres.fetch` { #DBR-onetl-connection-db-connection-postgres-execute-examples-for-postgres-fetch } - ```python - from onetl.connection import Postgres +```python +from onetl.connection import Postgres - postgres = Postgres(...) +postgres = Postgres(...) 
- df = postgres.fetch( - "SELECT value FROM some.reference_table WHERE key = 'some_constant'", - options=Postgres.FetchOptions(queryTimeout=10), - ) - postgres.close() - value = df.collect()[0][0] # get value from first row and first column - ``` +df = postgres.fetch( + "SELECT value FROM some.reference_table WHERE key = 'some_constant'", + options=Postgres.FetchOptions(queryTimeout=10), +) +postgres.close() +value = df.collect()[0][0] # get value from first row and first column +``` ### Use `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-use-postgres-execute } @@ -68,23 +68,23 @@ This method supports **any** query syntax supported by Postgres, like: #### Examples for `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-examples-for-postgres-execute } - ```python - from onetl.connection import Postgres - - postgres = Postgres(...) - - postgres.execute("DROP TABLE schema.table") - postgres.execute( - """ - CREATE TABLE schema.table ( - id bigint GENERATED ALWAYS AS IDENTITY, - key text, - value real - ) - """, - options=Postgres.ExecuteOptions(queryTimeout=10), - ) - ``` +```python +from onetl.connection import Postgres + +postgres = Postgres(...) + +postgres.execute("DROP TABLE schema.table") +postgres.execute( + """ + CREATE TABLE schema.table ( + id bigint GENERATED ALWAYS AS IDENTITY, + key text, + value real + ) + """, + options=Postgres.ExecuteOptions(queryTimeout=10), +) +``` ## Options { #DBR-onetl-connection-db-connection-postgres-execute-options } diff --git a/mddocs/docs/connection/db_connection/postgres/read.md b/mddocs/docs/connection/db_connection/postgres/read.md index eaad0252b..e2af4a03a 100644 --- a/mddocs/docs/connection/db_connection/postgres/read.md +++ b/mddocs/docs/connection/db_connection/postgres/read.md @@ -23,43 +23,43 @@ Snapshot strategy: - ```python - from onetl.connection import Postgres - from onetl.db import DBReader - - postgres = Postgres(...) 
- - reader = DBReader( - connection=postgres, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), - ) - df = reader.run() - ``` +```python +from onetl.connection import Postgres +from onetl.db import DBReader + +postgres = Postgres(...) + +reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), +) +df = reader.run() +``` Incremental strategy: - ```python - from onetl.connection import Postgres - from onetl.db import DBReader - from onetl.strategy import IncrementalStrategy - - postgres = Postgres(...) - - reader = DBReader( - connection=postgres, - source="schema.table", - columns=["id", "key", "CAST(value AS text) value", "updated_dt"], - where="key = 'something'", - hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), - options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), - ) - - with IncrementalStrategy(): - df = reader.run() - ``` +```python +from onetl.connection import Postgres +from onetl.db import DBReader +from onetl.strategy import IncrementalStrategy + +postgres = Postgres(...) 
+ +reader = DBReader( + connection=postgres, + source="schema.table", + columns=["id", "key", "CAST(value AS text) value", "updated_dt"], + where="key = 'something'", + hwm=DBReader.AutoDetectHWM(name="postgres_hwm", expression="updated_dt"), + options=Postgres.ReadOptions(partitionColumn="id", numPartitions=10), +) + +with IncrementalStrategy(): + df = reader.run() +``` ## Recommendations { #DBR-onetl-connection-db-connection-postgres-read-recommendations } diff --git a/mddocs/docs/connection/db_connection/postgres/sql.md b/mddocs/docs/connection/db_connection/postgres/sql.md index d03f0a924..2a06a0b20 100644 --- a/mddocs/docs/connection/db_connection/postgres/sql.md +++ b/mddocs/docs/connection/db_connection/postgres/sql.md @@ -20,30 +20,30 @@ Only queries with the following syntax are supported: ## Examples { #DBR-onetl-connection-db-connection-postgres-sql-examples } - ```python - from onetl.connection import Postgres - - postgres = Postgres(...) - df = postgres.sql( - """ - SELECT - id, - key, - CAST(value AS text) value, - updated_at - FROM - some.mytable - WHERE - key = 'something' - """, - options=Postgres.SQLOptions( - partitionColumn="id", - numPartitions=10, - lowerBound=0, - upperBound=1000, - ), - ) - ``` +```python +from onetl.connection import Postgres + +postgres = Postgres(...) 
+df = postgres.sql( + """ + SELECT + id, + key, + CAST(value AS text) value, + updated_at + FROM + some.mytable + WHERE + key = 'something' + """, + options=Postgres.SQLOptions( + partitionColumn="id", + numPartitions=10, + lowerBound=0, + upperBound=1000, + ), +) +``` ## Recommendations { #DBR-onetl-connection-db-connection-postgres-sql-recommendations } diff --git a/mddocs/docs/connection/db_connection/postgres/types.md b/mddocs/docs/connection/db_connection/postgres/types.md index 5321e9a09..d47922258 100644 --- a/mddocs/docs/connection/db_connection/postgres/types.md +++ b/mddocs/docs/connection/db_connection/postgres/types.md @@ -217,42 +217,42 @@ For example, you can use `CAST(column AS text)` to convert data to string repres It is also possible to use [to_json](https://www.postgresql.org/docs/current/functions-json.html) Postgres function to convert column of any type to string representation, and then parse this column on Spark side you can use the [JSON.parse_column][onetl.file.format.json.JSON.parse_column] method: - ```python - from pyspark.sql.types import IntegerType - - from onetl.connection import Postgres - from onetl.db import DBReader - from onetl.file.format import JSON - - postgres = Postgres(...) 
- - DBReader( - connection=postgres, - columns=[ - "id", - "supported_column", - "CAST(unsupported_column AS text) unsupported_column_str", - # or - "to_json(unsupported_column) array_column_json", - ], - ) - df = reader.run() - - json_schema = StructType( - [ - StructField("id", IntegerType(), nullable=True), - StructField("name", StringType(), nullable=True), - ..., - ] - ) - df = df.select( - df.id, - df.supported_column, - # explicit cast - df.unsupported_column_str.cast("integer").alias("parsed_integer"), - JSON().parse_column("array_column_json", json_schema).alias("json_string"), - ) - ``` +```python +from pyspark.sql.types import IntegerType + +from onetl.connection import Postgres +from onetl.db import DBReader +from onetl.file.format import JSON + +postgres = Postgres(...) + +DBReader( + connection=postgres, + columns=[ + "id", + "supported_column", + "CAST(unsupported_column AS text) unsupported_column_str", + # or + "to_json(unsupported_column) array_column_json", + ], +) +df = reader.run() + +json_schema = StructType( + [ + StructField("id", IntegerType(), nullable=True), + StructField("name", StringType(), nullable=True), + ..., + ] +) +df = df.select( + df.id, + df.supported_column, + # explicit cast + df.unsupported_column_str.cast("integer").alias("parsed_integer"), + JSON().parse_column("array_column_json", json_schema).alias("json_string"), +) +``` ### `DBWriter` { #DBR-onetl-connection-db-connection-postgres-types-dbwriter } @@ -262,48 +262,48 @@ It is always possible to convert data on the Spark side to a string, and then wr You can use the [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method for data serialization: - ```python - from onetl.file.format import JSON - from pyspark.sql.functions import col - - from onetl.connection import Postgres - from onetl.db import DBWriter - - postgres = Postgres(...) 
- - postgres.execute( - """ - CREATE TABLE schema.target_table ( - id int, - supported_column timestamp, - array_column_json jsonb -- any column type, actually - ) - """, - ) +```python +from onetl.file.format import JSON +from pyspark.sql.functions import col - write_df = df.select( - df.id, - df.supported_column, - JSON().serialize_column(df.unsupported_column).alias("array_column_json"), - ) +from onetl.connection import Postgres +from onetl.db import DBWriter - writer = DBWriter( - connection=postgres, - target="schema.target_table", - ) - writer.run(write_df) - ``` +postgres = Postgres(...) + +postgres.execute( + """ + CREATE TABLE schema.target_table ( + id int, + supported_column timestamp, + array_column_json jsonb -- any column type, actually + ) + """, +) + +write_df = df.select( + df.id, + df.supported_column, + JSON().serialize_column(df.unsupported_column).alias("array_column_json"), +) + +writer = DBWriter( + connection=postgres, + target="schema.target_table", +) +writer.run(write_df) +``` Then you can parse this column on the Postgres side (for example, by creating a view): - ```sql - SELECT - id, - supported_column, - array_column_json->'0' AS array_item_0 - FROM - schema.target_table - ``` +```sql +SELECT + id, + supported_column, + array_column_json->'0' AS array_item_0 +FROM + schema.target_table +``` To avoid casting the value on every table read you can use [GENERATED ALWAYS STORED](https://www.postgresql.org/docs/current/ddl-generated-columns.html) column, but this requires 2x space (for original and parsed value). 
@@ -313,43 +313,43 @@ Postgres connector also supports conversion text value directly to target column For example, you can write data like `[123, 345)` to `int8range` type because Postgres allows cast `'[123, 345)'::int8range'`: - ```python - from pyspark.sql.ftypes import StringType - from pyspark.sql.functions import udf +```python +from pyspark.sql.ftypes import StringType +from pyspark.sql.functions import udf - from onetl.connection import Postgres - from onetl.db import DBReader +from onetl.connection import Postgres +from onetl.db import DBReader - postgres = Postgres(...) +postgres = Postgres(...) - postgres.execute( - """ - CREATE TABLE schema.target_table ( - id int, - range_column int8range -- any column type, actually - ) - """, - ) +postgres.execute( + """ + CREATE TABLE schema.target_table ( + id int, + range_column int8range -- any column type, actually + ) + """, +) - @udf(returnType=StringType()) - def array_to_range(value: tuple): - """This UDF allows to convert tuple[start, end] to Postgres' range format""" - start, end = value - return f"[{start},{end})" +@udf(returnType=StringType()) +def array_to_range(value: tuple): + """This UDF allows to convert tuple[start, end] to Postgres' range format""" + start, end = value + return f"[{start},{end})" - write_df = df.select( - df.id, - array_to_range(df.range_column).alias("range_column"), - ) +write_df = df.select( + df.id, + array_to_range(df.range_column).alias("range_column"), +) - writer = DBWriter( - connection=postgres, - target="schema.target_table", - ) - writer.run(write_df) - ``` +writer = DBWriter( + connection=postgres, + target="schema.target_table", +) +writer.run(write_df) +``` This can be tricky to implement and may lead to longer write process. But this does not require extra space on Postgres side, and allows to avoid explicit value cast on every table read. 
diff --git a/mddocs/docs/connection/db_connection/postgres/write.md b/mddocs/docs/connection/db_connection/postgres/write.md index 8489d8ad3..ab83aa796 100644 --- a/mddocs/docs/connection/db_connection/postgres/write.md +++ b/mddocs/docs/connection/db_connection/postgres/write.md @@ -16,22 +16,22 @@ For writing data to Postgres, use [DBWriter][DBR-onetl-db-writer]. ## Examples { #DBR-onetl-connection-db-connection-postgres-write-examples } - ```python - from onetl.connection import Postgres - from onetl.db import DBWriter +```python +from onetl.connection import Postgres +from onetl.db import DBWriter - postgres = Postgres(...) +postgres = Postgres(...) - df = ... # data is here +df = ... # data is here - writer = DBWriter( - connection=postgres, - target="schema.table", - options=Postgres.WriteOptions(if_exists="append"), - ) +writer = DBWriter( + connection=postgres, + target="schema.table", + options=Postgres.WriteOptions(if_exists="append"), +) - writer.run(df) - ``` +writer.run(df) +``` ## Options { #DBR-onetl-connection-db-connection-postgres-write-options } diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md index c5ec2e9e0..5e5f9bc13 100644 --- a/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md +++ b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md @@ -37,130 +37,130 @@ Resulting logs will look like this ??? 
note "See log" ```text - 23/08/03 11:25:10 DEBUG S3AFileSystem: Using S3ABlockOutputStream with buffer = disk; block=67108864; queue limit=4 - 23/08/03 11:25:10 DEBUG S3Guard: Metastore option source [core-default.xml] - 23/08/03 11:25:10 DEBUG S3Guard: Using NullMetadataStore metadata store for s3a filesystem - 23/08/03 11:25:10 DEBUG S3AFileSystem: S3Guard is disabled on this bucket: test-bucket - 23/08/03 11:25:10 DEBUG DirectoryPolicyImpl: Directory markers will be deleted - 23/08/03 11:25:10 DEBUG S3AFileSystem: Directory marker retention policy is DirectoryMarkerRetention{policy='delete'} - 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.multipart.purge.age is 86400 - 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.bulk.delete.page.size is 250 - 23/08/03 11:25:10 DEBUG FileSystem: Creating FS s3a://test-bucket/fake: duration 0:01.029s - 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter op_is_directory by 1 with final value 1 - 23/08/03 11:25:10 DEBUG S3AFileSystem: Getting path status for s3a://test-bucket/fake (fake); needEmptyDirectory=false - 23/08/03 11:25:10 DEBUG S3AFileSystem: S3GetFileStatus s3a://test-bucket/fake - 23/08/03 11:25:10 DEBUG S3AFileSystem: LIST List test-bucket:/fake/ delimiter=/ keys=2 requester pays=false - 23/08/03 11:25:10 DEBUG S3AFileSystem: Starting: LIST - 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter object_list_request by 1 with final value 1 - 23/08/03 11:25:10 DEBUG AWSCredentialProviderList: Using credentials from SimpleAWSCredentialsProvider - 23/08/03 11:25:10 DEBUG request: Sending Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 
scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) - 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 Canonical Request: '"GET - / - delimiter=%2F&fetch-owner=false&list-type=2&max-keys=2&prefix=fake%2F - amz-sdk-invocation-id:e6d62603-96e4-a80f-10a1-816e0822bc71 - amz-sdk-request:attempt=1;max=21 - amz-sdk-retry:0/0/500 - content-type:application/octet-stream - host:test-bucket.localhost:9000 - user-agent:Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy - x-amz-content-sha256:UNSIGNED-PAYLOAD - x-amz-date:20230803T112510Z - - amz-sdk-invocation-id;amz-sdk-request;amz-sdk-retry;content-type;host;user-agent;x-amz-content-sha256;x-amz-date - UNSIGNED-PAYLOAD" - 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 String to Sign: '"AWS4-HMAC-SHA256 - 20230803T112510Z - 20230803/us-east-1/s3/aws4_request - 31a317bb7f6d97248dd0cf03429d701cbb3e29ce889cfbb98ba7a34c57a3bfba" - 23/08/03 11:25:10 DEBUG AWS4Signer: Generating a new signing key as the signing key not available in the cache for the date 1691020800000 - 23/08/03 11:25:10 DEBUG RequestAddCookies: CookieSpec selected: default - 23/08/03 11:25:10 DEBUG RequestAuthCache: Auth cache not set in the context - 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection request: [route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] - 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection leased: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 1 of 96; total allocated: 1 of 96] - 23/08/03 11:25:10 DEBUG MainClientExec: Opening connection {s}->https://test-bucket.localhost:9000 - 23/08/03 11:25:10 DEBUG DefaultHttpClientConnectionOperator: Connecting to test-bucket.localhost/127.0.0.1:9000 - 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Connecting socket to 
test-bucket.localhost/127.0.0.1:9000 with timeout 5000 - 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled protocols: [TLSv1.2] - 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled cipher suites:[TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, TLS_RSA_WITH_AES_256_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384, TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, TLS_DHE_DSS_WITH_AES_256_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_DSS_WITH_AES_256_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_DSS_WITH_AES_128_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_128_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_DSS_WITH_AES_128_CBC_SHA, TLS_EMPTY_RENEGOTIATION_INFO_SCSV] - 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Starting handshake - 23/08/03 11:25:10 DEBUG ClientConnectionManagerFactory: - java.lang.reflect.InvocationTargetException - at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) - at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) - at java.lang.reflect.Method.invoke(Method.java:498) - at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76) - at com.amazonaws.http.conn.$Proxy32.connect(Unknown Source) - at 
com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:393) - at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:236) - at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) - at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) - at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) - at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) - at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) - at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) - at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) - at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) - at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456) - at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403) - at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5397) - at com.amazonaws.services.s3.AmazonS3Client.listObjectsV2(AmazonS3Client.java:971) - at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listObjects$11(S3AFileSystem.java:2595) - at 
org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) - at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414) - at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377) - at org.apache.hadoop.fs.s3a.S3AFileSystem.listObjects(S3AFileSystem.java:2586) - at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3832) - at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688) - at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$isDirectory$35(S3AFileSystem.java:4724) - at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) - at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444) - at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337) - at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356) - at org.apache.hadoop.fs.s3a.S3AFileSystem.isDirectory(S3AFileSystem.java:4722) - at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:54) - at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) - at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229) - at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211) - at scala.Option.getOrElse(Option.scala:189) - at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211) - at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186) - at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) - at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) - at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) - at java.lang.reflect.Method.invoke(Method.java:498) - at 
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) - at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) - at py4j.Gateway.invoke(Gateway.java:282) - at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) - at py4j.commands.CallCommand.execute(CallCommand.java:79) - at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) - at py4j.ClientServerConnection.run(ClientServerConnection.java:106) - at java.lang.Thread.run(Thread.java:748) - Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message - at sun.security.ssl.SSLSocketInputRecord.handleUnknownRecord(SSLSocketInputRecord.java:448) - at sun.security.ssl.SSLSocketInputRecord.decode(SSLSocketInputRecord.java:184) - at sun.security.ssl.SSLTransport.decode(SSLTransport.java:109) - at sun.security.ssl.SSLSocketImpl.decode(SSLSocketImpl.java:1383) - at sun.security.ssl.SSLSocketImpl.readHandshakeRecord(SSLSocketImpl.java:1291) - at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:435) - at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:436) - at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:384) - at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142) - at com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:376) - ... 
58 more - 23/08/03 11:25:10 DEBUG DefaultManagedHttpClientConnection: http-outgoing-0: Shutdown connection - 23/08/03 11:25:10 DEBUG MainClientExec: Connection discarded - 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection released: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] - 23/08/03 11:25:10 DEBUG AmazonHttpClient: Unable to execute HTTP request: Unsupported or unrecognized SSL message Request will be retried. - 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) - 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 + 23/08/03 11:25:10 DEBUG S3AFileSystem: Using S3ABlockOutputStream with buffer = disk; block=67108864; queue limit=4 + 23/08/03 11:25:10 DEBUG S3Guard: Metastore option source [core-default.xml] + 23/08/03 11:25:10 DEBUG S3Guard: Using NullMetadataStore metadata store for s3a filesystem + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3Guard is disabled on this bucket: test-bucket + 23/08/03 11:25:10 DEBUG DirectoryPolicyImpl: Directory markers will be deleted + 23/08/03 11:25:10 DEBUG S3AFileSystem: Directory marker retention policy is DirectoryMarkerRetention{policy='delete'} + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.multipart.purge.age is 86400 + 23/08/03 11:25:10 DEBUG S3AUtils: Value of fs.s3a.bulk.delete.page.size is 250 + 23/08/03 11:25:10 DEBUG FileSystem: Creating FS s3a://test-bucket/fake: duration 0:01.029s + 23/08/03 11:25:10 DEBUG 
IOStatisticsStoreImpl: Incrementing counter op_is_directory by 1 with final value 1 + 23/08/03 11:25:10 DEBUG S3AFileSystem: Getting path status for s3a://test-bucket/fake (fake); needEmptyDirectory=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: S3GetFileStatus s3a://test-bucket/fake + 23/08/03 11:25:10 DEBUG S3AFileSystem: LIST List test-bucket:/fake/ delimiter=/ keys=2 requester pays=false + 23/08/03 11:25:10 DEBUG S3AFileSystem: Starting: LIST + 23/08/03 11:25:10 DEBUG IOStatisticsStoreImpl: Incrementing counter object_list_request by 1 with final value 1 + 23/08/03 11:25:10 DEBUG AWSCredentialProviderList: Using credentials from SimpleAWSCredentialsProvider + 23/08/03 11:25:10 DEBUG request: Sending Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 Canonical Request: '"GET + / + delimiter=%2F&fetch-owner=false&list-type=2&max-keys=2&prefix=fake%2F + amz-sdk-invocation-id:e6d62603-96e4-a80f-10a1-816e0822bc71 + amz-sdk-request:attempt=1;max=21 + amz-sdk-retry:0/0/500 + content-type:application/octet-stream + host:test-bucket.localhost:9000 + user-agent:Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy + x-amz-content-sha256:UNSIGNED-PAYLOAD + x-amz-date:20230803T112510Z + + amz-sdk-invocation-id;amz-sdk-request;amz-sdk-retry;content-type;host;user-agent;x-amz-content-sha256;x-amz-date + UNSIGNED-PAYLOAD" + 23/08/03 11:25:10 DEBUG AWS4Signer: AWS4 String to Sign: '"AWS4-HMAC-SHA256 + 20230803T112510Z + 
20230803/us-east-1/s3/aws4_request + 31a317bb7f6d97248dd0cf03429d701cbb3e29ce889cfbb98ba7a34c57a3bfba" + 23/08/03 11:25:10 DEBUG AWS4Signer: Generating a new signing key as the signing key not available in the cache for the date 1691020800000 + 23/08/03 11:25:10 DEBUG RequestAddCookies: CookieSpec selected: default + 23/08/03 11:25:10 DEBUG RequestAuthCache: Auth cache not set in the context + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection request: [route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection leased: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 1 of 96; total allocated: 1 of 96] + 23/08/03 11:25:10 DEBUG MainClientExec: Opening connection {s}->https://test-bucket.localhost:9000 + 23/08/03 11:25:10 DEBUG DefaultHttpClientConnectionOperator: Connecting to test-bucket.localhost/127.0.0.1:9000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Connecting socket to test-bucket.localhost/127.0.0.1:9000 with timeout 5000 + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled protocols: [TLSv1.2] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Enabled cipher suites:[TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384, TLS_RSA_WITH_AES_256_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384, TLS_DHE_RSA_WITH_AES_256_CBC_SHA256, TLS_DHE_DSS_WITH_AES_256_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_256_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA, TLS_ECDH_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_RSA_WITH_AES_256_CBC_SHA, TLS_DHE_DSS_WITH_AES_256_CBC_SHA, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA256, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256, 
TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_RSA_WITH_AES_128_CBC_SHA256, TLS_DHE_DSS_WITH_AES_128_CBC_SHA256, TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, TLS_RSA_WITH_AES_128_CBC_SHA, TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA, TLS_ECDH_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_RSA_WITH_AES_128_CBC_SHA, TLS_DHE_DSS_WITH_AES_128_CBC_SHA, TLS_EMPTY_RENEGOTIATION_INFO_SCSV] + 23/08/03 11:25:10 DEBUG SSLConnectionSocketFactory: Starting handshake + 23/08/03 11:25:10 DEBUG ClientConnectionManagerFactory: + java.lang.reflect.InvocationTargetException + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at com.amazonaws.http.conn.ClientConnectionManagerFactory$Handler.invoke(ClientConnectionManagerFactory.java:76) + at com.amazonaws.http.conn.$Proxy32.connect(Unknown Source) + at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.establishRoute(MainClientExec.java:393) + at com.amazonaws.thirdparty.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:236) + at com.amazonaws.thirdparty.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:186) + at com.amazonaws.thirdparty.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:185) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:83) + at com.amazonaws.thirdparty.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:56) + at com.amazonaws.http.apache.client.impl.SdkHttpClient.execute(SdkHttpClient.java:72) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeOneRequest(AmazonHttpClient.java:1346) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeHelper(AmazonHttpClient.java:1157) + at 
com.amazonaws.http.AmazonHttpClient$RequestExecutor.doExecute(AmazonHttpClient.java:814) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.executeWithTimer(AmazonHttpClient.java:781) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.execute(AmazonHttpClient.java:755) + at com.amazonaws.http.AmazonHttpClient$RequestExecutor.access$500(AmazonHttpClient.java:715) + at com.amazonaws.http.AmazonHttpClient$RequestExecutionBuilderImpl.execute(AmazonHttpClient.java:697) + at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:561) + at com.amazonaws.http.AmazonHttpClient.execute(AmazonHttpClient.java:541) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5456) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5403) + at com.amazonaws.services.s3.AmazonS3Client.invoke(AmazonS3Client.java:5397) + at com.amazonaws.services.s3.AmazonS3Client.listObjectsV2(AmazonS3Client.java:971) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$listObjects$11(S3AFileSystem.java:2595) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:414) + at org.apache.hadoop.fs.s3a.Invoker.retryUntranslated(Invoker.java:377) + at org.apache.hadoop.fs.s3a.S3AFileSystem.listObjects(S3AFileSystem.java:2586) + at org.apache.hadoop.fs.s3a.S3AFileSystem.s3GetFileStatus(S3AFileSystem.java:3832) + at org.apache.hadoop.fs.s3a.S3AFileSystem.innerGetFileStatus(S3AFileSystem.java:3688) + at org.apache.hadoop.fs.s3a.S3AFileSystem.lambda$isDirectory$35(S3AFileSystem.java:4724) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.lambda$trackDurationOfOperation$5(IOStatisticsBinding.java:499) + at org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.trackDuration(IOStatisticsBinding.java:444) + at org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2337) + at 
org.apache.hadoop.fs.s3a.S3AFileSystem.trackDurationAndSpan(S3AFileSystem.java:2356) + at org.apache.hadoop.fs.s3a.S3AFileSystem.isDirectory(S3AFileSystem.java:4722) + at org.apache.spark.sql.execution.streaming.FileStreamSink$.hasMetadata(FileStreamSink.scala:54) + at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:366) + at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:229) + at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:211) + at scala.Option.getOrElse(Option.scala:189) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:211) + at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:186) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) + at py4j.Gateway.invoke(Gateway.java:282) + at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + at py4j.commands.CallCommand.execute(CallCommand.java:79) + at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) + at py4j.ClientServerConnection.run(ClientServerConnection.java:106) + at java.lang.Thread.run(Thread.java:748) + Caused by: javax.net.ssl.SSLException: Unsupported or unrecognized SSL message + at sun.security.ssl.SSLSocketInputRecord.handleUnknownRecord(SSLSocketInputRecord.java:448) + at sun.security.ssl.SSLSocketInputRecord.decode(SSLSocketInputRecord.java:184) + at sun.security.ssl.SSLTransport.decode(SSLTransport.java:109) + at sun.security.ssl.SSLSocketImpl.decode(SSLSocketImpl.java:1383) + at sun.security.ssl.SSLSocketImpl.readHandshakeRecord(SSLSocketImpl.java:1291) + 
at sun.security.ssl.SSLSocketImpl.startHandshake(SSLSocketImpl.java:435) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.createLayeredSocket(SSLConnectionSocketFactory.java:436) + at com.amazonaws.thirdparty.apache.http.conn.ssl.SSLConnectionSocketFactory.connectSocket(SSLConnectionSocketFactory.java:384) + at com.amazonaws.thirdparty.apache.http.impl.conn.DefaultHttpClientConnectionOperator.connect(DefaultHttpClientConnectionOperator.java:142) + at com.amazonaws.thirdparty.apache.http.impl.conn.PoolingHttpClientConnectionManager.connect(PoolingHttpClientConnectionManager.java:376) + ... 58 more + 23/08/03 11:25:10 DEBUG DefaultManagedHttpClientConnection: http-outgoing-0: Shutdown connection + 23/08/03 11:25:10 DEBUG MainClientExec: Connection discarded + 23/08/03 11:25:10 DEBUG PoolingHttpClientConnectionManager: Connection released: [id: 0][route: {s}->https://test-bucket.localhost:9000][total available: 0; route allocated: 0 of 96; total allocated: 0 of 96] + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Unable to execute HTTP request: Unsupported or unrecognized SSL message Request will be retried. 
+ 23/08/03 11:25:10 DEBUG request: Retrying Request: GET https://test-bucket.localhost:9000 / Parameters: ({"list-type":["2"],"delimiter":["/"],"max-keys":["2"],"prefix":["fake/"],"fetch-owner":["false"]}Headers: (amz-sdk-invocation-id: e6d62603-96e4-a80f-10a1-816e0822bc71, Content-Type: application/octet-stream, User-Agent: Hadoop 3.3.4, aws-sdk-java/1.12.262 Linux/6.4.7-1-MANJARO OpenJDK_64-Bit_Server_VM/25.292-b10 java/1.8.0_292 scala/2.12.17 vendor/AdoptOpenJDK cfg/retry-mode/legacy, ) + 23/08/03 11:25:10 DEBUG AmazonHttpClient: Retriable error detected, will retry in 49ms, attempt number: 0 ``` #### Change number of retries { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-change-number-of-retries } From cf1cc4d3fb9101554591fb17599b9387c78eebbb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Apr 2026 21:32:40 +0000 Subject: [PATCH 24/28] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- mddocs/docs/_static/stylesheets/autodoc_pydantic.css | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mddocs/docs/_static/stylesheets/autodoc_pydantic.css b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css index 994a3e548..db37fda45 100644 --- a/mddocs/docs/_static/stylesheets/autodoc_pydantic.css +++ b/mddocs/docs/_static/stylesheets/autodoc_pydantic.css @@ -8,4 +8,4 @@ .autodoc_pydantic_collapsable_erd { cursor: pointer; - } \ No newline at end of file + } From c237a3034e8ce83e5a935c58ff5cfe5b93e70259 Mon Sep 17 00:00:00 2001 From: sga Date: Wed, 8 Apr 2026 19:32:41 +0300 Subject: [PATCH 25/28] fix missing changes --- mddocs/docs/concepts.md | 2 +- .../db_connection/clickhouse/execute.md | 2 - .../db_connection/clickhouse/prerequisites.md | 6 +- .../db_connection/clickhouse/types.md | 6 +- .../db_connection/greenplum/execute.md | 2 - .../db_connection/greenplum/prerequisites.md | 381 ++++++++++-------- 
.../db_connection/greenplum/types.md | 5 +- .../db_connection/hive/prerequisites.md | 4 +- .../connection/db_connection/hive/read.md | 6 +- .../docs/connection/db_connection/hive/sql.md | 4 - .../connection/db_connection/hive/write.md | 3 - .../db_connection/kafka/prerequisites.md | 4 +- .../connection/db_connection/kafka/read.md | 8 + .../db_connection/kafka/troubleshooting.md | 10 + .../db_connection/mongodb/prerequisites.md | 6 +- .../connection/db_connection/mongodb/types.md | 6 +- .../connection/db_connection/mssql/execute.md | 2 - .../db_connection/mssql/prerequisites.md | 8 +- .../connection/db_connection/mssql/types.md | 6 +- .../connection/db_connection/mysql/execute.md | 2 - .../db_connection/mysql/prerequisites.md | 8 +- .../connection/db_connection/mysql/types.md | 6 +- .../db_connection/oracle/execute.md | 2 - .../db_connection/oracle/prerequisites.md | 6 +- .../connection/db_connection/oracle/types.md | 6 +- .../db_connection/postgres/execute.md | 2 - .../db_connection/postgres/prerequisites.md | 8 +- .../db_connection/postgres/types.md | 6 +- .../spark_hdfs/prerequisites.md | 4 +- .../spark_s3/prerequisites.md | 4 +- .../spark_s3/troubleshooting.md | 6 +- 31 files changed, 293 insertions(+), 238 deletions(-) diff --git a/mddocs/docs/concepts.md b/mddocs/docs/concepts.md index 372ced02b..98f0d3191 100644 --- a/mddocs/docs/concepts.md +++ b/mddocs/docs/concepts.md @@ -39,7 +39,7 @@ classDiagram JDBCConnection <|-- MySQL JDBCConnection <|-- Postgres JDBCConnection <|-- Oracle - JDBCConnection <|-- Teradata + BaseConnection <|-- FileConnection FileConnection <|-- FTP FileConnection <|-- FTPS diff --git a/mddocs/docs/connection/db_connection/clickhouse/execute.md b/mddocs/docs/connection/db_connection/clickhouse/execute.md index 482426437..02c0e720a 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/execute.md +++ b/mddocs/docs/connection/db_connection/clickhouse/execute.md @@ -17,7 +17,6 @@ Clickhouse config, or reading data from some reference 
table. Method returns Spa Method accepts [Clickhouse.FetchOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -54,7 +53,6 @@ Use this method to execute DDL and DML operations. Each method call runs operati Method accepts [Clickhouse.ExecuteOptions][onetl.connection.db_connection.clickhouse.options.ClickhouseExecuteOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. #### Syntax support in `Clickhouse.execute` { #DBR-onetl-connection-db-connection-clickhouse-execute-syntax-support-in-clickhouse-execute } diff --git a/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md index 9613cfb9e..39770111b 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md +++ b/mddocs/docs/connection/db_connection/clickhouse/prerequisites.md @@ -4,9 +4,9 @@ - Clickhouse server versions: - Officially declared: 22.8 or higher - - Actually tested: 21.1, 25.1 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 + - Actually tested: 21.1, 25.8 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://clickhouse.com/docs/en/integrations/java#jdbc-driver). diff --git a/mddocs/docs/connection/db_connection/clickhouse/types.md b/mddocs/docs/connection/db_connection/clickhouse/types.md index 6e63c7222..f761e3f94 100644 --- a/mddocs/docs/connection/db_connection/clickhouse/types.md +++ b/mddocs/docs/connection/db_connection/clickhouse/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. !!! 
note @@ -104,8 +104,8 @@ Always prefer creating tables with specific types **BEFORE WRITING DATA**: Here you can find source code with type conversions: - [Clickhouse -> JDBC](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L39-L176) -- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307) -- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L307) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/jdbc/JdbcUtils.scala#L141-L164) - [JDBC -> Clickhouse](https://github.com/ClickHouse/clickhouse-java/blob/0.3.2/clickhouse-jdbc/src/main/java/com/clickhouse/jdbc/JdbcTypeMapping.java#L185-L311) ## Supported types { #DBR-onetl-connection-db-connection-clickhouse-types-supported-types } diff --git a/mddocs/docs/connection/db_connection/greenplum/execute.md b/mddocs/docs/connection/db_connection/greenplum/execute.md index c7cda310f..8205daf46 100644 --- a/mddocs/docs/connection/db_connection/greenplum/execute.md +++ b/mddocs/docs/connection/db_connection/greenplum/execute.md @@ -17,7 +17,6 @@ Greenplum config, or reading data from some reference table. Method returns Spar Method accepts [Greenplum.FetchOptions][onetl.connection.db_connection.greenplum.options.GreenplumFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -54,7 +53,6 @@ Use this method to execute DDL and DML operations. 
Each method call runs operati Method accepts [Greenplum.ExecuteOptions][onetl.connection.db_connection.greenplum.options.GreenplumExecuteOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. #### Syntax support in `Greenplum.execute` { #DBR-onetl-connection-db-connection-greenplum-execute-syntax-support-in-greenplum-execute } diff --git a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md index 63ea99a79..e840f0d30 100644 --- a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md +++ b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md @@ -5,7 +5,7 @@ - Greenplum server versions: - Officially declared: 5.x, 6.x, and 7.x (which requires `Greenplum.get_packages(package_version="2.3.0")` or higher) - Actually tested: 6.23, 7.0 -- Spark versions: 2.3.x - 3.2.x (Spark 3.3+ is not supported yet) +- Spark versions: 3.2.x (Spark 3.3+ is not supported yet) - Java versions: 8 - 11 See [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.2/greenplum-connector-spark/release_notes.html). @@ -17,7 +17,7 @@ BEFORE creating the connector instance. See [installation instruction][DBR-onetl-install-spark] for more details. -## Downloading VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-downloading-vmware-package } +## Download VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-downloading-vmware-package } To use Greenplum connector you should download connector `.jar` file from [VMware website](https://network.tanzu.vmware.com/products/vmware-greenplum#/releases/1413479/file_groups/16966) @@ -40,146 +40,33 @@ There are several ways to do that. 
See [install Java packages][DBR-onetl-install If you're uploading package to private package repo, use `groupId=io.pivotal` and `artifactoryId=greenplum-spark_2.12` (`2.12` is Scala version) to give uploaded package a proper name. -## Connecting to Greenplum { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connecting-to-greenplum } +## Interaction Spark ↔ Greenplum { #DBR-onetl-connection-db-connection-greenplum-prerequisites-interaction-spark-greenplum } -### Interaction schema { #DBR-onetl-connection-db-connection-greenplum-prerequisites-interaction-schema } +This connector is **very** different from regular Postgres connector. -Spark executors open ports to listen incoming requests. -Greenplum segments are initiating connections to Spark executors using [EXTERNAL TABLE](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html) -functionality, and send/read data using [gpfdist protocol](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-external-g-using-the-greenplum-parallel-file-server--gpfdist-.html#about-gpfdist-setup-and-performance-1). +Postgres connector connects directly to Postgres host via JDBC driver: -Data is **not** send through Greenplum master. -Greenplum master only receives commands to start reading/writing process, and manages all the metadata (external table location, schema and so on). +- Spark driver → Postgres host (get query column names and types, create target table) +- Spark executors → Postgres host (send/fetch actual data) -More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/overview.html). - -### Set number of connections { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-number-of-connections } - -!!! warning - - This is very important!!! 
- - If you don't limit number of connections, you can exceed the [max_connections](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#limiting-concurrent-connections-2) - limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, - depending on your Greenplum instance settings and using connection balancers like `pgbouncer`. - - Consuming all available connections means **nobody** (even admin users) can connect to Greenplum. - -Each job on the Spark executor makes its own connection to Greenplum master node, -so you need to limit number of connections to avoid opening too many of them. - -- Reading about `5-10Gb` of data requires about `3-5` parallel connections. -- Reading about `20-30Gb` of data requires about `5-10` parallel connections. -- Reading about `50Gb` of data requires ~ `10-20` parallel connections. -- Reading about `100+Gb` of data requires `20-30` parallel connections. -- Opening more than `30-50` connections is not recommended. - -Number of connections can be limited by 2 ways: - -- By limiting number of Spark executors and number of cores per-executor. Max number of parallel jobs is `executors * cores`. 
- -=== "Spark with master=local" - - ```python - spark = ( - SparkSession.builder - # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks - .config("spark.master", "local[5]") - .config("spark.executor.cores", 1) - ).getOrCreate() - ``` - -=== "Spark with master=yarn or master=k8s, dynamic allocation" - - ```python - spark = ( - SparkSession.builder - .config("spark.master", "yarn") - # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 - .config("spark.dynamicAllocation.maxExecutors", 10) - .config("spark.executor.cores", 1) - ).getOrCreate() - ``` - -=== "Spark with master=yarn or master=k8s, static allocation" - - ```python - spark = ( - SparkSession.builder - .config("spark.master", "yarn") - # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 - .config("spark.executor.instances", 10) - .config("spark.executor.cores", 1) - ).getOrCreate() - ``` - -- By limiting connection pool size user by Spark (**only** for Spark with `master=local`): - -```python -spark = SparkSession.builder.config("spark.master", "local[*]").getOrCreate() - -# No matter how many executors are started and how many cores they have, -# number of connections cannot exceed pool size: -Greenplum( - ..., - extra={ - "pool.maxSize": 10, - }, -) -``` - -See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool) -documentation. - -- By setting [num_partitions][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions.num_partitions] - and [partition_column][onetl.connection.db_connection.greenplum.options.GreenplumReadOptions.partition_column] (not recommended). 
- -### Allowing connection to Greenplum master { #DBR-onetl-connection-db-connection-greenplum-prerequisites-allowing-connection-to-greenplum-master } - -Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master node, -e.g. by updating `pg_hba.conf` file. - -More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#allowing-connections-to-greenplum-database-0). - -### Set connection port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-connection-port } +Data should **NEVER** be send via Greenplum master (coordinator) using regular Postgres connector, as it's very easy to overload coordinator +by sending hundreds and thousands of gigabytes of data. -#### Connection port for Spark with `master=k8s` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-port-for-spark-with-masterk8s } +Instead, Greenplum connector uses [gpfdist protocol](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-external-g-using-the-greenplum-parallel-file-server--gpfdist-.html#about-gpfdist-setup-and-performance-1) with a bit complicated schema: -Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg) +- Spark driver → Greenplum master (get query column names and types, create target table) +- Spark executors → Greenplum master (create [EXTERNAL TABLEs](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/ref_guide-sql_commands-CREATE_EXTERNAL_TABLE.html)) +- Greenplum segments → Spark executors (send/fetch actual data via `EXTERNAL TABLE`) -#### Connection port for Spark with `master=yarn` or `master=local` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-port-for-spark-with-masteryarn-or-masterlocal } - -To read data from Greenplum 
using Spark, following ports should be opened in firewall between Spark and Greenplum:
-
-- Spark driver and all Spark executors -> port `5432` on Greenplum master node.
-
-  This port number should be set while connecting to Greenplum:
-
-  ```python
-  greenplum = Greenplum(host="master.host", port=5432, ...)
-  ```
-
-- Greenplum segments -> some port range (e.g. `41000-42000`) **listened by Spark executors**.
-
-  This range should be set in `extra` option:
-
-  ```python
-  greenplum = Greenplum(
-      ...,
-      extra={
-          "server.port": "41000-42000",
-      },
-  )
-  ```
-
-  Number of ports in this range is `number of parallel running Spark sessions` * `number of parallel connections per session`.
+More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/overview.html).
-  Number of connections per session (see below) is usually less than `30` (see above).
+
+## Configuring the connector { #DBR-onetl-connection-db-connection-greenplum-prerequisites-configuring-the-connector }
-  Number of session depends on your environment:
+
+Each Spark executor starts a `gpfdist` server, and each Greenplum **segment** connects to this server.
+Greenplum segment should know server's IP address/hostname and a port number.
-  - For `master=local` only few ones-tens sessions can be started on the same host, depends on available RAM and CPU.
-  - For `master=yarn` hundreds or thousands of sessions can be started simultaneously, but they are executing on different cluster nodes, so one port can be opened on different nodes at the same time.
+This target IP and port range should be added to firewall `ALLOW` rule on Spark host/cluster with sourceIP = Greenplum network.
+Otherwise connection cannot be established.
More details can be found in official documentation:
@@ -187,18 +74,15 @@ More details can be found in official documentation:
 - [format of server.port value](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.port)
 - [port troubleshooting](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#port-errors)
 
-### Set connection host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-connection-host }
+### spark.master=local { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local }
 
-#### Connection host for Spark with `master=k8s` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masterk8s }
+#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local-set-gpfdist-server-host }
 
-Please follow [the official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/configure.html#k8scfg)
+By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment.
+On some hosts it works as-is, without any additional configuration. On others it doesn't.
 
-#### Connection host for Spark with `master=local` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masterlocal }
-
-By default, Greenplum connector tries to resolve IP of current host, and then pass it as `gpfdist` URL to Greenplum segment.
-This may fail in some cases.
-
-For example, IP can be resolved using `/etc/hosts` content like this:
+The most common error is that Greenplum segment receives `127.0.0.1` IP address (loopback interface)
+This is usually caused by `/etc/hosts` content like this: ```text 127.0.0.1 localhost real-host-name @@ -223,19 +107,22 @@ error code = 111 (Connection refused); (seg3 slice1 12.34.56.78:10003 pid=12345 There are 2 ways to fix that: -- Explicitly pass your host IP address to connector, like this +- Explicitly pass your host IP address to connector, like this: ```python import os - # pass here real host IP (accessible from GP segments) - os.environ["HOST_IP"] = "192.168.1.1" + # host IP, accessible from GP segments + os.environ["SPARK_LOCAL_IP"] = "192.168.1.1" + + # !!!SET IP BEFORE CREATING SPARK SESSION!!! + spark = ... greenplum = Greenplum( ..., extra={ # connector will read IP from this environment variable - "server.hostEnv": "env.HOST_IP", + "server.hostEnv": "env.SPARK_LOCAL_IP", }, spark=spark, ) @@ -248,18 +135,39 @@ There are 2 ways to fix that: ```text 127.0.0.1 localhost # this IP should be accessible from GP segments - 192.168.1.1 driver-host-name + 192.168.1.1 real-host-name ``` - So Greenplum connector will properly resolve host IP. + This requires root privileges on host, not everyone can do this. + Also this doesn't work with dynamic IP addresses. -#### Connection host for Spark with `master=yarn` { #DBR-onetl-connection-db-connection-greenplum-prerequisites-connection-host-for-spark-with-masteryarn } +#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local-set-gpfdist-server-port } -The same issue with resolving IP address can occur on Hadoop cluster node, but it's tricky to fix, because each node has a different IP. +By default, Spark executors can start `gpfdist` server on *any* random port number. +You can limit port range using `extra` option: -There are 3 ways to fix that: +```python +greenplum = Greenplum( + ..., + extra={ + "server.port": "41000-42000", # !!! JUST AN EXAMPLE !!! 
+ }, +) +``` + +Number of ports in this range should be at least `number of parallel running Spark sessions on host` * `number of executors per session`. + +### spark.master=yarn { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn } + +#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn-set-gpfdist-server-host } + +By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment. +Usually there are no issues with that, connector just works as-is, without any adjustments. -- Pass node hostname to `gpfdist` URL. So IP will be resolved on segment side: +The most common error is that Greenplum segment receives `127.0.0.1` IP address (loopback interface) +instead of external IP of Hadoop data/compute node. There are 3 ways to fix it: + +- Pass node hostname instead of IP address to Greenplum segment: ```python greenplum = Greenplum( @@ -270,11 +178,11 @@ There are 3 ways to fix that: ) ``` - But this may fail if Hadoop cluster node hostname cannot be resolved from Greenplum segment side. + This may require configuring DNS on each Greenplum segment to properly resolve Hadoop node hostname → some IP. More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.usehostname). -- Set specific network interface to get IP address from: +- Set network interface name to get IP address from: ```python greenplum = Greenplum( @@ -301,24 +209,177 @@ There are 3 ways to fix that: valid_lft 83457sec preferred_lft 83457sec ``` - Note that in this case **each** Hadoop cluster node node should have network interface with name `eth0`. + Note that in this case **each** Hadoop cluster node node should have network interface with name `eth0`, + which may not be the case. 
More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.nic). -- Update `/etc/hosts` on each Hadoop cluster node to include real node IP: +- Update `/etc/hosts` on each Hadoop cluster node to include its IP address: ```text 127.0.0.1 localhost # this IP should be accessible from GP segments - 192.168.1.1 cluster-node-name + 192.168.1.1 real-host-name ``` - So Greenplum connector will properly resolve node IP. + This requires root privileges on host, not everyone can do this. + Also this doesn't work with dynamic IP addresses. + +#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn-set-gpfdist-server-port } + +By default, Spark executors can start `gpfdist` server on *any* random port number. +You can limit port range using `extra` option: + +```python +greenplum = Greenplum( + ..., + extra={ + "server.port": "41000-42000", # !!! JUST AN EXAMPLE !!! + }, +) +``` + +Number of ports in this range should be at least `number of parallel running Spark sessions per node` * `number of executors per session` / `number of Hadoop nodes`. 
+ +### spark.master=k8s { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-k8s } + +Before starting Spark session, you should to create a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object: + +```yaml title="ingress.yaml" +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: gpfdist-ingress + namespace: mynamespace + annotations: + nginx.ingress.kubernetes.io/ssl-redirect: "false" + nginx.ingress.kubernetes.io/force-ssl-redirect: "false" +spec: + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: gpfdist-default + port: + number: 50000 + +## RETURNED FROM K8S API RESPONSE ## +# status: +# loadBalancer: +# ingress: +# - ip: 11.22.33.44 +``` + +Then add special Spark listener to Spark session config, and specify ingress' load balancer IP or domain name with a port number: + +```python +spark = ( + SparkSession.builder.config("spark.master", "k8s://...") + .config("spark.extraListeners", "org.greenplum.GpfdistIngressListener") + .config("spark.kubernetes.namespace", "mynamespace") + .config("spark.greenplum.k8s.ingress.name", "gpfdist-ingress") # ingress name + .config("spark.greenplum.gpfdist.host", "11.22.33.44") # ingress IP/domain name + .config("spark.greenplum.gpfdist.listen-port", "50000") # ingress port + .config( + "spark.greenplum.gpfdist.is-ssl", "false" + ) # true for ingress with TLS enabled +).getOrCreate() +``` + +Set fixed port for `gpfdist` server to listen on: + +```python +greenplum = Greenplum( + ..., + extra={ + "server.port": "50000", # should match ingress port + }, +) +``` + +## Set number of connections { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-number-of-connections } + +!!! warning + + This is very important!!! 
+ + If you don't limit number of connections, you can exceed the [max_connections](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#limiting-concurrent-connections-2) + limit set on the Greenplum side. It's usually not so high, e.g. 500-1000 connections max, + depending on your Greenplum instance settings and using connection balancers like `pgbouncer`. + + Consuming all available connections means **nobody** (even admin users) can connect to Greenplum! + +Each task running on the Spark executor makes its own connection to Greenplum master node. +To avoid opening too many connections to Greenplum master (coordinator), you should limit number of tasks. + +- Reading about `5-10Gb` of data requires about `3-5` parallel connections. +- Reading about `20-30Gb` of data requires about `5-10` parallel connections. +- Reading about `50Gb` of data requires ~ `10-20` parallel connections. +- Reading about `100+Gb` of data requires `20-30` parallel connections. +- Opening more than `30-50` connections is not recommended. 
+ +Max number of parallel tasks is `N executors * N cores-per-executor`, so this can be adjusted using Spark session configuration: + +=== "Spark with master=local" + + ```python + spark = ( + SparkSession.builder + # Spark will run with 5 threads in local mode, allowing up to 5 parallel tasks + .config("spark.master", "local[5]") + ).getOrCreate() + + # Set connection pool size AT LEAST to number of executors + 1 for driver + Greenplum( + ..., + extra={ + "pool.maxSize": 6, # 5 executors + 1 driver + }, + ) + ``` + +=== "Spark with master=yarn or master=k8s, dynamic allocation" + + ```python + spark = ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start MAX 10 executors with 1 core each (dynamically), so max number of parallel jobs is 10 + .config("spark.dynamicAllocation.maxExecutors", 10) + .config("spark.executor.cores", 1) + ).getOrCreate() + ``` + +=== "Spark with master=yarn or master=k8s, static allocation" + + ```python + spark = ( + SparkSession.builder + .config("spark.master", "yarn") + # Spark will start EXACTLY 10 executors with 1 core each, so max number of parallel jobs is 10 + .config("spark.executor.instances", 10) + .config("spark.executor.cores", 1) + ).getOrCreate() + ``` + +See [connection pooling](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/using_the_connector.html#jdbcconnpool) +documentation. + +## Greenplum side adjustments { #DBR-onetl-connection-db-connection-greenplum-prerequisites-greenplum-side-adjustments } + +### Allow connecting to Greenplum master { #DBR-onetl-connection-db-connection-greenplum-prerequisites-allow-connecting-to-greenplum-master } + +Ask your Greenplum cluster administrator to allow your user to connect to Greenplum master (coordinator), +e.g. by updating `pg_hba.conf` file. 
+ +More details can be found in [official documentation](https://docs.vmware.com/en/VMware-Greenplum/7/greenplum-database/admin_guide-client_auth.html#limiting-concurrent-connections#allowing-connections-to-greenplum-database-0). -### Set required grants { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-required-grants } +### Provide required grants { #DBR-onetl-connection-db-connection-greenplum-prerequisites-provide-required-grants } -Ask your Greenplum cluster administrator to set following grants for a user, -used for creating a connection: +Ask your Greenplum cluster administrator to set following grants for a user: === "Read + Write" diff --git a/mddocs/docs/connection/db_connection/greenplum/types.md b/mddocs/docs/connection/db_connection/greenplum/types.md index 9e131abc4..f28338a5a 100644 --- a/mddocs/docs/connection/db_connection/greenplum/types.md +++ b/mddocs/docs/connection/db_connection/greenplum/types.md @@ -33,7 +33,8 @@ This is how Greenplum connector performs this: See [Explicit type cast][DBR-onetl-connection-db-connection-greenplum-types-explicit-type-cast]. - Find corresponding `Spark type` → `Greenplumtype (write)` combination (see below) for each DataFrame column. If no combination is found, raise exception. - If `Greenplumtype (write)` match `Greenplum type (read)`, no additional casts will be performed, DataFrame column will be written to Greenplum as is. -- If `Greenplumtype (write)` does not match `Greenplum type (read)`, DataFrame column will be casted to target column type **on Greenplum side**. For example, you can write column with text data to `json` column which Greenplum connector currently does not support. +- If `Greenplumtype (write)` does not match `Greenplum type (read)`, DataFrame column will be casted to target column type **on Greenplum side**. + For example, you can write column with text data to column of `json` type (which Greenplum connector currently does not support). 
### Create new table using Spark { #DBR-onetl-connection-db-connection-greenplum-types-create-new-table-using-spark } @@ -257,7 +258,7 @@ For example, you can use [to_json](https://www.postgresql.org/docs/current/funct ### `DBWriter` { #DBR-onetl-connection-db-connection-greenplum-types-dbwriter } -To write data to a `text` or `json` column in a Greenplum table, use [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. +To write data to a column of `text` or `json` types in some Greenplum table, use [JSON.serialize_column][onetl.file.format.json.JSON.serialize_column] method. ```python from onetl.connection import Greenplum diff --git a/mddocs/docs/connection/db_connection/hive/prerequisites.md b/mddocs/docs/connection/db_connection/hive/prerequisites.md index 8af35b609..b6d37203d 100644 --- a/mddocs/docs/connection/db_connection/hive/prerequisites.md +++ b/mddocs/docs/connection/db_connection/hive/prerequisites.md @@ -12,8 +12,8 @@ - Hive Metastore version: - Officially declared: 0.12 - 3.1.3 (may require to add proper .jar file explicitly) - Actually tested: 1.2.100, 2.3.10, 3.1.3 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html). diff --git a/mddocs/docs/connection/db_connection/hive/read.md b/mddocs/docs/connection/db_connection/hive/read.md index 0bff9278a..7e52bbda0 100644 --- a/mddocs/docs/connection/db_connection/hive/read.md +++ b/mddocs/docs/connection/db_connection/hive/read.md @@ -12,7 +12,7 @@ but does not support custom queries, like `JOIN`. 
- ✅︎ [Incremental strategy][DBR-onetl-connection-db-connection-clickhouse-read-incremental-strategy] - ✅︎ [Snapshot batch strategy][DBR-onetl-strategy-snapshot-batch-strategy] - ✅︎ [Incremental batch strategy][DBR-onetl-strategy-incremental-batch-strategy] -- ❌ `hint` (is not supported by Hive) +- ✅︎ `hint` - ❌ `df_schema` - ❌ `options` (only Spark config params are used) @@ -68,10 +68,6 @@ Prefer these write formats: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) -- [Iceberg](https://iceberg.apache.org/spark-quickstart/) -- [Hudi](https://hudi.apache.org/docs/quick-start-guide/) -- [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) - For colum-based write formats, each file contains separated sections there column data is stored. The file footer contains location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, to drastically speed up the query. Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. diff --git a/mddocs/docs/connection/db_connection/hive/sql.md b/mddocs/docs/connection/db_connection/hive/sql.md index 0bc0d2ca9..c691f5f07 100644 --- a/mddocs/docs/connection/db_connection/hive/sql.md +++ b/mddocs/docs/connection/db_connection/hive/sql.md @@ -43,10 +43,6 @@ Prefer these write formats: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) -- [Iceberg](https://iceberg.apache.org/spark-quickstart/) -- [Hudi](https://hudi.apache.org/docs/quick-start-guide/) -- [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) - For colum-based write formats, each file contains separated sections there column data is stored. 
The file footer contains location of each column section/group. Spark can use this information to load only sections required by specific query, e.g. only selected columns, to drastically speed up the query. Another advantage is high compression ratio, e.g. 10x-100x in comparison to JSON or CSV. diff --git a/mddocs/docs/connection/db_connection/hive/write.md b/mddocs/docs/connection/db_connection/hive/write.md index 5772037bd..01c0e3a7c 100644 --- a/mddocs/docs/connection/db_connection/hive/write.md +++ b/mddocs/docs/connection/db_connection/hive/write.md @@ -46,9 +46,6 @@ Prefer these write formats: - [ORC](https://spark.apache.org/docs/latest/sql-data-sources-orc.html) (**default**) - [Parquet](https://spark.apache.org/docs/latest/sql-data-sources-parquet.html) -- [Iceberg](https://iceberg.apache.org/spark-quickstart/) -- [Hudi](https://hudi.apache.org/docs/quick-start-guide/) -- [Delta](https://docs.delta.io/latest/quick-start.html#set-up-apache-spark-with-delta-lake) !!! warning When using `DBWriter`, the default spark data format configured in `spark.sql.sources.default` is ignored, as `Hive.WriteOptions(format=...)` default value is explicitly set to `orc`. diff --git a/mddocs/docs/connection/db_connection/kafka/prerequisites.md b/mddocs/docs/connection/db_connection/kafka/prerequisites.md index 417c3ea89..bb3271e23 100644 --- a/mddocs/docs/connection/db_connection/kafka/prerequisites.md +++ b/mddocs/docs/connection/db_connection/kafka/prerequisites.md @@ -5,8 +5,8 @@ - Kafka server versions: - Officially declared: 0.10 or higher - Actually tested: 3.2.3, 3.9.0 (only Kafka 3.x supports message headers) -- Spark versions: 2.4.x - 3.5.x -- Java versions: 8 - 17 +- Spark versions: 2.4.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html). 
diff --git a/mddocs/docs/connection/db_connection/kafka/read.md b/mddocs/docs/connection/db_connection/kafka/read.md
index 6e6aa39f0..5d60ee7aa 100644
--- a/mddocs/docs/connection/db_connection/kafka/read.md
+++ b/mddocs/docs/connection/db_connection/kafka/read.md
@@ -48,6 +48,14 @@ This could be done using following methods:
 - [CSV.parse_column][onetl.file.format.csv.CSV.parse_column]
 - [XML.parse_column][onetl.file.format.xml.XML.parse_column]
 
+Or any other method provided by Spark or third-party libraries which can parse `BinaryType()` column into useful data.
+
+## GroupIds and offsets
+
+Regular Kafka consumers use `subscribe(topic)` method to notify Kafka that some new data from Kafka should be sent to the consumer if available. Offsets read by group are committed to Kafka, to guarantee at-least-once even if consumer failed somewhere.
+
+Spark connector for Kafka is very different. It uses `assign(topic)` to read data manually from a topic. It doesn't commit offsets to Kafka, as the same data can be read multiple times, e.g. task failed and lost all its memory, new task will read this data again.
+
 ## Examples { #DBR-onetl-connection-db-connection-kafka-read-examples }
 
 Snapshot strategy, `value` is Avro binary data:
diff --git a/mddocs/docs/connection/db_connection/kafka/troubleshooting.md b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md
index c0c926290..2e2c1ee53 100644
--- a/mddocs/docs/connection/db_connection/kafka/troubleshooting.md
+++ b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md
@@ -7,3 +7,13 @@
 ## Cannot connect using `SSL` protocol { #DBR-onetl-connection-db-connection-kafka-troubleshooting-cannot-connect-using-ssl-protocol }
 
 Please check that certificate files are not Base-64 encoded.
+
+## Group authorization failed
+
+Before Spark 3.4.0, Kafka connector read topic offsets using Consumer API. To ensure that each time offsets fetched from Kafka are fresh, Spark driver generates random `groupId`, and passes it to Kafka.
If Kafka ACL limits which groupIds can access specific topic, this will fail. + +To prevent this, explicitly pass groupId `Kafka(extra={"group.id": "something"})`, matching the ACL rule. + +## Spark driver hangs while fetching offsets from Kafka + +This may be the case on Spark 3.2.x - 3.3.x where Spark driver uses Consumer API to fetch offsets. Since [Spark 3.4.0](https://issues.apache.org/jira/browse/SPARK-40844) connector uses Admin API. You can force Spark to use Admin API by setting Spark session config `spark.sql.streaming.kafka.useDeprecatedOffsetFetching=false`. diff --git a/mddocs/docs/connection/db_connection/mongodb/prerequisites.md b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md index fb74cb9c2..4e8404c49 100644 --- a/mddocs/docs/connection/db_connection/mongodb/prerequisites.md +++ b/mddocs/docs/connection/db_connection/mongodb/prerequisites.md @@ -4,9 +4,9 @@ - MongoDB server versions: - Officially declared: 4.0 or higher - - Actually tested: 4.0.0, 8.0.4 -- Spark versions: 3.2.x - 3.5.x -- Java versions: 8 - 20 + - Actually tested: 4.0.0, 8.2.2 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://www.mongodb.com/docs/spark-connector/). diff --git a/mddocs/docs/connection/db_connection/mongodb/types.md b/mddocs/docs/connection/db_connection/mongodb/types.md index 7e5cd219f..3612b6f4d 100644 --- a/mddocs/docs/connection/db_connection/mongodb/types.md +++ b/mddocs/docs/connection/db_connection/mongodb/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. 
## Type detection & casting { #DBR-onetl-connection-db-connection-mongodb-types-type-detection-casting } @@ -72,8 +72,8 @@ It is highly recommended to pass `df_schema` explicitly, to avoid type conversio Here you can find source code with type conversions: -- [MongoDB -> Spark](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/InferSchema.java#L214-L260) -- [Spark -> MongoDB](https://github.com/mongodb/mongo-spark/blob/r10.4.1/src/main/java/com/mongodb/spark/sql/connector/schema/RowToBsonDocumentConverter.java#L157-L260) +- [MongoDB -> Spark](https://github.com/mongodb/mongo-spark/blob/r10.5.0/src/main/java/com/mongodb/spark/sql/connector/schema/InferSchema.java#L214-L260) +- [Spark -> MongoDB](https://github.com/mongodb/mongo-spark/blob/r10.5.0/src/main/java/com/mongodb/spark/sql/connector/schema/RowToBsonDocumentConverter.java#L157-L260) ## Supported types { #DBR-onetl-connection-db-connection-mongodb-types-supported-types } diff --git a/mddocs/docs/connection/db_connection/mssql/execute.md b/mddocs/docs/connection/db_connection/mssql/execute.md index c4b8c0ba2..4005a335f 100644 --- a/mddocs/docs/connection/db_connection/mssql/execute.md +++ b/mddocs/docs/connection/db_connection/mssql/execute.md @@ -16,7 +16,6 @@ Use this method to perform some `SELECT` query which returns **small number or r Method accepts [MSSQL.FetchOptions][onetl.connection.db_connection.mssql.options.MSSQLFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -52,7 +51,6 @@ Use this method to execute DDL and DML operations. Each method call runs operati Method accepts [MSSQL.ExecuteOptions][onetl.connection.db_connection.mssql.options.MSSQLExecuteOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. 
#### Syntax support in `MSSQL.execute` { #DBR-onetl-connection-db-connection-mssql-execute-syntax-support-in-mssql-execute } diff --git a/mddocs/docs/connection/db_connection/mssql/prerequisites.md b/mddocs/docs/connection/db_connection/mssql/prerequisites.md index f233950d2..a54855708 100644 --- a/mddocs/docs/connection/db_connection/mssql/prerequisites.md +++ b/mddocs/docs/connection/db_connection/mssql/prerequisites.md @@ -3,10 +3,10 @@ ## Version Compatibility { #DBR-onetl-connection-db-connection-mssql-prerequisites-version-compatibility } - SQL Server versions: - - Officially declared: 2016 - 2022 - - Actually tested: 2017, 2022 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 + - Officially declared: 2016 - 2025 + - Actually tested: 2017, 2025 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://learn.microsoft.com/en-us/sql/connect/jdbc/system-requirements-for-the-jdbc-driver) and [official compatibility matrix](https://learn.microsoft.com/en-us/sql/connect/jdbc/microsoft-jdbc-driver-for-sql-server-support-matrix). diff --git a/mddocs/docs/connection/db_connection/mssql/types.md b/mddocs/docs/connection/db_connection/mssql/types.md index 2c9040f65..961662b87 100644 --- a/mddocs/docs/connection/db_connection/mssql/types.md +++ b/mddocs/docs/connection/db_connection/mssql/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. 
## Type detection & casting { #DBR-onetl-connection-db-connection-mssql-types-type-detection-casting } @@ -94,8 +94,8 @@ Always prefer creating tables with specific types **BEFORE WRITING DATA**: Here you can find source code with type conversions: - [MSSQL -> JDBC](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/SQLServerResultSetMetaData.java#L117-L170) -- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L135-L152) -- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L154-L163) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L135-L152) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala#L154-L163) - [JDBC -> MSSQL](https://github.com/microsoft/mssql-jdbc/blob/v12.2.0/src/main/java/com/microsoft/sqlserver/jdbc/DataTypes.java#L625-L676) ## Supported types { #DBR-onetl-connection-db-connection-mssql-types-supported-types } diff --git a/mddocs/docs/connection/db_connection/mysql/execute.md b/mddocs/docs/connection/db_connection/mysql/execute.md index b6d96169f..b731a7862 100644 --- a/mddocs/docs/connection/db_connection/mysql/execute.md +++ b/mddocs/docs/connection/db_connection/mysql/execute.md @@ -16,7 +16,6 @@ Use this method to perform some `SELECT` query which returns **small number or r Method accepts [MySQL.FetchOptions][onetl.connection.db_connection.mysql.options.MySQLFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -53,7 +52,6 @@ Use this method to execute DDL and DML operations. 
Each method call runs operati Method accepts [MySQL.ExecuteOptions][onetl.connection.db_connection.mysql.options.MySQLExecuteOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. #### Syntax support in `MySQL.execute` { #DBR-onetl-connection-db-connection-mysql-execute-syntax-support-in-mysql-execute } diff --git a/mddocs/docs/connection/db_connection/mysql/prerequisites.md b/mddocs/docs/connection/db_connection/mysql/prerequisites.md index 0f3fdf27b..13b755e60 100644 --- a/mddocs/docs/connection/db_connection/mysql/prerequisites.md +++ b/mddocs/docs/connection/db_connection/mysql/prerequisites.md @@ -3,10 +3,10 @@ ## Version Compatibility { #DBR-onetl-connection-db-connection-mysql-prerequisites-version-compatibility } - MySQL server versions: - - Officially declared: 8.0 - 9.2 - - Actually tested: 5.7.13, 9.2.0 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 + - Officially declared: 8.0 - 9.5 + - Actually tested: 5.7.16, 9.5.0 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://dev.mysql.com/doc/connector-j/en/connector-j-versions.html). diff --git a/mddocs/docs/connection/db_connection/mysql/types.md b/mddocs/docs/connection/db_connection/mysql/types.md index d5502caca..78b194ba2 100644 --- a/mddocs/docs/connection/db_connection/mysql/types.md +++ b/mddocs/docs/connection/db_connection/mysql/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. 
## Type detection & casting { #DBR-onetl-connection-db-connection-mysql-types-type-detection-casting } @@ -92,8 +92,8 @@ Always prefer creating tables with specific types **BEFORE WRITING DATA**: Here you can find source code with type conversions: - [MySQL -> JDBC](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L44-L623) -- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132) -- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L104-L132) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MySQLDialect.scala#L204-L211) - [JDBC -> MySQL](https://github.com/mysql/mysql-connector-j/blob/8.0.33/src/main/core-api/java/com/mysql/cj/MysqlType.java#L625-L867) ## Supported types { #DBR-onetl-connection-db-connection-mysql-types-supported-types } diff --git a/mddocs/docs/connection/db_connection/oracle/execute.md b/mddocs/docs/connection/db_connection/oracle/execute.md index d2d8413c4..a104cb317 100644 --- a/mddocs/docs/connection/db_connection/oracle/execute.md +++ b/mddocs/docs/connection/db_connection/oracle/execute.md @@ -17,7 +17,6 @@ Oracle config, or reading data from some reference table. Method returns Spark D Method accepts [Oracle.FetchOptions][onetl.connection.db_connection.oracle.options.OracleFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -54,7 +53,6 @@ Use this method to execute DDL and DML operations. Each method call runs operati Method accepts [Oracle.ExecuteOptions][onetl.connection.db_connection.oracle.options.OracleExecuteOptions]. 
-Connection opened using this method should be then closed with `connection.close()` or `with connection:`. #### Syntax support in `Oracle.execute` { #DBR-onetl-connection-db-connection-oracle-execute-syntax-support-in-oracle-execute } diff --git a/mddocs/docs/connection/db_connection/oracle/prerequisites.md b/mddocs/docs/connection/db_connection/oracle/prerequisites.md index dd712356d..72582da33 100644 --- a/mddocs/docs/connection/db_connection/oracle/prerequisites.md +++ b/mddocs/docs/connection/db_connection/oracle/prerequisites.md @@ -4,9 +4,9 @@ - Oracle Server versions: - Officially declared: 19c, 21c, 23ai - - Actually tested: 11.2, 23.5 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 + - Actually tested: 11.2, 23.26 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://www.oracle.com/cis/database/technologies/appdev/jdbc-downloads.html). diff --git a/mddocs/docs/connection/db_connection/oracle/types.md b/mddocs/docs/connection/db_connection/oracle/types.md index 171cae8e3..7bad8b577 100644 --- a/mddocs/docs/connection/db_connection/oracle/types.md +++ b/mddocs/docs/connection/db_connection/oracle/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. 
## Type detection & casting { #DBR-onetl-connection-db-connection-oracle-types-type-detection-casting } @@ -92,8 +92,8 @@ See [List of Oracle types](https://docs.oracle.com/en/database/oracle/oracle-dat Here you can find source code with type conversions: -- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L83-L109) -- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L111-L123) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L83-L109) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/OracleDialect.scala#L111-L123) ### Numeric types { #DBR-onetl-connection-db-connection-oracle-types-numeric-types } diff --git a/mddocs/docs/connection/db_connection/postgres/execute.md b/mddocs/docs/connection/db_connection/postgres/execute.md index d7695b724..7912478af 100644 --- a/mddocs/docs/connection/db_connection/postgres/execute.md +++ b/mddocs/docs/connection/db_connection/postgres/execute.md @@ -16,7 +16,6 @@ Use this method to execute some `SELECT` query which returns **small number or r Method accepts [Postgres.FetchOptions][onetl.connection.db_connection.postgres.options.PostgresFetchOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. !!! warning @@ -51,7 +50,6 @@ Use this method to execute DDL and DML operations. Each method call runs operati Method accepts [Postgres.ExecuteOptions][onetl.connection.db_connection.postgres.options.PostgresExecuteOptions]. -Connection opened using this method should be then closed with `connection.close()` or `with connection:`. 
#### Syntax support in `Postgres.execute` { #DBR-onetl-connection-db-connection-postgres-execute-syntax-support-in-postgres-execute } diff --git a/mddocs/docs/connection/db_connection/postgres/prerequisites.md b/mddocs/docs/connection/db_connection/postgres/prerequisites.md index 8382f4232..e0bfe5708 100644 --- a/mddocs/docs/connection/db_connection/postgres/prerequisites.md +++ b/mddocs/docs/connection/db_connection/postgres/prerequisites.md @@ -3,10 +3,10 @@ ## Version Compatibility { #DBR-onetl-connection-db-connection-postgres-prerequisites-version-compatibility } - PostgreSQL server versions: - - Officially declared: 8.2 - 17 - - Actually tested: 9.4.26, 17.3 -- Spark versions: 2.3.x - 3.5.x -- Java versions: 8 - 20 + - Officially declared: 8.4 - 18 + - Actually tested: 9.4.26, 18.1 +- Spark versions: 3.2.x - 4.1.x +- Java versions: 8 - 22 See [official documentation](https://jdbc.postgresql.org/). diff --git a/mddocs/docs/connection/db_connection/postgres/types.md b/mddocs/docs/connection/db_connection/postgres/types.md index d47922258..037ffabdd 100644 --- a/mddocs/docs/connection/db_connection/postgres/types.md +++ b/mddocs/docs/connection/db_connection/postgres/types.md @@ -2,7 +2,7 @@ !!! note - The results below are valid for Spark 3.5.5, and may differ on other Spark versions. + The results below are valid for Spark 3.5.8, and may differ on other Spark versions. 
## Type detection & casting { #DBR-onetl-connection-db-connection-postgres-types-type-detection-casting } @@ -100,8 +100,8 @@ See [List of Postgres types](https://www.postgresql.org/docs/current/datatype.ht Here you can find source code with type conversions: - [Postgres <-> JDBC](https://github.com/pgjdbc/pgjdbc/blob/REL42.6.0/pgjdbc/src/main/java/org/postgresql/jdbc/TypeInfoCache.java#L78-L112) -- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L52-L108) -- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.5/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L118-L132) +- [JDBC -> Spark](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L52-L108) +- [Spark -> JDBC](https://github.com/apache/spark/blob/v3.5.8/sql/core/src/main/scala/org/apache/spark/sql/jdbc/PostgresDialect.scala#L118-L132) ### Numeric types { #DBR-onetl-connection-db-connection-postgres-types-numeric-types } diff --git a/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md b/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md index 6d4eadcc1..cb4c40668 100644 --- a/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md +++ b/mddocs/docs/connection/file_df_connection/spark_hdfs/prerequisites.md @@ -2,8 +2,8 @@ ## Version Compatibility { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites-version-compatibility } -- Hadoop versions: 2.x, 3.x (only with Hadoop 3.x libraries) -- Spark versions: 2.3.x - 3.5.x +- Hadoop versions: 2.x, 3.x +- Spark versions: 3.2.x - 3.5.x - Java versions: 8 - 20 ## Installing PySpark { #DBR-onetl-connection-file-df-connection-spark-hdfs-prerequisites-installing-pyspark } diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md b/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md index 
5c4ee6626..52966b1c5 100644 --- a/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md +++ b/mddocs/docs/connection/file_df_connection/spark_s3/prerequisites.md @@ -2,7 +2,7 @@ ## Version Compatibility { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-version-compatibility } -- Spark versions: 3.2.x - 3.5.x (only with Hadoop 3.x libraries) +- Spark versions: 3.2.x - 3.5.x - Java versions: 8 - 20 ## Installing PySpark { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-installing-pyspark } @@ -21,7 +21,7 @@ AWS and some other S3 cloud providers allows bucket access using domain style on Other implementations, like Minio, by default allows path style access only, e.g. `https://s3provider.com/mybucket` (see [MINIO_DOMAIN](https://min.io/docs/minio/linux/reference/minio-server/minio-server.html#envvar.MINIO_DOMAIN)). -You should set `path.style.access` to `True` or `False`, to choose the preferred style. +You should set `path_style_access` to `True` or `False`, to choose the preferred style. ### Authentication { #DBR-onetl-connection-file-df-connection-spark-s3-prerequisites-authentication } diff --git a/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md index 5e5f9bc13..1520058c1 100644 --- a/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md +++ b/mddocs/docs/connection/file_df_connection/spark_s3/troubleshooting.md @@ -234,7 +234,7 @@ spark_s3 = SparkS3( But is is **NOT** recommended. 
-#### Accessing S3 without domain-style access style support { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-accessing-s3-without-domain-style-access-style-support } +#### Accessing S3 without path-style access style support { #DBR-onetl-connection-file-df-connection-spark-s3-troubleshooting-accessing-s3-without-path-style-access-style-support } ```text Caused by: java.net.UnknownHostException: my-bucket.s3provider.com @@ -247,9 +247,7 @@ spark_s3 = SparkS3( host="s3provider.com", bucket="my-bucket", ..., - extra={ - "path.style.access": True, - }, + path_style_access=True, ) ``` From 30d21fec3e2fff749486a3d40af4e9f76bd9518c Mon Sep 17 00:00:00 2001 From: sga Date: Thu, 9 Apr 2026 00:07:29 +0300 Subject: [PATCH 26/28] fixes for contributing guide, quickstart & install guides + some small fixes for greenplum & kafka docs --- mddocs/docs/changelog/index.md | 4 ++ .../db_connection/greenplum/prerequisites.md | 16 +++--- .../connection/db_connection/kafka/read.md | 2 +- .../db_connection/kafka/troubleshooting.md | 4 +- mddocs/docs/contributing.md | 53 +++++-------------- mddocs/docs/include_0.md | 10 ++-- mddocs/docs/install/files.md | 6 +-- mddocs/docs/install/full.md | 4 +- mddocs/docs/install/kerberos.md | 2 +- mddocs/docs/install/spark.md | 8 +-- mddocs/docs/quickstart.md | 38 ++++++------- 11 files changed, 64 insertions(+), 83 deletions(-) diff --git a/mddocs/docs/changelog/index.md b/mddocs/docs/changelog/index.md index 62a6c6eb2..01026b43c 100644 --- a/mddocs/docs/changelog/index.md +++ b/mddocs/docs/changelog/index.md @@ -1,5 +1,9 @@ # Changelog { #DBR-onetl-changelog } +- [0.15.0 (2025-12-08)][DBR-onetl-changelog-0-15-0] +- [0.14.1 (2025-11-25)][DBR-onetl-changelog-0-14-1] +- [0.14.0 (2025-09-08)][DBR-onetl-changelog-0-14-0] +- [0.13.5 (2025-04-14)][DBR-onetl-changelog-0-13-5] - [0.13.4 (2025-03-20)][DBR-onetl-changelog-0-13-4] - [0.13.3 (2025-03-11)][DBR-onetl-changelog-0-13-3] - [0.13.1 (2025-03-06)][DBR-onetl-changelog-0-13-1] diff 
--git a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md index e840f0d30..511b43c77 100644 --- a/mddocs/docs/connection/db_connection/greenplum/prerequisites.md +++ b/mddocs/docs/connection/db_connection/greenplum/prerequisites.md @@ -17,7 +17,7 @@ BEFORE creating the connector instance. See [installation instruction][DBR-onetl-install-spark] for more details. -## Download VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-downloading-vmware-package } +## Download VMware package { #DBR-onetl-connection-db-connection-greenplum-prerequisites-download-vmware-package } To use Greenplum connector you should download connector `.jar` file from [VMware website](https://network.tanzu.vmware.com/products/vmware-greenplum#/releases/1413479/file_groups/16966) @@ -74,9 +74,9 @@ More details can be found in official documentation: - [format of server.port value](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/options.html#server.port) - [port troubleshooting](https://docs.vmware.com/en/VMware-Greenplum-Connector-for-Apache-Spark/2.3/greenplum-connector-spark/troubleshooting.html#port-errors) -### spark.master=local { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local } +### spark.master=local { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masterlocal } -#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local-set-gpfdist-server-host } +#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-host-0 } By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment. On some hosts it works as-is, without any additional configuration. In others it's not. 
@@ -141,7 +141,7 @@ There are 2 ways to fix that: This requires root privileges on host, not everyone can do this. Also this doesn't work with dynamic IP addresses. -#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-local-set-gpfdist-server-port } +#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-port-0 } By default, Spark executors can start `gpfdist` server on *any* random port number. You can limit port range using `extra` option: @@ -157,9 +157,9 @@ greenplum = Greenplum( Number of ports in this range should be at least `number of parallel running Spark sessions on host` * `number of executors per session`. -### spark.master=yarn { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn } +### spark.master=yarn { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masteryarn } -#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn-set-gpfdist-server-host } +#### Set `gpfdist` server host { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-host-1 } By default, Greenplum connector tries to resolve current host IP, and then pass it to Greenplum segment. Usually there are no issues with that, connector just works as-is, without any adjustments. @@ -225,7 +225,7 @@ instead of external IP of Hadoop data/compute node. There are 3 ways to fix it: This requires root privileges on host, not everyone can do this. Also this doesn't work with dynamic IP addresses. -#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-yarn-set-gpfdist-server-port } +#### Set `gpfdist` server port { #DBR-onetl-connection-db-connection-greenplum-prerequisites-set-gpfdist-server-port-1 } By default, Spark executors can start `gpfdist` server on *any* random port number. 
You can limit port range using `extra` option: @@ -241,7 +241,7 @@ greenplum = Greenplum( Number of ports in this range should be at least `number of parallel running Spark sessions per node` * `number of executors per session` / `number of Hadoop nodes`. -### spark.master=k8s { #DBR-onetl-connection-db-connection-greenplum-prerequisites-sparkmaster-k8s } +### spark.master=k8s { #DBR-onetl-connection-db-connection-greenplum-prerequisites-spark-masterk8s } Before starting Spark session, you should to create a Kubernetes [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/) object: diff --git a/mddocs/docs/connection/db_connection/kafka/read.md b/mddocs/docs/connection/db_connection/kafka/read.md index 5d60ee7aa..74374fd02 100644 --- a/mddocs/docs/connection/db_connection/kafka/read.md +++ b/mddocs/docs/connection/db_connection/kafka/read.md @@ -50,7 +50,7 @@ This could be done using following methods: Or any other method provided by Spark or third-party libraries which can parse `BinaryType()` column into useful data. -## GroupIds and offsets +## GroupIds and offsets { #DBR-onetl-connection-db-connection-kafka-read-groupids-and-offsets } Regular Kafka consumers use `subscribe(topic)` method to notify Kafka that some new data from Kafka should be sent to consumer if available. Offsets read by group are committed to Kafka, to guarantee at-least-once even if consumer failed somewhere. diff --git a/mddocs/docs/connection/db_connection/kafka/troubleshooting.md b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md index 2e2c1ee53..92fe21286 100644 --- a/mddocs/docs/connection/db_connection/kafka/troubleshooting.md +++ b/mddocs/docs/connection/db_connection/kafka/troubleshooting.md @@ -8,12 +8,12 @@ Please check that certificate files are not Base-64 encoded. 
-## Group authorization failed +## Group authorization failed { #DBR-onetl-connection-db-connection-kafka-troubleshooting-group-authorization-failed } Before Spark 3.4.0, Kafka connector read topic offsets using Consumer API. To ensure that each time offsets fetched from Kafka are fresh, Spark driver generates random `groupId`, and passes it to Kafka. If Kafka ACL limits which groupIds can access specific topic, this will fail. To prevent this, explicitly pass groupId `Kafka(extra={"group.id": "something"})`, matching the ACL rule. -## Spark driver hangs while fetching offsets from Kafka +## Spark driver hangs while fetching offsets from Kafka { #DBR-onetl-connection-db-connection-kafka-troubleshooting-spark-driver-hangs-while-fetching-offsets-from-kafka } This may be the case on Spark 3.2.x - 3.3.x where Spark driver uses Consumer API to fetch offsets. Since [Spark 3.4.0](https://issues.apache.org/jira/browse/SPARK-40844) connector uses Admin API. You can force Spark to use Admin API by setting Spark session config `spark.sql.streaming.kafka.useDeprecatedOffsetFetching=false`. diff --git a/mddocs/docs/contributing.md b/mddocs/docs/contributing.md index 406d17f73..d6506915e 100644 --- a/mddocs/docs/contributing.md +++ b/mddocs/docs/contributing.md @@ -9,7 +9,7 @@ project. We should keep close to these items during development: -* Some companies still use old Spark versions, like 2.3.1. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. +* Some companies still use old Spark versions, like 3.2.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. * Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional. * Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition. 
@@ -27,7 +27,7 @@ Please follow [instruction](https://docs.github.com/en/get-started/quickstart/fo ### Clone the repo { #DBR-onetl-contributing-clone-the-repo } -Open terminal and run these commands: +Open terminal and run these commands to clone a **forked** repo: ```bash git clone git@github.com:myuser/onetl.git -b develop @@ -40,33 +40,7 @@ cd onetl Create virtualenv and install dependencies: ```bash -python -m venv venv -source venv/bin/activate -pip install -U wheel -pip install -U pip setuptools -pip install -U \ - -r requirements/core.txt \ - -r requirements/ftp.txt \ - -r requirements/hdfs.txt \ - -r requirements/kerberos.txt \ - -r requirements/s3.txt \ - -r requirements/sftp.txt \ - -r requirements/webdav.txt \ - -r requirements/dev.txt \ - -r requirements/docs.txt \ - -r requirements/tests/base.txt \ - -r requirements/tests/clickhouse.txt \ - -r requirements/tests/kafka.txt \ - -r requirements/tests/mongodb.txt \ - -r requirements/tests/mssql.txt \ - -r requirements/tests/mysql.txt \ - -r requirements/tests/postgres.txt \ - -r requirements/tests/oracle.txt \ - -r requirements/tests/pydantic-2.txt \ - -r requirements/tests/spark-3.5.5.txt - -# TODO: remove after https://github.com/zqmillet/sphinx-plantuml/pull/4 -pip install sphinx-plantuml --no-deps +make venv-install ``` ### Enable pre-commit hooks { #DBR-onetl-contributing-enable-pre-commit-hooks } @@ -74,13 +48,13 @@ pip install sphinx-plantuml --no-deps Install pre-commit hooks: ```bash -pre-commit install --install-hooks +prek install --install-hooks ``` Test pre-commit hooks run: ```bash -pre-commit run +prek run ``` ## How to { #DBR-onetl-contributing-how-to } @@ -110,13 +84,13 @@ docker-compose --profile mongodb up -d Run tests: ```bash -docker-compose run --rm onetl ./run_tests.sh +docker-compose run --rm onetl pytest ``` You can pass additional arguments, they will be passed to pytest: ```bash -docker-compose run --rm onetl ./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO 
+docker-compose run --rm onetl pytest -m mongodb -lsx -vvvv --log-cli-level=INFO ``` You can run interactive bash session and use it: @@ -124,7 +98,7 @@ You can run interactive bash session and use it: ```bash docker-compose run --rm onetl bash -./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +pytest -m mongodb -lsx -vvvv --log-cli-level=INFO ``` See logs of test container: @@ -178,22 +152,23 @@ You can run limited set of dependencies: docker-compose --profile mongodb up -d ``` -Load environment variables with connection properties: +Run core tests: ```bash -source .env.local +make test-core ``` -Run tests: +Run specific connection tests: ```bash -./run_tests.sh +make test-spark PYTEST_ARGS="-m mongodb" +make test-no-spark PYTEST_ARGS="-m ftp" ``` You can pass additional arguments, they will be passed to pytest: ```bash -./run_tests.sh -m mongodb -lsx -vvvv --log-cli-level=INFO +make test-spark PYTEST_ARGS="-m mongodb -lsx -vvvv --log-cli-level=INFO" ``` Stop all containers and remove created volumes: diff --git a/mddocs/docs/include_0.md b/mddocs/docs/include_0.md index 7bfa97234..fb23f1c4f 100644 --- a/mddocs/docs/include_0.md +++ b/mddocs/docs/include_0.md @@ -1,8 +1,8 @@ -## What is onETL? { #DBR-onetl-snippet-0-what-is-onetl } +## What is onETL? { #DBR-onetl-include-0-what-is-onetl } Python ETL/ELT library powered by [Apache Spark](https://spark.apache.org/) & other open-source tools. -## Goals { #DBR-onetl-snippet-0-goals } +## Goals { #DBR-onetl-include-0-goals } - Provide unified classes to extract data from (**E**) & load data to (**L**) various stores. - Provides [Spark DataFrame API](https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.html) for performing transformations (**T**) in terms of *ETL*. 
@@ -10,21 +10,21 @@ Python ETL/ELT library powered by [Apache Spark](https://spark.apache.org/) & ot - Support different [read strategies][DBR-onetl-strategy-read-strategies] for incremental and batch data fetching. - Provide [hooks][DBR-onetl-hooks] & [plugins][DBR-onetl-plugins] mechanism for altering behavior of internal classes. -## Non-goals { #DBR-onetl-snippet-0-non-goals } +## Non-goals { #DBR-onetl-include-0-non-goals } - onETL is not a Spark replacement. It just provides additional functionality that Spark does not have, and improves UX for end users. - onETL is not a framework, as it does not have requirements to project structure, naming, the way of running ETL/ELT processes, configuration, etc. All of that should be implemented in some other tool. - onETL is deliberately developed without any integration with scheduling software like Apache Airflow. All integrations should be implemented as separated tools. - Only batch operations, no streaming. For streaming prefer [Apache Flink](https://flink.apache.org/). 
-## Requirements { #DBR-onetl-snippet-0-requirements } +## Requirements { #DBR-onetl-include-0-requirements } - **Python** 3.7 - 3.13 - PySpark 2.3.x - 3.5.x (depends on used connector) - Java 8+ (required by Spark, see below) - Kerberos libs & GCC (required by `Hive`, `HDFS` and `SparkHDFS` connectors) -## Supported storages { #DBR-onetl-snippet-0-supported-storages } +## Supported storages { #DBR-onetl-include-0-supported-storages } | Type | Storage | Powered by | |--------------------|--------------|-------------------------------------------------------------------------------------------------------------------------| diff --git a/mddocs/docs/install/files.md b/mddocs/docs/install/files.md index 023e7c8cb..3bc4eaa8e 100644 --- a/mddocs/docs/install/files.md +++ b/mddocs/docs/install/files.md @@ -5,14 +5,14 @@ All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on Each client can be installed explicitly by passing connector name (in lowercase) to `extras`: ```bash -pip install onetl[ftp] # specific connector -pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors +pip install "onetl[ftp]" # specific connector +pip install "onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba]" # multiple connectors ``` To install all file connectors at once you can pass `files` to `extras`: ```bash -pip install onetl[files] +pip install "onetl[files]" ``` **Otherwise class import will fail.** diff --git a/mddocs/docs/install/full.md b/mddocs/docs/install/full.md index ddace90f8..f264a95a5 100644 --- a/mddocs/docs/install/full.md +++ b/mddocs/docs/install/full.md @@ -3,10 +3,10 @@ To install all connectors and dependencies, you can pass `all` into `extras`: ```bash -pip install onetl[all] +pip install "onetl[all]" # this is just the same as -pip install onetl[spark,files,kerberos] +pip install "onetl[spark,files,kerberos]" ``` !!! 
warning diff --git a/mddocs/docs/install/kerberos.md b/mddocs/docs/install/kerberos.md index 57095ca73..537014487 100644 --- a/mddocs/docs/install/kerberos.md +++ b/mddocs/docs/install/kerberos.md @@ -26,5 +26,5 @@ dnf install krb5-devel krb5-libs krb5-workstation gcc # CentOS, OracleLinux Also you should pass `kerberos` to `extras` to install required Python packages: ```bash -pip install onetl[kerberos] +pip install "onetl[kerberos]" ``` diff --git a/mddocs/docs/install/spark.md b/mddocs/docs/install/spark.md index fbd2a43f6..a29f138a7 100644 --- a/mddocs/docs/install/spark.md +++ b/mddocs/docs/install/spark.md @@ -23,7 +23,9 @@ apt-get install openjdk-11-jdk # Debian-based + Spark 3 | [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | | [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | | [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | -| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.1/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | +| [4.0.x](https://spark.apache.org/docs/4.0.1/#downloading) | 3.9 - 3.14 | 17 - 22 | 2.13 | +| [4.1.x](https://spark.apache.org/docs/4.1.1/#downloading) | 3.10 - 3.14 | 17 - 22 | 2.13 | ## Installing PySpark { #DBR-onetl-install-spark-installing-pyspark } @@ -31,13 +33,13 @@ apt-get install openjdk-11-jdk # Debian-based + Spark 3 Then you should install PySpark via passing `spark` to `extras`: ```bash -pip install onetl[spark] # install latest PySpark +pip install "onetl[spark]" # install latest PySpark ``` or install PySpark explicitly: ```bash -pip install onetl pyspark==3.5.5 # install a specific PySpark version +pip install onetl pyspark==3.5.8 # install a specific PySpark version ``` or inject PySpark to `sys.path` in some other way BEFORE creating a class instance. 
diff --git a/mddocs/docs/quickstart.md b/mddocs/docs/quickstart.md index a9ad7ab6e..a54645d67 100644 --- a/mddocs/docs/quickstart.md +++ b/mddocs/docs/quickstart.md @@ -74,27 +74,27 @@ apt-get install openjdk-11-jdk # Debian-based | Spark 3 #### Compatibility matrix { #DBR-onetl-quickstart-compatibility-matrix } -| Spark | Python | Java | Scala | -| --------------------------------------------------------- | ---------- | ---------- | ----- | -| [2.3.x](https://spark.apache.org/docs/2.3.1/#downloading) | 3.7 only | 8 only | 2.11 | -| [2.4.x](https://spark.apache.org/docs/2.4.8/#downloading) | 3.7 only | 8 only | 2.11 | -| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | -| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | -| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | -| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | +| Spark | Python | Java | Scala | +| --------------------------------------------------------- | ----------- | ---------- | ----- | +| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | +| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.5/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | +| [4.0.x](https://spark.apache.org/docs/4.0.1/#downloading) | 3.9 - 3.14 | 17 - 22 | 2.13 | +| [4.1.x](https://spark.apache.org/docs/4.1.1/#downloading) | 3.10 - 3.14 | 17 - 22 | 2.13 | Then you should install PySpark via passing `spark` to `extras`: ```bash -pip install onetl[spark] # install latest PySpark +pip install "onetl[spark]" # install latest PySpark ``` or install PySpark explicitly: ```bash -pip install onetl pyspark==3.5.5 # install a specific PySpark 
version +pip install onetl pyspark==3.5.8 # install a specific PySpark version ``` or inject PySpark to `sys.path` in some other way BEFORE creating a class instance. @@ -109,14 +109,14 @@ All File (but not *FileDF*) connection classes (`FTP`, `SFTP`, `HDFS` and so on) Each client can be installed explicitly by passing connector name (in lowercase) to `extras`: ```bash -pip install onetl[ftp] # specific connector -pip install onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba] # multiple connectors +pip install "onetl[ftp]" # specific connector +pip install "onetl[ftp,ftps,sftp,hdfs,s3,webdav,samba]" # multiple connectors ``` To install all file connectors at once you can pass `files` to `extras`: ```bash -pip install onetl[files] +pip install "onetl[files]" ``` **Otherwise class import will fail.** @@ -151,7 +151,7 @@ dnf install krb5-devel krb5-libs krb5-workstation gcc # CentOS, OracleLinux Also you should pass `kerberos` to `extras` to install required Python packages: ```bash -pip install onetl[kerberos] +pip install "onetl[kerberos]" ``` ### Full bundle { #DBR-onetl-quickstart-full-bundle } @@ -161,10 +161,10 @@ pip install onetl[kerberos] To install all connectors and dependencies, you can pass `all` into `extras`: ```bash -pip install onetl[all] +pip install "onetl[all]" # this is just the same as -pip install onetl[spark,files,kerberos] +pip install "onetl[spark,files,kerberos]" ``` !!! warning @@ -181,7 +181,7 @@ Read data from MSSQL, transform & write to Hive. ```bash # install onETL and PySpark -pip install onetl[spark] +pip install "onetl[spark]" ``` ```python @@ -282,7 +282,7 @@ Download files from SFTP & upload them to HDFS. 
```bash # install onETL with SFTP and HDFS clients, and Kerberos support -pip install onetl[hdfs,sftp,kerberos] +pip install "onetl[hdfs,sftp,kerberos]" ``` ```python @@ -402,7 +402,7 @@ Read files directly from S3 path, convert them to dataframe, transform it and th ```bash # install onETL and PySpark -pip install onetl[spark] +pip install "onetl[spark]" ``` ```python From 42e2fef8d13c7fad74086c7d9d90f98e166c0466 Mon Sep 17 00:00:00 2001 From: sga Date: Thu, 9 Apr 2026 00:18:43 +0300 Subject: [PATCH 27/28] rework contributing guide --- mddocs/docs/contributing.md | 95 +++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 40 deletions(-) diff --git a/mddocs/docs/contributing.md b/mddocs/docs/contributing.md index d6506915e..76e01dd21 100644 --- a/mddocs/docs/contributing.md +++ b/mddocs/docs/contributing.md @@ -5,25 +5,30 @@ reports, improving documentation, submitting feature requests, reviewing new submissions, or contributing code that can be incorporated into the project. -## Limitations { #DBR-onetl-contributing-limitations } +## Review process { #DBR-onetl-contributing-review-process } -We should keep close to these items during development: +For any **significant** changes please create a new GitHub issue and +enhancements that you wish to make. Describe the feature you would like +to see, why you need it, and how it will work. Discuss your ideas +transparently and get community feedback before proceeding. -* Some companies still use old Spark versions, like 3.2.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. -* Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional. -* Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition. 
+Small changes can directly be crafted and submitted to the GitHub +Repository as a Pull Request. This requires creating a **repo fork** using +[instruction](https://docs.github.com/en/get-started/quickstart/fork-a-repo). -## Initial setup for local development { #DBR-onetl-contributing-initial-setup-for-local-development } +## Important notes { #DBR-onetl-contributing-limitations } -### Install Git { #DBR-onetl-contributing-install-git } +Please take into account that: -Please follow [instruction](https://docs.github.com/en/get-started/quickstart/set-up-git). +- Some companies still use old Spark versions, like 3.2.0. So it is required to keep compatibility if possible, e.g. adding branches for different Spark versions. +- Different users uses onETL in different ways - some uses only DB connectors, some only files. Connector-specific dependencies should be optional. +- Instead of creating classes with a lot of different options, prefer splitting them into smaller classes, e.g. options class, context manager, etc, and using composition. -### Create a fork { #DBR-onetl-contributing-create-a-fork } +## Initial setup for local development { #DBR-onetl-contributing-initial-setup-for-local-development } -If you are not a member of a development team building onETL, you should create a fork before making any changes. +### Install Git { #DBR-onetl-contributing-install-git } -Please follow [instruction](https://docs.github.com/en/get-started/quickstart/fork-a-repo). +Please follow [instruction](https://docs.github.com/en/get-started/quickstart/set-up-git). 
### Clone the repo { #DBR-onetl-contributing-clone-the-repo } @@ -35,7 +40,7 @@ git clone git@github.com:myuser/onetl.git -b develop cd onetl ``` -### Setup environment { #DBR-onetl-contributing-setup-environment } +### Enable pre-commit hooks { #DBR-onetl-contributing-enable-pre-commit-hooks } Create virtualenv and install dependencies: @@ -43,8 +48,6 @@ Create virtualenv and install dependencies: make venv-install ``` -### Enable pre-commit hooks { #DBR-onetl-contributing-enable-pre-commit-hooks } - Install pre-commit hooks: ```bash @@ -61,6 +64,18 @@ prek run ### Run tests locally { #DBR-onetl-contributing-run-tests-locally } +!!! note + + You can skip this if only documentation is changed. + +#### Setup environment { #DBR-onetl-contributing-setup-environment } + +Create virtualenv and install dependencies: + +```bash +make venv-install +``` + #### Using docker-compose { #DBR-onetl-contributing-using-docker-compose } Build image for running tests: @@ -179,6 +194,16 @@ docker-compose --profile all down -v ### Build documentation { #DBR-onetl-contributing-build-documentation } +!!! note + + You can skip this if only source code behavior remains the same. + +Create virtualenv and install dependencies: + +```bash +make venv-install +``` + Build documentation using Sphinx: ```bash @@ -188,20 +213,6 @@ make html Then open in browser `docs/_build/index.html`. -## Review process { #DBR-onetl-contributing-review-process } - -Please create a new GitHub issue for any significant changes and -enhancements that you wish to make. Provide the feature you would like -to see, why you need it, and how it will work. Discuss your ideas -transparently and get community feedback before proceeding. - -Significant Changes that you wish to contribute to the project should be -discussed first in a GitHub issue that clearly outlines the changes and -benefits of the feature. - -Small Changes can directly be crafted and submitted to the GitHub -Repository as a Pull Request. 
- ### Create pull request { #DBR-onetl-contributing-create-pull-request } Commit your changes: @@ -229,7 +240,7 @@ enough but feel free to add as many details as you feel necessary for the users to understand what it means. **Use the past tense** for the text in your fragment because, -combined with others, it will be a part of the “news digest” +combined with others, it will be a part of the "news digest" telling the readers **what changed** in a specific version of the library *since the previous version*. @@ -289,53 +300,57 @@ Just add `ci:skip-changelog` label to pull request. #### Release Process { #DBR-onetl-contributing-release-process } +!!! note + + This is for repo maintainers only + Before making a release from the `develop` branch, follow these steps: -1. Checkout to `develop` branch and update it to the actual state +0. Checkout to `develop` branch and update it to the actual state ```bash git checkout develop git pull -p ``` -2. Backup `NEXT_RELEASE.rst` +1. Backup `NEXT_RELEASE.rst` ```bash cp "docs/changelog/NEXT_RELEASE.rst" "docs/changelog/temp_NEXT_RELEASE.rst" ``` -3. Build the Release notes with Towncrier +2. Build the Release notes with Towncrier ```bash VERSION=$(cat onetl/VERSION) towncrier build "--version=${VERSION}" --yes ``` -4. Change file with changelog to release version number +3. Change file with changelog to release version number ```bash mv docs/changelog/NEXT_RELEASE.rst "docs/changelog/${VERSION}.rst" ``` -5. Remove content above the version number heading in the `${VERSION}.rst` file +4. Remove content above the version number heading in the `${VERSION}.rst` file ```bash awk '!/^.*towncrier release notes start/' "docs/changelog/${VERSION}.rst" > temp && mv temp "docs/changelog/${VERSION}.rst" ``` -6. Update Changelog Index +5. Update Changelog Index ```bash awk -v version=${VERSION} '/DRAFT/{print;print " " version;next}1' docs/changelog/index.rst > temp && mv temp docs/changelog/index.rst ``` -7. 
Restore `NEXT_RELEASE.rst` file from backup +6. Restore `NEXT_RELEASE.rst` file from backup ```bash mv "docs/changelog/temp_NEXT_RELEASE.rst" "docs/changelog/NEXT_RELEASE.rst" ``` -8. Commit and push changes to `develop` branch +7. Commit and push changes to `develop` branch ```bash git add . @@ -343,7 +358,7 @@ git commit -m "Prepare for release ${VERSION}" git push ``` -9. Merge `develop` branch to `master`, **WITHOUT** squashing +8. Merge `develop` branch to `master`, **WITHOUT** squashing ```bash git checkout master @@ -352,14 +367,14 @@ git merge develop git push ``` -10. Add git tag to the latest commit in `master` branch +9. Add git tag to the latest commit in `master` branch ```bash git tag "$VERSION" git push origin "$VERSION" ``` -11. Update version in `develop` branch **after release**: +10. Update version in `develop` branch **after release**: ```bash git checkout develop From 730013d6b0ba3f3773e5ebf30c6ff05e74259518 Mon Sep 17 00:00:00 2001 From: sga Date: Thu, 9 Apr 2026 10:40:53 +0300 Subject: [PATCH 28/28] rewove spark 2 from spark install docs --- mddocs/docs/install/spark.md | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/mddocs/docs/install/spark.md b/mddocs/docs/install/spark.md index a29f138a7..bf0d8dcf7 100644 --- a/mddocs/docs/install/spark.md +++ b/mddocs/docs/install/spark.md @@ -16,16 +16,14 @@ apt-get install openjdk-11-jdk # Debian-based + Spark 3 ### Compatibility matrix { #DBR-onetl-install-spark-compatibility-matrix } -| Spark | Python | Java | Scala | -|-----------------------------------------------------------|------------|------------|---------| -| [2.3.x](https://spark.apache.org/docs/2.3.1/#downloading) | 3.7 only | 8 only | 2.11 | -| [2.4.x](https://spark.apache.org/docs/2.4.8/#downloading) | 3.7 only | 8 only | 2.11 | -| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | -| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 
8u201 - 17 | 2.12 | -| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | -| [3.5.x](https://spark.apache.org/docs/3.5.1/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | -| [4.0.x](https://spark.apache.org/docs/4.0.1/#downloading) | 3.9 - 3.14 | 17 - 22 | 2.13 | -| [4.1.x](https://spark.apache.org/docs/4.1.1/#downloading) | 3.10 - 3.14 | 17 - 22 | 2.13 | +| Spark | Python | Java | Scala | +|-----------------------------------------------------------|-------------|------------|---------| +| [3.2.x](https://spark.apache.org/docs/3.2.4/#downloading) | 3.7 - 3.10 | 8u201 - 11 | 2.12 | +| [3.3.x](https://spark.apache.org/docs/3.3.4/#downloading) | 3.7 - 3.12 | 8u201 - 17 | 2.12 | +| [3.4.x](https://spark.apache.org/docs/3.4.4/#downloading) | 3.7 - 3.12 | 8u362 - 20 | 2.12 | +| [3.5.x](https://spark.apache.org/docs/3.5.1/#downloading) | 3.8 - 3.13 | 8u371 - 20 | 2.12 | +| [4.0.x](https://spark.apache.org/docs/4.0.1/#downloading) | 3.9 - 3.14 | 17 - 22 | 2.13 | +| [4.1.x](https://spark.apache.org/docs/4.1.1/#downloading) | 3.10 - 3.14 | 17 - 22 | 2.13 | ## Installing PySpark { #DBR-onetl-install-spark-installing-pyspark }