From 75a4b6ee868cc109d7f40b87e6fae7b0429b95b4 Mon Sep 17 00:00:00 2001 From: dimbo4ka Date: Mon, 2 Feb 2026 16:06:22 +0000 Subject: [PATCH 1/5] fix: add latin-1 fallback for non-UTF-8 table metadata --- ch_backup/logic/table.py | 7 +++++++ ch_backup/storage/loader.py | 7 ++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ch_backup/logic/table.py b/ch_backup/logic/table.py index 3eeb08fa..7fec58b5 100644 --- a/ch_backup/logic/table.py +++ b/ch_backup/logic/table.py @@ -235,6 +235,13 @@ def _load_create_statement_from_disk(table: Table) -> Optional[str]: return None try: return Path(table.metadata_path).read_text("utf-8") + except UnicodeDecodeError: + logging.warning( + 'Table "{}"."{}": metadata contains non-UTF-8 bytes, using latin-1 fallback', + table.database, + table.name, + ) + return Path(table.metadata_path).read_text("latin-1") except OSError as e: logging.debug( 'Cannot load a create statement of the table "{}"."{}": {}', diff --git a/ch_backup/storage/loader.py b/ch_backup/storage/loader.py index 2aab49d5..83ed4183 100644 --- a/ch_backup/storage/loader.py +++ b/ch_backup/storage/loader.py @@ -139,7 +139,12 @@ def download_data( data = self._ploader.download_data( remote_path, is_async=is_async, encryption=encryption ) - return data.decode(encoding) if encoding else data + if not encoding: + return data + try: + return data.decode("utf-8") + except UnicodeDecodeError: + return data.decode("latin-1") def download_file( self, From 57f4f67a42f5adfdd0f5613334e8f5697bd0b1ae Mon Sep 17 00:00:00 2001 From: dimbo4ka Date: Mon, 2 Feb 2026 16:07:53 +0000 Subject: [PATCH 2/5] test: add integration test for non-UTF-8 metadata handling --- .../schema_encoding_compatibility.feature | 63 +++++++++++++++++++ tests/integration/modules/clickhouse.py | 6 ++ tests/integration/steps/clickhouse.py | 19 ++++++ 3 files changed, 88 insertions(+) create mode 100644 tests/integration/features/schema_encoding_compatibility.feature diff --git 
a/tests/integration/features/schema_encoding_compatibility.feature b/tests/integration/features/schema_encoding_compatibility.feature new file mode 100644 index 00000000..226447af --- /dev/null +++ b/tests/integration/features/schema_encoding_compatibility.feature @@ -0,0 +1,63 @@ +Feature: Non-UTF-8 schema encoding support + + Background: + Given default configuration + And a working s3 + And a working zookeeper on zookeeper01 + And a working clickhouse on clickhouse01 + And a working clickhouse on clickhouse02 + + Scenario: Backup and restore multiple tables with correct utf-8 encodings + Given we have executed queries on clickhouse01 + """ + CREATE DATABASE test_db; + + CREATE TABLE test_db.table_ascii ( + id Int32, + name_ascii String COMMENT 'ascii' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_emoji ( + id Int32, + `name_😈` String COMMENT '😈' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_cyrillic ( + id Int32, + `name_абвгд` String COMMENT 'абвгд' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_chinese ( + id Int32, + `name_试` String COMMENT '试' + ) ENGINE = MergeTree() ORDER BY id; + + INSERT INTO test_db.table_ascii VALUES (1, 'test1'); + INSERT INTO test_db.table_emoji VALUES (2, 'test2'); + INSERT INTO test_db.table_cyrillic VALUES (3, 'test3'); + INSERT INTO test_db.table_chinese VALUES (4, 'test4'); + """ + When we create clickhouse01 clickhouse backup + Then we got the following backups on clickhouse01 + | num | state | data_count | link_count | + | 0 | created | 4 | 0 | + When we restore clickhouse backup #0 to clickhouse02 + Then clickhouse02 has same schema as clickhouse01 + And we got same clickhouse data at clickhouse01 clickhouse02 + + Scenario: Table with invalid utf-8 characters + Given we have created non-UTF-8 test table on clickhouse01 + When we create clickhouse01 clickhouse backup + Then we got the following backups on clickhouse01 + | num | state | data_count | link_count 
| + | 0 | created | 1 | 0 | + When we restore clickhouse backup #0 to clickhouse02 + When we execute query on clickhouse02 + """ + EXISTS TABLE test_db.non_utf8_test + """ + Then we get response + """ + 1 + """ + diff --git a/tests/integration/modules/clickhouse.py b/tests/integration/modules/clickhouse.py index 33ad482e..0a44f1a2 100644 --- a/tests/integration/modules/clickhouse.py +++ b/tests/integration/modules/clickhouse.py @@ -73,6 +73,12 @@ def execute(self, query: str) -> None: """ self._query("POST", query=query) + def execute_raw(self, query_bytes: bytes) -> None: + """ + Execute query from raw bytes. + """ + self._query("POST", data=query_bytes) + def get_response(self, query: str) -> str: """ Execute arbitrary query and return result diff --git a/tests/integration/steps/clickhouse.py b/tests/integration/steps/clickhouse.py index 34379e6e..ae688ece 100644 --- a/tests/integration/steps/clickhouse.py +++ b/tests/integration/steps/clickhouse.py @@ -349,3 +349,22 @@ def step_create_multiple_tables(context, table_count, node): for i in range(table_count): table_schema = schema_template.format(table_number=i) ch_client.execute(table_schema) + + +@given("we have created non-UTF-8 test table on {node}") +def create_non_utf8_table(context, node): + """ + Create table with invalid utf-8 for testing latin-1 fallback + """ + ch_client = ClickhouseClient(context, node) + ch_client.execute("CREATE DATABASE IF NOT EXISTS test_db") + + query = ( + "CREATE TABLE test_db.non_utf8_test " + "(id Int32, `тест` String) " + "ENGINE = MergeTree() " + "ORDER BY id;" + ) + + ch_client.execute_raw(query.encode("cp1251")) + ch_client.execute("INSERT INTO test_db.non_utf8_test VALUES (1, 'test')") From ebe55b8607be4210df6d27b2e1af86cf3f20739b Mon Sep 17 00:00:00 2001 From: dimbo4ka Date: Mon, 2 Feb 2026 16:42:08 +0000 Subject: [PATCH 3/5] refactor: improve download_data function structure --- ch_backup/storage/loader.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff 
--git a/ch_backup/storage/loader.py b/ch_backup/storage/loader.py index 83ed4183..2846b7da 100644 --- a/ch_backup/storage/loader.py +++ b/ch_backup/storage/loader.py @@ -126,9 +126,7 @@ def upload_files_tarball( ) return remote_path - def download_data( - self, remote_path, is_async=False, encryption=False, encoding="utf-8" - ): + def download_data(self, remote_path, is_async=False, encryption=False): """ Download file from storage and return its content. @@ -139,8 +137,6 @@ def download_data( data = self._ploader.download_data( remote_path, is_async=is_async, encryption=encryption ) - if not encoding: - return data try: return data.decode("utf-8") except UnicodeDecodeError: From 81f5f127cd89680a3177c1cd9baa443ec76e2f70 Mon Sep 17 00:00:00 2001 From: dimbo4ka Date: Tue, 3 Feb 2026 11:20:25 +0000 Subject: [PATCH 4/5] refactor: move logic from execute_raw into execute --- tests/integration/modules/clickhouse.py | 13 +++++-------- tests/integration/steps/clickhouse.py | 2 +- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/tests/integration/modules/clickhouse.py b/tests/integration/modules/clickhouse.py index 0a44f1a2..eb4fbb8f 100644 --- a/tests/integration/modules/clickhouse.py +++ b/tests/integration/modules/clickhouse.py @@ -67,17 +67,14 @@ def ping(self) -> None: """ self._query("GET", url="ping") - def execute(self, query: str) -> None: + def execute(self, query: Union[str, bytes]) -> None: """ Execute arbitrary query. """ - self._query("POST", query=query) - - def execute_raw(self, query_bytes: bytes) -> None: - """ - Execute query from raw bytes. 
- """ - self._query("POST", data=query_bytes) + if isinstance(query, str): + self._query("POST", query=query) + return + self._query("POST", data=query) def get_response(self, query: str) -> str: """ diff --git a/tests/integration/steps/clickhouse.py b/tests/integration/steps/clickhouse.py index ae688ece..23e1f1cc 100644 --- a/tests/integration/steps/clickhouse.py +++ b/tests/integration/steps/clickhouse.py @@ -366,5 +366,5 @@ def create_non_utf8_table(context, node): "ORDER BY id;" ) - ch_client.execute_raw(query.encode("cp1251")) + ch_client.execute(query.encode("cp1251")) ch_client.execute("INSERT INTO test_db.non_utf8_test VALUES (1, 'test')") From ef173098026216b513e2bcf9f60f59280256fc0c Mon Sep 17 00:00:00 2001 From: dimbo4ka Date: Tue, 3 Feb 2026 11:56:48 +0000 Subject: [PATCH 5/5] tests: refactor step --- .../schema_encoding_compatibility.feature | 2 +- tests/integration/steps/clickhouse.py | 20 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/tests/integration/features/schema_encoding_compatibility.feature b/tests/integration/features/schema_encoding_compatibility.feature index 226447af..7a7f6cc3 100644 --- a/tests/integration/features/schema_encoding_compatibility.feature +++ b/tests/integration/features/schema_encoding_compatibility.feature @@ -54,7 +54,7 @@ Feature: Non-UTF-8 schema encoding support When we restore clickhouse backup #0 to clickhouse02 When we execute query on clickhouse02 """ - EXISTS TABLE test_db.non_utf8_test + EXISTS TABLE test_db.table_rus """ Then we get response """ diff --git a/tests/integration/steps/clickhouse.py b/tests/integration/steps/clickhouse.py index 23e1f1cc..71140694 100644 --- a/tests/integration/steps/clickhouse.py +++ b/tests/integration/steps/clickhouse.py @@ -359,12 +359,16 @@ def create_non_utf8_table(context, node): ch_client = ClickhouseClient(context, node) ch_client.execute("CREATE DATABASE IF NOT EXISTS test_db") - query = ( - "CREATE TABLE test_db.non_utf8_test " - "(id 
Int32, `тест` String) " - "ENGINE = MergeTree() " - "ORDER BY id;" - ) + query = b""" + CREATE TABLE test_db.table_rus ( + EventDate DateTime, + CounterID UInt32, + `\xcf\xf0\xe8\xe2\xe5\xf2` UInt32 + ) + ENGINE = MergeTree() + PARTITION BY CounterID % 10 + ORDER BY (CounterID, EventDate) + """ - ch_client.execute(query.encode("cp1251")) - ch_client.execute("INSERT INTO test_db.non_utf8_test VALUES (1, 'test')") + ch_client.execute(query) + ch_client.execute("INSERT INTO test_db.table_rus VALUES (toDateTime('17.01.2006 10:03:00'), 2, 3)")