diff --git a/ch_backup/logic/table.py b/ch_backup/logic/table.py index 3eeb08fa..7fec58b5 100644 --- a/ch_backup/logic/table.py +++ b/ch_backup/logic/table.py @@ -235,6 +235,13 @@ def _load_create_statement_from_disk(table: Table) -> Optional[str]: return None try: return Path(table.metadata_path).read_text("utf-8") + except UnicodeDecodeError: + logging.warning( + 'Table "{}"."{}": metadata contains non-UTF-8 bytes, using latin-1 fallback', + table.database, + table.name, + ) + return Path(table.metadata_path).read_text("latin-1") except OSError as e: logging.debug( 'Cannot load a create statement of the table "{}"."{}": {}', diff --git a/ch_backup/storage/loader.py b/ch_backup/storage/loader.py index 2aab49d5..2846b7da 100644 --- a/ch_backup/storage/loader.py +++ b/ch_backup/storage/loader.py @@ -126,9 +126,7 @@ def upload_files_tarball( ) return remote_path - def download_data( - self, remote_path, is_async=False, encryption=False, encoding="utf-8" - ): + def download_data(self, remote_path, is_async=False, encryption=False): """ Download file from storage and return its content. 
@@ -139,7 +137,10 @@ def download_data( data = self._ploader.download_data( remote_path, is_async=is_async, encryption=encryption ) - return data.decode(encoding) if encoding else data + try: + return data.decode("utf-8") + except UnicodeDecodeError: + return data.decode("latin-1") def download_file( self, diff --git a/tests/integration/features/schema_encoding_compatibility.feature b/tests/integration/features/schema_encoding_compatibility.feature new file mode 100644 index 00000000..7a7f6cc3 --- /dev/null +++ b/tests/integration/features/schema_encoding_compatibility.feature @@ -0,0 +1,63 @@ +Feature: Non-UTF-8 schema encoding support + + Background: + Given default configuration + And a working s3 + And a working zookeeper on zookeeper01 + And a working clickhouse on clickhouse01 + And a working clickhouse on clickhouse02 + + Scenario: Backup and restore multiple tables with correct utf-8 encodings + Given we have executed queries on clickhouse01 + """ + CREATE DATABASE test_db; + + CREATE TABLE test_db.table_ascii ( + id Int32, + name_ascii String COMMENT 'ascii' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_emoji ( + id Int32, + `name_😈` String COMMENT '😈' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_cyrillic ( + id Int32, + `name_абвгд` String COMMENT 'абвгд' + ) ENGINE = MergeTree() ORDER BY id; + + CREATE TABLE test_db.table_chinese ( + id Int32, + `name_试` String COMMENT '试' + ) ENGINE = MergeTree() ORDER BY id; + + INSERT INTO test_db.table_ascii VALUES (1, 'test1'); + INSERT INTO test_db.table_emoji VALUES (2, 'test2'); + INSERT INTO test_db.table_cyrillic VALUES (3, 'test3'); + INSERT INTO test_db.table_chinese VALUES (4, 'test4'); + """ + When we create clickhouse01 clickhouse backup + Then we got the following backups on clickhouse01 + | num | state | data_count | link_count | + | 0 | created | 4 | 0 | + When we restore clickhouse backup #0 to clickhouse02 + Then clickhouse02 has same schema as 
clickhouse01 + And we got same clickhouse data at clickhouse01 clickhouse02 + + Scenario: Table with invalid utf-8 characters + Given we have created non-UTF-8 test table on clickhouse01 + When we create clickhouse01 clickhouse backup + Then we got the following backups on clickhouse01 + | num | state | data_count | link_count | + | 0 | created | 1 | 0 | + When we restore clickhouse backup #0 to clickhouse02 + When we execute query on clickhouse02 + """ + EXISTS TABLE test_db.table_rus + """ + Then we get response + """ + 1 + """ + diff --git a/tests/integration/modules/clickhouse.py b/tests/integration/modules/clickhouse.py index 33ad482e..eb4fbb8f 100644 --- a/tests/integration/modules/clickhouse.py +++ b/tests/integration/modules/clickhouse.py @@ -67,11 +67,14 @@ def ping(self) -> None: """ self._query("GET", url="ping") - def execute(self, query: str) -> None: + def execute(self, query: Union[str, bytes]) -> None: """ Execute arbitrary query. """ - self._query("POST", query=query) + if isinstance(query, str): + self._query("POST", query=query) + return + self._query("POST", data=query) def get_response(self, query: str) -> str: """ diff --git a/tests/integration/steps/clickhouse.py b/tests/integration/steps/clickhouse.py index 34379e6e..71140694 100644 --- a/tests/integration/steps/clickhouse.py +++ b/tests/integration/steps/clickhouse.py @@ -349,3 +349,26 @@ def step_create_multiple_tables(context, table_count, node): for i in range(table_count): table_schema = schema_template.format(table_number=i) ch_client.execute(table_schema) + + +@given("we have created non-UTF-8 test table on {node}") +def create_non_utf8_table(context, node): + """ + Create table with invalid utf-8 for testing latin-1 fallback + """ + ch_client = ClickhouseClient(context, node) + ch_client.execute("CREATE DATABASE IF NOT EXISTS test_db") + + query = b""" + CREATE TABLE test_db.table_rus ( + EventDate DateTime, + CounterID UInt32, + `\xcf\xf0\xe8\xe2\xe5\xf2` UInt32 + ) + ENGINE = 
MergeTree()
+    PARTITION BY CounterID % 10
+    ORDER BY (CounterID, EventDate)
+    """
+
+    ch_client.execute(query)
+    ch_client.execute("INSERT INTO test_db.table_rus VALUES (toDateTime('2006-01-17 10:03:00'), 2, 3)")