Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clickhouse: don't flatten nested fields on restore #147

Merged
merged 1 commit into the main branch
Oct 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions astacus/coordinator/plugins/clickhouse/steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,16 +481,22 @@ async def run_step(self, cluster: Cluster, context: StepsContext) -> None:
)
# Tables creation is not parallelized with gather since they can depend on each other
# (although, we could use graphlib more subtly and parallelize what we can).

# If any table was initially created with custom global settings,
# we need to re-enable these custom global settings when creating the table again.
# We can enable these settings unconditionally because they are harmless
# for tables not needing them.
session_id = secrets.token_hex()
for query in [
# If any table was initially created with custom global settings,
# we need to re-enable these custom global settings when creating the table again.
# We can enable these settings unconditionally because they are harmless
# for tables not needing them.
b"SET allow_experimental_geo_types=true",
b"SET allow_experimental_object_type=true",
b"SET allow_suspicious_low_cardinality_types=true",
# If a table was created with flatten_nested=0, we must be careful to not re-create the
# table with flatten_nested=1, since this would recreate the table with a different schema.
# If a table was created with flatten_nested=1, the query in system.tables.create_table_query
# shows the flattened fields, not the original user query with nested fields. This means that
# when re-executing the query, we don't need to re-flatten the fields.
# In both cases, flatten_nested=0 is the right choice when restoring a backup.
b"SET flatten_nested=0",
]:
try:
await self.clients[0].execute(query, session_id=session_id)
Expand Down
43 changes: 43 additions & 0 deletions tests/integration/coordinator/plugins/clickhouse/test_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,31 @@ async def setup_cluster_content(clients: List[HttpClickHouseClient], use_named_c
b"SETTINGS index_granularity=8192 "
b"SETTINGS allow_experimental_object_type=1"
)
# test that we correctly restore nested fields
await clients[0].execute(
b"CREATE TABLE default.nested_not_flatten (thekey UInt32, thedata Nested(a UInt32, b UInt32)) "
b"ENGINE = ReplicatedMergeTree ORDER BY (thekey) "
b"SETTINGS index_granularity=8192 "
b"SETTINGS flatten_nested=0"
)
await clients[0].execute(
b"CREATE TABLE default.nested_flatten (thekey UInt32, thedata Nested(a UInt32, b UInt32)) "
b"ENGINE = ReplicatedMergeTree ORDER BY (thekey) "
b"SETTINGS index_granularity=8192 "
b"SETTINGS flatten_nested=1"
)
await clients[0].execute(
b"CREATE TABLE default.array_tuple_not_flatten (thekey UInt32, thedata Array(Tuple(a UInt32, b UInt32))) "
b"ENGINE = ReplicatedMergeTree ORDER BY (thekey) "
b"SETTINGS index_granularity=8192 "
b"SETTINGS flatten_nested=0"
)
await clients[0].execute(
b"CREATE TABLE default.array_tuple_flatten (thekey UInt32, thedata Array(Tuple(a UInt32, b UInt32))) "
b"ENGINE = ReplicatedMergeTree ORDER BY (thekey) "
b"SETTINGS index_granularity=8192 "
b"SETTINGS flatten_nested=1"
)
# add a table with data in object storage
await clients[0].execute(
b"CREATE TABLE default.in_object_storage (thekey UInt32, thedata String) "
Expand All @@ -170,6 +195,11 @@ async def setup_cluster_content(clients: List[HttpClickHouseClient], use_named_c
await clients[0].execute(b"INSERT INTO default.with_experimental_types VALUES (123, '{\"a\":1}')")
await clients[1].execute(b"INSERT INTO default.with_experimental_types VALUES (456, '{\"b\":2}')")
await clients[2].execute(b"INSERT INTO default.with_experimental_types VALUES (789, '{\"c\":3}')")
# Sample data for nested fields
await clients[0].execute(b"INSERT INTO default.nested_not_flatten VALUES (123, [(4, 5)])")
await clients[0].execute(b"INSERT INTO default.nested_flatten VALUES (123, [4], [5])")
await clients[0].execute(b"INSERT INTO default.array_tuple_not_flatten VALUES (123, [(4, 5)])")
await clients[0].execute(b"INSERT INTO default.array_tuple_flatten VALUES (123, [4], [5])")
# And some object storage data
await clients[0].execute(b"INSERT INTO default.in_object_storage VALUES (123, 'foo')")
await clients[1].execute(b"INSERT INTO default.in_object_storage VALUES (456, 'bar')")
Expand Down Expand Up @@ -235,6 +265,19 @@ async def test_restores_table_with_experimental_types(restored_cluster: List[Cli
assert response == expected_data


@pytest.mark.asyncio
async def test_restores_table_with_nested_fields(restored_cluster: List[ClickHouseClient]) -> None:
    """Check that Nested and Array(Tuple) columns survive a restore for both flatten_nested settings.

    Tables created with flatten_nested=0 keep their nested (map-like) representation,
    while tables created with flatten_nested=1 expose the flattened per-field arrays.
    """
    client = restored_cluster[0]
    # (query, expected rows) pairs: non-flattened tables return a list of dicts,
    # flattened tables return one parallel array per nested field.
    expectations = [
        (b"SELECT thekey, thedata FROM default.nested_not_flatten ORDER BY thekey",
         [[123, [{"a": 4, "b": 5}]]]),
        (b"SELECT thekey, thedata.a, thedata.b FROM default.nested_flatten ORDER BY thekey",
         [[123, [4], [5]]]),
        (b"SELECT thekey, thedata FROM default.array_tuple_not_flatten ORDER BY thekey",
         [[123, [{"a": 4, "b": 5}]]]),
        (b"SELECT thekey, thedata.a, thedata.b FROM default.array_tuple_flatten ORDER BY thekey",
         [[123, [4], [5]]]),
    ]
    for query, expected_rows in expectations:
        assert await client.execute(query) == expected_rows


async def check_object_storage_data(cluster: List[ClickHouseClient]) -> None:
s1_data = [[123, "foo"], [456, "bar"]]
s2_data = [[789, "baz"]]
Expand Down
3 changes: 3 additions & 0 deletions tests/unit/coordinator/plugins/clickhouse/test_steps.py
Original file line number Diff line number Diff line change
Expand Up @@ -790,6 +790,7 @@ async def test_creates_all_replicated_databases_and_tables_in_manifest() -> None
b"SET allow_experimental_geo_types=true",
b"SET allow_experimental_object_type=true",
b"SET allow_suspicious_low_cardinality_types=true",
b"SET flatten_nested=0",
b"CREATE TABLE db-one.table-uno ...",
b"CREATE TABLE db-one.table-dos ...",
b"CREATE TABLE db-two.table-eins ...",
Expand Down Expand Up @@ -846,6 +847,7 @@ async def test_creates_all_replicated_databases_and_tables_in_manifest_with_cust
b"SET allow_experimental_geo_types=true",
b"SET allow_experimental_object_type=true",
b"SET allow_suspicious_low_cardinality_types=true",
b"SET flatten_nested=0",
b"CREATE TABLE db-one.table-uno ...",
b"CREATE TABLE db-one.table-dos ...",
b"CREATE TABLE db-two.table-eins ...",
Expand Down Expand Up @@ -894,6 +896,7 @@ async def test_drops_each_database_on_all_servers_before_recreating_it() -> None
b"SET allow_experimental_geo_types=true",
b"SET allow_experimental_object_type=true",
b"SET allow_suspicious_low_cardinality_types=true",
b"SET flatten_nested=0",
b"CREATE TABLE db-one.table-uno ...",
b"CREATE TABLE db-one.table-dos ...",
b"CREATE TABLE db-two.table-eins ...",
Expand Down
Loading