From 23a6df038681464888fbc610f7640e6ebe4eccde Mon Sep 17 00:00:00 2001 From: Thirunarayanan Balathandayuthapani Date: Fri, 13 Dec 2024 22:14:41 +0530 Subject: [PATCH] MDEV-34898 Doublewrite recovery of innodb_checksum_algorithm=full_crc32 encrypted pages does not work - InnoDB fails to recover the full crc32 encrypted page from doublewrite buffer. The reason is that buf_dblwr_t::recover() fails to identify the space id from the page because the page has been encrypted from FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION bytes. Fix: === buf_dblwr_t::recover(): preserve any pages whose space_id does not match a known tablespace. These could be encrypted pages of tablespaces that had been created with innodb_checksum_algorithm=full_crc32. buf_page_t::read_complete(): If the page looks corrupted and the tablespace is encrypted and in full_crc32 format, try to restore the page from doublewrite buffer. recv_dblwr_t::recover_encrypted_page(): Find the page which has the same page number and try to decrypt the page using space->crypt_data. After decryption, compare the space id. Write the recovered page back to the file. --- .../encryption/r/debug_key_management.result | 1 + .../encryption/r/doublewrite_debug.result | 47 ++++++++ .../encryption/t/debug_key_management.test | 1 + .../suite/encryption/t/doublewrite_debug.opt | 3 + .../suite/encryption/t/doublewrite_debug.test | 105 ++++++++++++++++++ storage/innobase/buf/buf0buf.cc | 11 ++ storage/innobase/buf/buf0dblwr.cc | 13 ++- storage/innobase/include/log0recv.h | 16 ++- storage/innobase/log/log0recv.cc | 42 +++++++ storage/innobase/srv/srv0start.cc | 9 +- 10 files changed, 239 insertions(+), 9 deletions(-) create mode 100644 mysql-test/suite/encryption/r/doublewrite_debug.result create mode 100644 mysql-test/suite/encryption/t/doublewrite_debug.opt create mode 100644 mysql-test/suite/encryption/t/doublewrite_debug.test diff --git a/mysql-test/suite/encryption/r/debug_key_management.result b/mysql-test/suite/encryption/r/debug_key_management.result index 7fe681d52ae64..97a999e513b76 100644 --- a/mysql-test/suite/encryption/r/debug_key_management.result +++ b/mysql-test/suite/encryption/r/debug_key_management.result @@ -1,3 +1,4 @@ +call mtr.add_suppression("InnoDB: Encrypted page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\] in file .*"); create table t1(a serial) engine=innoDB; set global innodb_encrypt_tables=ON; show variables like 'innodb_encrypt%'; diff --git a/mysql-test/suite/encryption/r/doublewrite_debug.result b/mysql-test/suite/encryption/r/doublewrite_debug.result new file mode 100644 index 0000000000000..73659bdb61375 --- /dev/null +++ b/mysql-test/suite/encryption/r/doublewrite_debug.result @@ -0,0 +1,47 @@ +call mtr.add_suppression("InnoDB: Encrypted page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\] in file .*"); +create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; +create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; +start transaction; +insert into t1 values(1, repeat('#',12)); +insert into t1 values(2, repeat('+',12)); +insert into t1 values(3, repeat('/',12)); +insert into t1 values(4, repeat('-',12)); +insert into t1 values(5, repeat('.',12)); +insert into t2 select * from t1; +commit work; +SET GLOBAL innodb_fast_shutdown = 0; +# restart: --debug_dbug=+d,ib_log_checkpoint_avoid_hard --innodb_flush_sync=0 +select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name="test/t1"; +select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name="test/t2"; +begin; +insert into t1 values (6, repeat('%', 400)); +insert into t2 values (6, repeat('%', 400)); +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t1_space_id; +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t2_space_id; +set global innodb_buf_flush_list_now = 1; +# Kill the server +# restart +FOUND 2 /InnoDB: Recovered page \[page id: space=[1-9]*, page number=3\]/ in mysqld.1.err +check table t1; +Table Op Msg_type Msg_text +test.t1 check status OK +check table t2; +Table Op Msg_type Msg_text +test.t2 check status OK +select f1, f2 from t1; +f1 f2 +1 ############ +2 ++++++++++++ +3 //////////// +4 ------------ +5 ............ +select f1, f2 from t2; +f1 f2 +1 ############ +2 ++++++++++++ +3 //////////// +4 ------------ +5 ............ +drop table t2, t1; diff --git a/mysql-test/suite/encryption/t/debug_key_management.test b/mysql-test/suite/encryption/t/debug_key_management.test index 45a93040d7313..f644d06ca8cbd 100644 --- a/mysql-test/suite/encryption/t/debug_key_management.test +++ b/mysql-test/suite/encryption/t/debug_key_management.test @@ -3,6 +3,7 @@ -- source include/innodb_undo_tablespaces.inc -- source include/not_embedded.inc +call mtr.add_suppression("InnoDB: Encrypted page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\] in file .*"); if (`select count(*) = 0 from information_schema.plugins where plugin_name = 'debug_key_management' and plugin_status='active'`) { diff --git a/mysql-test/suite/encryption/t/doublewrite_debug.opt b/mysql-test/suite/encryption/t/doublewrite_debug.opt new file mode 100644 index 0000000000000..ed86654270b4b --- /dev/null +++ b/mysql-test/suite/encryption/t/doublewrite_debug.opt @@ -0,0 +1,3 @@ +--innodb-use-atomic-writes=0 +--innodb-encrypt-tables=FORCE +--innodb_sys_tablespaces diff --git a/mysql-test/suite/encryption/t/doublewrite_debug.test b/mysql-test/suite/encryption/t/doublewrite_debug.test new file mode 100644 index 0000000000000..74cd2eccfc7ff --- /dev/null +++ b/mysql-test/suite/encryption/t/doublewrite_debug.test @@ -0,0 +1,105 @@ +--source include/have_innodb.inc +--source include/have_debug.inc +--source include/not_embedded.inc +--source include/have_example_key_management_plugin.inc +call mtr.add_suppression("InnoDB: Encrypted page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\] in file .*"); + +let INNODB_PAGE_SIZE=`select @@innodb_page_size`; +let MYSQLD_DATADIR=`select @@datadir`; +let ALGO=`select @@innodb_checksum_algorithm`; + +create table t1 (f1 int primary key, f2 blob)page_compressed = 1 engine=innodb stats_persistent=0; +create table t2(f1 int primary key, f2 blob)engine=innodb stats_persistent=0; + +start transaction; +insert into t1 values(1, repeat('#',12)); +insert into t1 values(2, repeat('+',12)); +insert into t1 values(3, repeat('/',12)); +insert into t1 values(4, repeat('-',12)); +insert into t1 values(5, repeat('.',12)); +insert into t2 select * from t1; +commit work; + +# Slow shutdown and restart to make sure ibuf merge is finished +SET GLOBAL innodb_fast_shutdown = 0; +let $shutdown_timeout=; +let $restart_parameters=--debug_dbug=+d,ib_log_checkpoint_avoid_hard --innodb_flush_sync=0; +--source include/restart_mysqld.inc +--source ../../suite/innodb/include/no_checkpoint_start.inc + +select space into @t1_space_id from information_schema.innodb_sys_tablespaces where name="test/t1"; +select space into @t2_space_id from information_schema.innodb_sys_tablespaces where name="test/t2"; + +begin; +insert into t1 values (6, repeat('%', 400)); +insert into t2 values (6, repeat('%', 400)); + +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t1_space_id; + +set global innodb_saved_page_number_debug = 3; +set global innodb_fil_make_page_dirty_debug = @t2_space_id; + +set global innodb_buf_flush_list_now = 1; +--let CLEANUP_IF_CHECKPOINT=drop table t1, t2, unexpected_checkpoint; +--source ../../suite/innodb/include/no_checkpoint_end.inc + +# Corrupt the page 3 in t1.ibd, t2.ibd file +perl; +use IO::Handle; +do "$ENV{MTR_SUITE_DIR}/include/crc32.pl"; +my $polynomial = 0x82f63b78; # CRC-32C +my $algo = $ENV{ALGO}; +die "Unsupported innodb_checksum_algorithm=$algo\n" unless $algo =~ /crc32/; + +my $fname= "$ENV{'MYSQLD_DATADIR'}test/t1.ibd"; +my $page_size = $ENV{INNODB_PAGE_SIZE}; +my $page; +do "$ENV{MTR_SUITE_DIR}/../innodb/include/crc32.pl"; +open(FILE, "+<", $fname) or die; +sysseek(FILE, 3*$page_size, 0); +sysread(FILE, $page, $page_size)==$page_size||die "Unable to read $name\n"; +sysseek(FILE, 3*$page_size, 0)||die "Unable to seek $fname\n"; +my $corrupted = $page; +# Set FIL_PAGE_LSN to the maximum +substr($corrupted, 16, 8) = chr(255) x 8; +substr($corrupted, $page_size - 8, 8) = chr(255) x 8; +if ($algo =~ /full_crc32/) +{ + my $ck = mycrc32(substr($corrupted, 0, $page_size - 4), 0, $polynomial); + substr($corrupted, $page_size - 4, 4) = pack("N", $ck); +} +else +{ + # Replace the innodb_checksum_algorithm=crc32 checksum + my $ck= pack("N", + mycrc32(substr($corrupted, 4, 22), 0, $polynomial) ^ + mycrc32(substr($corrupted_, 38, $page_size - 38 - 8), 0, + $polynomial)); + substr ($corrupted, 0, 4) = $ck; + substr ($corrupted, $page_size - 8, 4) = $ck; +} +syswrite(FILE, $corrupted); +close FILE; + +my $fname= "$ENV{'MYSQLD_DATADIR'}test/t2.ibd"; +open(FILE, "+<", $fname) or die; +FILE->autoflush(1); +binmode FILE; +seek(FILE, 3 * $ENV{'INNODB_PAGE_SIZE'}, SEEK_SET); +print FILE chr(0) x ($ENV{'INNODB_PAGE_SIZE'}); +close FILE; +EOF + +# Successful recover from doublewrite buffer +let $restart_parameters=; +--source include/start_mysqld.inc +let SEARCH_FILE= $MYSQLTEST_VARDIR/log/mysqld.1.err; +let SEARCH_PATTERN=InnoDB: Recovered page \\[page id: space=[1-9]*, page number=3\\]; +--source include/search_pattern_in_file.inc + +check table t1; +check table t2; +select f1, f2 from t1; +select f1, f2 from t2; +drop table t2, t1; diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index c6adcb675f763..da0b5851dd732 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -3779,6 +3779,16 @@ dberr_t buf_page_t::read_complete(const fil_node_t &node) if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) { release_page: + if (node.space->full_crc32() && node.space->crypt_data && + recv_recovery_is_on() && + recv_sys.dblwr.find_encrypted_page(node, id().page_no(), + const_cast(read_frame))) + { + /* Recover from doublewrite buffer */ + err= DB_SUCCESS; + goto success_page; + } + if (recv_sys.free_corrupted_page(expected_id, node)); else if (err == DB_FAIL) err= DB_PAGE_CORRUPTED; @@ -3800,6 +3810,7 @@ dberr_t buf_page_t::read_complete(const fil_node_t &node) buf_pool.corrupted_evict(this, buf_page_t::READ_FIX); return err; } +success_page: const bool recovery= recv_recovery_is_on(); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index f53710caf1dc1..07ae644f2636d 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -370,6 +370,7 @@ void buf_dblwr_t::recover() srv_page_size)); byte *const buf= read_buf + srv_page_size; + std::deque encrypted_pages; for (recv_dblwr_t::list::iterator i= recv_sys.dblwr.pages.begin(); i != recv_sys.dblwr.pages.end(); ++i, ++page_no_dblwr) { @@ -385,8 +386,16 @@ void buf_dblwr_t::recover() fil_space_t *space= fil_space_t::get(space_id); if (!space) - /* The tablespace that this page once belonged to does not exist */ + { + /* These pages doesn't belong to existing tablespace. + There is a possibility that these pages could be + encrypted using full_crc32 format. If innodb encounters + any corrupted encrypted page during recovery then + InnoDB should use this page to find the valid page. + See find_encrypted_page() */ + encrypted_pages.push_back(*i); continue; + } if (UNIV_UNLIKELY(page_no >= space->get_size())) { @@ -465,6 +474,8 @@ void buf_dblwr_t::recover() } recv_sys.dblwr.pages.clear(); + for (auto it : encrypted_pages) + recv_sys.dblwr.pages.push_back(it); fil_flush_file_spaces(); aligned_free(read_buf); } diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index fb7028126c886..d04fb6aa2343a 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -150,6 +150,17 @@ struct recv_dblwr_t const fil_space_t *space= nullptr, byte *tmp_buf= nullptr) const noexcept; + /** Find the doublewrite copy of an encrypted page with the + smallest FIL_PAGE_LSN that is large enough for recovery. + @param space tablespace object + @param page_no page number to find + @param buf unencrypted page + @retval true if encrypted page found in doublewrite buffer + @retval false otherwise */ + bool find_encrypted_page(const fil_node_t &space, + uint32_t page_no, + byte *buf); + /** Restore the first page of the given tablespace from doublewrite buffer. 1) Find the page which has page_no as 0 @@ -257,8 +268,9 @@ struct recv_sys_t during log scan or apply */ bool found_corrupt_fs; public: - /** whether we are applying redo log records during crash recovery */ - bool recovery_on; + /** whether we are applying redo log records during crash recovery. + This is protected by recv_sys.mutex */ + Atomic_relaxed recovery_on= false; /** whether recv_recover_page(), invoked from buf_page_t::read_complete(), should apply log records*/ bool apply_log_recs; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index cb40831327947..1d73de7d5b6a0 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -3785,6 +3785,7 @@ void recv_sys_t::apply(bool last_batch) /* We skipped this in buf_page_create(). */ mlog_init.mark_ibuf_exist(); mlog_init.clear(); + dblwr.pages.clear(); } else { @@ -4841,6 +4842,47 @@ bool recv_dblwr_t::validate_page(const page_id_t page_id, lsn_t max_lsn, goto check_if_corrupted; } +bool recv_dblwr_t::find_encrypted_page(const fil_node_t &node, + uint32_t page_no, + byte *buf) +{ + ut_ad(node.space->crypt_data); + ut_ad(node.space->full_crc32()); + mysql_mutex_lock(&recv_sys.mutex); + for (list::iterator page_it= pages.begin(); page_it != pages.end(); + page_it++) + { + const byte *page= *page_it; + if (page_get_page_no(page) != page_no || + buf_page_is_corrupted(true, page, node.space->flags)) + continue; + memcpy(buf, page, node.space->physical_size()); + buf_tmp_buffer_t* slot= buf_pool.io_buf_reserve(false); + ut_a(slot); + slot->allocate(); + bool invalidate= + !fil_space_decrypt(node.space, slot->crypt_buf, buf) || + (node.space->is_compressed() && + !fil_page_decompress(slot->crypt_buf, buf, node.space->flags)); + slot->release(); + + if (invalidate || + mach_read_from_4(buf + FIL_PAGE_SPACE_ID) != node.space->id) + continue; + + pages.erase(page_it); + sql_print_information("InnoDB: Recovered page [page id: space=" + UINT32PF ", page number=" UINT32PF "] " + "to '%s' from the doublewrite buffer.", + uint32_t(node.space->id), page_no, + node.name); + mysql_mutex_unlock(&recv_sys.mutex); + return true; + } + mysql_mutex_unlock(&recv_sys.mutex); + return false; +} + const byte *recv_dblwr_t::find_page(const page_id_t page_id, lsn_t max_lsn, const fil_space_t *space, byte *tmp_buf) const noexcept diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index c3397c88336aa..cce1e5e359f02 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -324,8 +324,8 @@ static dberr_t create_log_file(bool create_new_db, lsn_t lsn, srv_startup_is_before_trx_rollback_phase = false; } - /* Enable checkpoints in buf_flush_page_cleaner(). */ - recv_sys.recovery_on = false; + ut_ad(!recv_sys.recovery_on); + mysql_mutex_unlock(&log_sys.mutex); log_make_checkpoint(); @@ -1351,8 +1351,7 @@ dberr_t srv_start(bool create_new_db) return(srv_init_abort(DB_ERROR)); } - /* Enable checkpoints in the page cleaner. */ - recv_sys.recovery_on = false; + ut_ad(!recv_sys.recovery_on); err= recv_recovery_read_max_checkpoint(); @@ -1507,8 +1506,6 @@ dberr_t srv_start(bool create_new_db) : recv_recovery_from_checkpoint_start(flushed_lsn); recv_sys.close_files(); - recv_sys.dblwr.pages.clear(); - if (err != DB_SUCCESS) { return(srv_init_abort(err)); }