From 8078ba2b613a83d0b59780e43c03f2751cd0b36f Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 13 Apr 2021 09:30:38 -0700 Subject: [PATCH] Fix contention in MDL_key::ACL_CACHE Summary: In 5.6 acl cache is protected by mutex which causes quite a bit of contention (especially if you have lots of acls), so we changed that to a mysql_rwlock_t in 5.6. In 8.0 this is replaced with a singleton MDL lock with namespace = MDL_key::ACL_CACHE with SHARED and EXCLUSIVE lock. However MDL lock has higher overhead than regular mysql_rwlock_t due to the way it maintains the tickets for granting/waiting and another internal mysql_prlock_rwlock (MDL_lock::m_rwlock) to update/traverse them, and therefore is more prone to contentions in high connection scenarios. In this particular case, the deadlock detection for ACL_CACHE leads to bad contention problems in the connection path, because taking rdlock on MDL_lock::m_rwlock while doing deadlock traversal takes longer with more granted and waiting tickets, but the releasing of these granted tickets are themselves blocked on taking wrlock on m_rwlock, meanwhile more connections are trying to come in and grant more tickets and taking rdlock as well in deadlock detection as well, so this becomes a serious lock contention that you can't get out of if you keep new connection come in at a fast pace. A easy way to trigger this is to run FLUSH PRIVILEGES on a server that has lots of incoming connections (with low connection timeout) with lots of connection from acl_check_host / has_global_grant. For ACL_CACHE, the deadlock detection in most cases doesn't provide a ton of value in production, because under ACL_cache lock we only operate on mysql.user/db/etc special system tables, so you'd need to take exclusive lock on them and then go through acl cache locking in order to trigger the deadlock detection, which seems quite unlikely in practice. We didn't see any issues in the past for 5.6 either, which was just a mutex and later we changed it to mysql_rwlock_t and had 0 deadlock detection. Therefore, I'm adding a new switch enable_acl_cache_deadlock_detection to turn it off for ACL_CACHE only. It defaults to on to maintain compat and we can turn it off in production. There are probably other options to fix this issue short term (such as replacing the ACL_cache lock with mysql_rwlock_t lock, moving allow_all_hosts check out of the lock, etc) with different risk trade offs, and longer term fixes that require bigger surgery to MDL locking system which probably should best left to upstream. This seems to be the lowest risk option at the moment. `#ifndef EXTRA_CODE_FOR_UNIT_TESTING` is needed to avoid compilation failure in locking unit tests (which doesn't seem to compile in the sys vars). Reviewed By: yoshinorim Differential Revision: D27740302 --- mysql-test/r/mysqld--help-notwin.result | 4 + ..._acl_cache_deadlock_detection_basic.result | 40 ++++++++++ ...le_acl_cache_deadlock_detection_basic.test | 75 +++++++++++++++++++ mysql-test/t/all_persisted_variables.test | 3 +- sql/mdl.cc | 24 ++++++ sql/mysqld.cc | 1 + sql/mysqld.h | 1 + sql/sys_vars.cc | 6 ++ 8 files changed, 153 insertions(+), 1 deletion(-) create mode 100644 mysql-test/suite/sys_vars/r/enable_acl_cache_deadlock_detection_basic.result create mode 100644 mysql-test/suite/sys_vars/t/enable_acl_cache_deadlock_detection_basic.test diff --git a/mysql-test/r/mysqld--help-notwin.result b/mysql-test/r/mysqld--help-notwin.result index 301690f1aea3..7c0ef4ab8f3e 100644 --- a/mysql-test/r/mysqld--help-notwin.result +++ b/mysql-test/r/mysqld--help-notwin.result @@ -408,6 +408,9 @@ The following options may be given as the first argument: before storage engine initialization, where each plugin is identified as name=library, where name is the plugin name and library is the plugin library in plugin_dir. + --enable-acl-cache-deadlock-detection + Enable deadlock detection on ACL_CACHE MDL lock + (Defaults to on; use --skip-enable-acl-cache-deadlock-detection to disable.) --enable-acl-db-cache Enable ACL db_cache lookup on (ip, user, db). Please issue FLUSH PRIVILEGES for the changes to take effect. @@ -2839,6 +2842,7 @@ disabled-storage-engines disallow-raft FALSE disconnect-on-expired-password TRUE div-precision-increment 4 +enable-acl-cache-deadlock-detection TRUE enable-acl-db-cache TRUE enable-acl-fast-lookup FALSE enable-binlog-hlc FALSE diff --git a/mysql-test/suite/sys_vars/r/enable_acl_cache_deadlock_detection_basic.result b/mysql-test/suite/sys_vars/r/enable_acl_cache_deadlock_detection_basic.result new file mode 100644 index 000000000000..4055a0e84a86 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/enable_acl_cache_deadlock_detection_basic.result @@ -0,0 +1,40 @@ +SET @start_global_value = @@GLOBAL.enable_acl_cache_deadlock_detection; +SELECT @start_global_value; +@start_global_value +1 +SET @@GLOBAL.enable_acl_cache_deadlock_detection = on; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = DEFAULT; +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +1 +FLUSH PRIVILEGES; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = off; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = DEFAULT; +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +1 +FLUSH PRIVILEGES; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = on; +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +1 +FLUSH PRIVILEGES; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = off; +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +0 +FLUSH PRIVILEGES; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = something; +ERROR 42000: Variable 'enable_acl_cache_deadlock_detection' can't be set to the value of 'something' +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +0 +SET @@GLOBAL.enable_acl_cache_deadlock_detection = somethingelse; +ERROR 42000: Variable 'enable_acl_cache_deadlock_detection' can't be set to the value of 'somethingelse' +select @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +0 +SET @@GLOBAL.enable_acl_cache_deadlock_detection = @start_global_value; +SELECT @@GLOBAL.enable_acl_cache_deadlock_detection; +@@GLOBAL.enable_acl_cache_deadlock_detection +1 diff --git a/mysql-test/suite/sys_vars/t/enable_acl_cache_deadlock_detection_basic.test b/mysql-test/suite/sys_vars/t/enable_acl_cache_deadlock_detection_basic.test new file mode 100644 index 000000000000..a6bdf7af7b66 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/enable_acl_cache_deadlock_detection_basic.test @@ -0,0 +1,75 @@ +################ mysql-test\t\enable_acl_cache_deadlock_detection.test ######## +# # +# Variable Name: enable_acl_cache_deadlock_detection # +# Scope: Global # +# # +# Creation Date: 2021-04-13 # +# # +# # +# Description:Test Cases of Dynamic System Variable # +# enable_acl_cache_deadlock_detection that checks the behavior of # +# this variable in the following ways # +# * Default Value # +# * Valid Value # +# * Invalid value # +# # +############################################################################### +SET @start_global_value = @@GLOBAL.enable_acl_cache_deadlock_detection; +SELECT @start_global_value; + +######################################################################## +# Display the DEFAULT value of enable_acl_cache_deadlock_detection # +######################################################################## + +SET @@GLOBAL.enable_acl_cache_deadlock_detection = on; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = DEFAULT; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +# FLUSH PRIVILEGES not required but just a quick check to see acl cache +# is working properly +FLUSH PRIVILEGES; + +SET @@GLOBAL.enable_acl_cache_deadlock_detection = off; +SET @@GLOBAL.enable_acl_cache_deadlock_detection = DEFAULT; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +# FLUSH PRIVILEGES not required but just a quick check to see acl cache +# is working properly +FLUSH PRIVILEGES; + +############################################################################### +# change the value of enable_acl_cache_deadlock_detection to a valid value # +############################################################################### + +SET @@GLOBAL.enable_acl_cache_deadlock_detection = on; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +# FLUSH PRIVILEGES not required but just a quick check to see acl cache +# is working properly +FLUSH PRIVILEGES; + +SET @@GLOBAL.enable_acl_cache_deadlock_detection = off; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +# FLUSH PRIVILEGES not required but just a quick check to see acl cache +# is working properly +FLUSH PRIVILEGES; + +############################################################################### +# change the value of enable_acl_cache_deadlock_detection to an invalid value # +############################################################################### + +--Error ER_WRONG_VALUE_FOR_VAR +SET @@GLOBAL.enable_acl_cache_deadlock_detection = something; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +--Error ER_WRONG_VALUE_FOR_VAR +SET @@GLOBAL.enable_acl_cache_deadlock_detection = somethingelse; +select @@GLOBAL.enable_acl_cache_deadlock_detection; + +############################### +# Restore initial value # +############################### + +SET @@GLOBAL.enable_acl_cache_deadlock_detection = @start_global_value; +SELECT @@GLOBAL.enable_acl_cache_deadlock_detection; diff --git a/mysql-test/t/all_persisted_variables.test b/mysql-test/t/all_persisted_variables.test index 5ab248082211..0b9b308ee9e9 100644 --- a/mysql-test/t/all_persisted_variables.test +++ b/mysql-test/t/all_persisted_variables.test @@ -61,9 +61,10 @@ let $total_excluded_vars=`SELECT COUNT(*) FROM performance_schema.global_variabl 'binlog_file_basedir', 'binlog_index_basedir', 'default_collation_for_utf8mb4_init', +'disable_raft_log_repointing', +'enable_acl_cache_deadlock_detection', 'enable_acl_db_cache', 'enable_acl_fast_lookup', -'disable_raft_log_repointing', 'group_replication_plugin_hooks', 'gtid_committed', 'gtid_purged_for_tailing', diff --git a/sql/mdl.cc b/sql/mdl.cc index 76b92f5115de..26634542f5c3 100644 --- a/sql/mdl.cc +++ b/sql/mdl.cc @@ -4030,6 +4030,30 @@ bool MDL_lock::visit_subgraph(MDL_ticket *waiting_ticket, MDL_context *src_ctx = waiting_ticket->get_ctx(); bool result = true; +#ifndef EXTRA_CODE_FOR_UNIT_TESTING + if (!enable_acl_cache_deadlock_detection && + key.mdl_namespace() == MDL_key::ACL_CACHE) { + /* + Deadlock detection for ACL_CACHE may lead to bad contention problems + in the connection path. In particular taking rdlock on m_rwlock + while doing deadlock traversal takes longer with more granted and + waiting tickets, but the releasing of these granted tickets are + themselves blocked on taking wrlock on m_rwlock, meanwhile more + connections are trying to come in and grant more tickets and + taking wrlock as well, so this becomes a serious lock contention + that you can't get out of if you keep new connection come in at + a fast pace. A easy way to trigger this is to run FLUSH + PRIVILEGES on a server that has lots of incoming connections with + lots of connection from acl_check_host / has_global_grant. + + For ACL_CACHE, the deadlock detection in most cases doesn't provide + a ton of value in production so this allows turning it off for + ACL_CACHE only. + */ + return false; + } +#endif + mysql_prlock_rdlock(&m_rwlock); /* diff --git a/sql/mysqld.cc b/sql/mysqld.cc index c96db5f7e4a0..608d1623abf0 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -1255,6 +1255,7 @@ double wait_for_hlc_sleep_scaling_factor = 0.75; char *default_collation_for_utf8mb4_init = nullptr; bool enable_blind_replace = false; bool enable_acl_fast_lookup = false; +bool enable_acl_cache_deadlock_detection = false; bool enable_acl_db_cache = true; bool enable_super_log_bin_read_only = false; ulonglong max_tmp_disk_usage{0}; diff --git a/sql/mysqld.h b/sql/mysqld.h index 16d3caf6ce8d..4240d6738c70 100644 --- a/sql/mysqld.h +++ b/sql/mysqld.h @@ -286,6 +286,7 @@ extern ulong wait_for_hlc_sleep_threshold_ms; extern double wait_for_hlc_sleep_scaling_factor; extern char *default_collation_for_utf8mb4_init; extern bool enable_acl_fast_lookup; +extern bool enable_acl_cache_deadlock_detection; extern bool enable_acl_db_cache; extern bool enable_super_log_bin_read_only; diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index c7bbf5517546..f1fa79c5bcf2 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -8962,6 +8962,12 @@ static Sys_var_bool Sys_enable_acl_fast_lookup( DEFAULT(false), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(check_enable_acl_fast_lookup)); +static Sys_var_bool Sys_enable_acl_cache_deadlock_detection( + "enable_acl_cache_deadlock_detection", + "Enable deadlock detection on ACL_CACHE MDL lock", + NON_PERSIST GLOBAL_VAR(enable_acl_cache_deadlock_detection), + CMD_LINE(OPT_ARG), DEFAULT(true)); + static Sys_var_bool Sys_enable_acl_db_cache( "enable_acl_db_cache", "Enable ACL db_cache lookup on (ip, user, db). Please issue "