From ee369d380cb871173983f04d7b877ae5cc913496 Mon Sep 17 00:00:00 2001 From: Dan Wang Date: Tue, 7 Nov 2023 15:16:49 +0800 Subject: [PATCH] fix: parse timestamp from the name of data dir for gc instead of the last update time (#1667) https://github.com/apache/incubator-pegasus/issues/1673 It's found that sometimes the data directories of replicas are removed immediately after they are renamed with postfixes `.err/.gar`, though actually both of `gc_disk_error_replica_interval_seconds` and `gc_disk_garbage_replica_interval_seconds` have been configured with at least one day. The reason is that the base time for expiration time is **the last write time**, that is, `st_mtime` within `struct stat` returned by `stat()`. Once a long time has passed since the last write time, the data directory will be removed immediately after it is renamed with postfixes `.err/.gar`. To fix this problem, just use the timestamp within the directory name as the base time that is generated when the data directory is renamed with postfixes `.err/.gar`. The last update time would be used iff the timestamp is NOT found within the directory name. --- src/replica/disk_cleaner.cpp | 200 +++++++++++++++++++++---- src/replica/disk_cleaner.h | 23 +-- src/replica/replica_stub.cpp | 2 +- src/replica/test/replica_disk_test.cpp | 80 +++++++--- src/test_util/test_util.h | 14 +- src/utils/macros.h | 28 ++++ src/utils/metrics.h | 3 +- 7 files changed, 275 insertions(+), 75 deletions(-) create mode 100644 src/utils/macros.h diff --git a/src/replica/disk_cleaner.cpp b/src/replica/disk_cleaner.cpp index 9488c32a69..cc9ffeb661 100644 --- a/src/replica/disk_cleaner.cpp +++ b/src/replica/disk_cleaner.cpp @@ -19,11 +19,13 @@ #include "disk_cleaner.h" +#include #include #include #include #include #include +#include #include "common/fs_manager.h" #include "metadata_types.h" @@ -32,6 +34,9 @@ #include "utils/filesystem.h" #include "utils/flags.h" #include "utils/fmt_logging.h" +#include "utils/macros.h" +#include "utils/string_conv.h" +#include "utils/string_view.h" namespace dsn { namespace replication { @@ -70,6 +75,108 @@ const std::string kFolderSuffixBak = ".bak"; const std::string kFolderSuffixOri = ".ori"; const std::string kFolderSuffixTmp = ".tmp"; +namespace { + +// TODO(wangdan): we could study later whether ctime (i.e. `st_ctime` within `struct stat`, +// the time of last status change) could be used instead of mtime (i.e. `st_ctime` within +// `struct stat`, the last write time), since ctime of the new directory would be updated +// to the current time once rename() is called, while mtime would not be updated. +bool get_expiration_timestamp_by_last_write_time(const std::string &path, + uint64_t delay_seconds, + uint64_t &expiration_timestamp_s) +{ + time_t last_write_time_s; + if (!dsn::utils::filesystem::last_write_time(path, last_write_time_s)) { + LOG_WARNING("gc_disk: failed to get last write time of {}", path); + return false; + } + + expiration_timestamp_s = static_cast(last_write_time_s) + delay_seconds; + return true; +} + +// Unix timestamp in microseconds for 2010-01-01 00:00:00 GMT+0000. +// This timestamp could be used as the minimum, since it's far earlier than the time when +// Pegasus was born. +#define MIN_TIMESTAMP_US 1262304000000000 +#define MIN_TIMESTAMP_US_LENGTH (sizeof(STRINGIFY(MIN_TIMESTAMP_US)) - 1) + +// Parse timestamp from the directory name. +// +// There are only 2 kinds of directory names that could include timestamp: one is the faulty +// replicas whose name has suffix ".err"; another is the dropped replicas whose name has +// suffix ".gar". The examples for both kinds of directory names: +// 1.1.pegasus.1698843209235962.err +// 1.2.pegasus.1698843214240709.gar +// +// Specify the size of suffix by `suffix_size`. For both kinds of names (.err and .gar), +// `suffix_size` is 4. +// +// The timestamp is the number just before the suffix, between the 2 dots. For example, in +// 1.1.pegasus.1698843209235962.err, 1698843209235962 is the timestamp in microseconds, +// generated by dsn_now_us(). +// +// `timestamp_us` is parsed result while returning true; otherwise, it would never be assigned. +bool parse_timestamp_us(const std::string &name, size_t suffix_size, uint64_t ×tamp_us) +{ + CHECK_GE(name.size(), suffix_size); + + if (suffix_size == name.size()) { + return false; + } + + const size_t end_idx = name.size() - suffix_size; + auto begin_idx = name.find_last_of('.', end_idx - 1); + if (begin_idx == std::string::npos || ++begin_idx >= end_idx) { + return false; + } + + const auto length = end_idx - begin_idx; + if (length < MIN_TIMESTAMP_US_LENGTH) { + return false; + } + + // std::isdigit() is not an addressable standard library function, thus it can't be used + // directly as an algorithm predicate. + // + // See following docs for details. + // https://stackoverflow.com/questions/75868796/differences-between-isdigit-and-stdisdigit + // https://en.cppreference.com/w/cpp/string/byte/isdigit + const auto begin_itr = name.cbegin() + begin_idx; + if (!std::all_of( + begin_itr, begin_itr + length, [](unsigned char c) { return std::isdigit(c); })) { + return false; + } + + const auto ok = + dsn::buf2uint64(dsn::string_view(name.data() + begin_idx, length), timestamp_us); + return ok ? timestamp_us > MIN_TIMESTAMP_US : false; +} + +bool get_expiration_timestamp(const std::string &name, + const std::string &path, + size_t suffix_size, + uint64_t delay_seconds, + uint64_t &expiration_timestamp_s) +{ + uint64_t timestamp_us = 0; + if (!parse_timestamp_us(name, suffix_size, timestamp_us)) { + // Once the timestamp could not be extracted from the directory name, the last write time + // would be used as the base time to compute the expiration time. + LOG_WARNING("gc_disk: failed to parse timestamp from {}, turn to " + "the last write time for {}", + name, + path); + return get_expiration_timestamp_by_last_write_time( + path, delay_seconds, expiration_timestamp_s); + } + + expiration_timestamp_s = timestamp_us / 1000000 + delay_seconds; + return true; +} + +} // anonymous namespace + error_s disk_remove_useless_dirs(const std::vector> &dir_nodes, /*output*/ disk_cleaning_report &report) { @@ -87,59 +194,85 @@ error_s disk_remove_useless_dirs(const std::vector> &d } sub_list.insert(sub_list.end(), tmp_list.begin(), tmp_list.end()); } - for (auto &fpath : sub_list) { - auto name = dsn::utils::filesystem::get_file_name(fpath); - if (!is_data_dir_removable(name)) { - continue; - } - std::string folder_suffix = name.substr(name.length() - 4); - - time_t mt; - if (!dsn::utils::filesystem::last_write_time(fpath, mt)) { - LOG_WARNING("gc_disk: failed to get last write time of {}", fpath); - continue; - } - auto last_write_time = (uint64_t)mt; - uint64_t current_time_ms = dsn_now_ms(); - uint64_t remove_interval_seconds = current_time_ms / 1000; + for (const auto &path : sub_list) { + uint64_t expiration_timestamp_s = 0; - // don't delete ".bak" directory because it is backed by administrator. - if (folder_suffix == kFolderSuffixErr) { + // Note: don't delete ".bak" directory since it could be did by administrator. + const auto name = dsn::utils::filesystem::get_file_name(path); + if (boost::algorithm::ends_with(name, kFolderSuffixErr)) { report.error_replica_count++; - remove_interval_seconds = FLAGS_gc_disk_error_replica_interval_seconds; - } else if (folder_suffix == kFolderSuffixGar) { + if (!get_expiration_timestamp(name, + path, + kFolderSuffixErr.size(), + FLAGS_gc_disk_error_replica_interval_seconds, + expiration_timestamp_s)) { + continue; + } + } else if (boost::algorithm::ends_with(name, kFolderSuffixGar)) { report.garbage_replica_count++; - remove_interval_seconds = FLAGS_gc_disk_garbage_replica_interval_seconds; - } else if (folder_suffix == kFolderSuffixTmp) { + if (!get_expiration_timestamp(name, + path, + kFolderSuffixGar.size(), + FLAGS_gc_disk_garbage_replica_interval_seconds, + expiration_timestamp_s)) { + continue; + } + } else if (boost::algorithm::ends_with(name, kFolderSuffixTmp)) { report.disk_migrate_tmp_count++; - remove_interval_seconds = FLAGS_gc_disk_migration_tmp_replica_interval_seconds; - } else if (folder_suffix == kFolderSuffixOri) { + if (!get_expiration_timestamp_by_last_write_time( + path, + FLAGS_gc_disk_migration_tmp_replica_interval_seconds, + expiration_timestamp_s)) { + continue; + } + } else if (boost::algorithm::ends_with(name, kFolderSuffixOri)) { report.disk_migrate_origin_count++; - remove_interval_seconds = FLAGS_gc_disk_migration_origin_replica_interval_seconds; + if (!get_expiration_timestamp_by_last_write_time( + path, + FLAGS_gc_disk_migration_origin_replica_interval_seconds, + expiration_timestamp_s)) { + continue; + } + } else { + continue; } - if (last_write_time + remove_interval_seconds <= current_time_ms / 1000) { - if (!dsn::utils::filesystem::remove_path(fpath)) { - LOG_WARNING("gc_disk: failed to delete directory '{}', time_used_ms = {}", - fpath, - dsn_now_ms() - current_time_ms); - } else { + const auto current_time_ms = dsn_now_ms(); + if (expiration_timestamp_s <= current_time_ms / 1000) { + if (dsn::utils::filesystem::remove_path(path)) { LOG_WARNING("gc_disk: replica_dir_op succeed to delete directory '{}'" ", time_used_ms = {}", - fpath, + path, dsn_now_ms() - current_time_ms); report.remove_dir_count++; + } else { + LOG_WARNING("gc_disk: failed to delete directory '{}', time_used_ms = {}", + path, + dsn_now_ms() - current_time_ms); } } else { LOG_INFO("gc_disk: reserve directory '{}', wait_seconds = {}", - fpath, - last_write_time + remove_interval_seconds - current_time_ms / 1000); + path, + expiration_timestamp_s - current_time_ms / 1000); } } return error_s::ok(); } +bool is_data_dir_removable(const std::string &dir) +{ + return boost::algorithm::ends_with(dir, kFolderSuffixErr) || + boost::algorithm::ends_with(dir, kFolderSuffixGar) || + boost::algorithm::ends_with(dir, kFolderSuffixTmp) || + boost::algorithm::ends_with(dir, kFolderSuffixOri); +} + +bool is_data_dir_invalid(const std::string &dir) +{ + return is_data_dir_removable(dir) || boost::algorithm::ends_with(dir, kFolderSuffixBak); +} + void move_to_err_path(const std::string &path, const std::string &log_prefix) { const std::string new_path = fmt::format("{}.{}{}", path, dsn_now_us(), kFolderSuffixErr); @@ -150,5 +283,6 @@ void move_to_err_path(const std::string &path, const std::string &log_prefix) new_path); LOG_WARNING("{}: succeed to move directory from '{}' to '{}'", log_prefix, path, new_path); } + } // namespace replication } // namespace dsn diff --git a/src/replica/disk_cleaner.h b/src/replica/disk_cleaner.h index 7961d084cf..9e2edb7523 100644 --- a/src/replica/disk_cleaner.h +++ b/src/replica/disk_cleaner.h @@ -55,26 +55,13 @@ struct disk_cleaning_report extern error_s disk_remove_useless_dirs(const std::vector> &dir_nodes, /*output*/ disk_cleaning_report &report); -inline bool is_data_dir_removable(const std::string &dir) -{ - if (dir.length() < 4) { - return false; - } - const std::string folder_suffix = dir.substr(dir.length() - 4); - return (folder_suffix == kFolderSuffixErr || folder_suffix == kFolderSuffixGar || - folder_suffix == kFolderSuffixTmp || folder_suffix == kFolderSuffixOri); -} +bool is_data_dir_removable(const std::string &dir); -// Note: ".bak" is invalid but not allow delete, because it can be backed by administrator. -inline bool is_data_dir_invalid(const std::string &dir) -{ - if (dir.length() < 4) { - return false; - } - const std::string folder_suffix = dir.substr(dir.length() - 4); - return is_data_dir_removable(dir) || folder_suffix == kFolderSuffixBak; -} +// Note: ".bak" is invalid but not allowed to be deleted, because it could be did by +// administrator on purpose. +bool is_data_dir_invalid(const std::string &dir); void move_to_err_path(const std::string &path, const std::string &log_prefix); + } // namespace replication } // namespace dsn diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index c5bd5000a9..e1815b48bf 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -1754,7 +1754,7 @@ void replica_stub::on_gc_replica(replica_stub_ptr this_, gpid id) CHECK( dsn::utils::filesystem::directory_exists(replica_path), "dir({}) not exist", replica_path); LOG_INFO("start to move replica({}) as garbage, path: {}", id, replica_path); - const auto rename_path = fmt::format("{}.{}.gar", replica_path, dsn_now_us()); + const auto rename_path = fmt::format("{}.{}{}", replica_path, dsn_now_us(), kFolderSuffixGar); if (!dsn::utils::filesystem::rename_path(replica_path, rename_path)) { LOG_WARNING("gc_replica: failed to move directory '{}' to '{}'", replica_path, rename_path); diff --git a/src/replica/test/replica_disk_test.cpp b/src/replica/test/replica_disk_test.cpp index 3bf7459347..6717ffd402 100644 --- a/src/replica/test/replica_disk_test.cpp +++ b/src/replica/test/replica_disk_test.cpp @@ -17,6 +17,7 @@ * under the License. */ +#include // IWYU pragma: no_include // IWYU pragma: no_include // IWYU pragma: no_include @@ -42,6 +43,7 @@ #include "replica/test/mock_utils.h" #include "replica_admin_types.h" #include "replica_disk_test_base.h" +#include "runtime/api_layer1.h" #include "runtime/rpc/rpc_holder.h" #include "test_util/test_util.h" #include "utils/autoref_ptr.h" @@ -198,46 +200,86 @@ TEST_P(replica_disk_test, on_query_disk_info_one_app) } } +TEST_P(replica_disk_test, check_data_dir_removable) +{ + struct test_case + { + std::string path; + bool expected_removable; + bool expected_invalid; + } tests[] = {{"./replica.0.err", true, true}, + {"./replica.1.gar", true, true}, + {"./replica.2.tmp", true, true}, + {"./replica.3.ori", true, true}, + {"./replica.4.bak", false, true}, + {"./replica.5.abcde", false, false}, + {"./replica.6.x", false, false}, + {"./replica.7.8", false, false}}; + + for (const auto &test : tests) { + EXPECT_EQ(test.expected_removable, is_data_dir_removable(test.path)); + EXPECT_EQ(test.expected_invalid, is_data_dir_invalid(test.path)); + } +} + TEST_P(replica_disk_test, gc_disk_useless_dir) { + PRESERVE_FLAG(gc_disk_error_replica_interval_seconds); + PRESERVE_FLAG(gc_disk_garbage_replica_interval_seconds); + PRESERVE_FLAG(gc_disk_migration_origin_replica_interval_seconds); + PRESERVE_FLAG(gc_disk_migration_tmp_replica_interval_seconds); + FLAGS_gc_disk_error_replica_interval_seconds = 1; FLAGS_gc_disk_garbage_replica_interval_seconds = 1; FLAGS_gc_disk_migration_origin_replica_interval_seconds = 1; FLAGS_gc_disk_migration_tmp_replica_interval_seconds = 1; - std::vector tests{ - "./replica1.err", - "./replica2.err", - "./replica.gar", - "./replica.tmp", - "./replica.ori", - "./replica.bak", - "./replica.1.1", - }; + struct test_case + { + std::string path; + bool expected_exists; + } tests[] = {{"./replica1.err", false}, + {"./replica2.err", false}, + {"./replica.gar", false}, + {"./replica.tmp", false}, + {"./replica.ori", false}, + {"./replica.bak", true}, + {"./replica.1.1", true}, + {"./1.1.pegasus.1234567890.err", false}, + {"./1.2.pegasus.0123456789.gar", false}, + {"./2.1.pegasus.1234567890123456.err", false}, + {"./2.2.pegasus.1234567890abcdef.gar", false}, + {fmt::format("./1.1.pegasus.{}.err", dsn_now_us()), false}, + {fmt::format("./2.1.pegasus.{}.gar", dsn_now_us()), false}, + {fmt::format("./1.2.pegasus.{}.gar", dsn_now_us() + 1000 * 1000 * 1000), true}, + {fmt::format("./2.2.pegasus.{}.err", dsn_now_us() + 1000 * 1000 * 1000), true}}; for (const auto &test : tests) { - utils::filesystem::create_directory(test); - ASSERT_TRUE(utils::filesystem::directory_exists(test)); + // Ensure that every directory does not exist and should be created. + CHECK_TRUE(utils::filesystem::create_directory(test.path)); + ASSERT_TRUE(utils::filesystem::directory_exists(test.path)); } sleep(5); disk_cleaning_report report{}; - dsn::replication::disk_remove_useless_dirs({std::make_shared("test", "./")}, report); + ASSERT_TRUE(dsn::replication::disk_remove_useless_dirs( + {std::make_shared("test", "./")}, report)); for (const auto &test : tests) { - if (!dsn::replication::is_data_dir_removable(test)) { - ASSERT_TRUE(utils::filesystem::directory_exists(test)); - continue; + ASSERT_EQ(test.expected_exists, utils::filesystem::directory_exists(test.path)); + if (test.expected_exists) { + // Delete existing directories, in case that they are mixed with later test cases + // to affect test results. + CHECK_TRUE(dsn::utils::filesystem::remove_path(test.path)); } - ASSERT_FALSE(utils::filesystem::directory_exists(test)); } - ASSERT_EQ(report.remove_dir_count, 5); + ASSERT_EQ(report.remove_dir_count, 11); ASSERT_EQ(report.disk_migrate_origin_count, 1); ASSERT_EQ(report.disk_migrate_tmp_count, 1); - ASSERT_EQ(report.garbage_replica_count, 1); - ASSERT_EQ(report.error_replica_count, 2); + ASSERT_EQ(report.garbage_replica_count, 5); + ASSERT_EQ(report.error_replica_count, 6); } TEST_P(replica_disk_test, disk_status_test) diff --git a/src/test_util/test_util.h b/src/test_util/test_util.h index ba253e7b35..90930c226c 100644 --- a/src/test_util/test_util.h +++ b/src/test_util/test_util.h @@ -19,16 +19,20 @@ #pragma once +#include +#include #include #include #include -#include #include -#include #include #include "runtime/api_layer1.h" #include "utils/env.h" +// IWYU refused to include "utils/defer.h" everywhere, both in .h and .cpp files. +// However, once "utils/defer.h" is not included, it is inevitable that compilation +// will fail since dsn::defer is referenced. Thus force IWYU to keep it. +#include "utils/defer.h" // IWYU pragma: keep #include "utils/flags.h" #include "utils/test_macros.h" @@ -40,6 +44,12 @@ class file_meta; DSN_DECLARE_bool(encrypt_data_at_rest); +// Save the current value of a flag and restore it at the end of the function. +#define PRESERVE_FLAG(name) \ + auto PRESERVED_FLAGS_##name = FLAGS_##name; \ + auto PRESERVED_FLAGS_##name##_cleanup = \ + dsn::defer([PRESERVED_FLAGS_##name]() { FLAGS_##name = PRESERVED_FLAGS_##name; }) + namespace pegasus { // A base parameterized test class for testing enable/disable encryption at rest. diff --git a/src/utils/macros.h b/src/utils/macros.h new file mode 100644 index 0000000000..c44c45e252 --- /dev/null +++ b/src/utils/macros.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +// Suppose there is a macro defined as: +// #define FOO 123 +// +// Once we need the value represented by FOO to be a string, i.e. "123", just do: +// STRINGIFY(FOO) +// +// See https://gcc.gnu.org/onlinedocs/gcc-4.8.5/cpp/Stringification.html for details. +#define STRINGIFY_HELPER(x) #x +#define STRINGIFY(x) STRINGIFY_HELPER(x) diff --git a/src/utils/metrics.h b/src/utils/metrics.h index 27c6355f32..75afc31d89 100644 --- a/src/utils/metrics.h +++ b/src/utils/metrics.h @@ -46,6 +46,7 @@ #include "utils/enum_helper.h" #include "utils/fmt_logging.h" #include "utils/long_adder.h" +#include "utils/macros.h" #include "utils/nth_element.h" #include "utils/ports.h" #include "utils/singleton.h" @@ -973,8 +974,6 @@ struct kth_percentile_property double decimal; }; -#define STRINGIFY_HELPER(x) #x -#define STRINGIFY(x) STRINGIFY_HELPER(x) #define STRINGIFY_KTH_PERCENTILE_NAME(kth) STRINGIFY(KTH_PERCENTILE_NAME(kth)) #define KTH_TO_DECIMAL(kth) 0.##kth #define KTH_PERCENTILE_PROPERTY_LIST(kth) \