From caf0ed3ae9ad811e3d92f8a737633e53286a2e48 Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Mon, 9 Dec 2024 16:32:31 -0400 Subject: [PATCH 01/27] Allow the checkpointer to zero the old *-shm pages instead of the writer who does so in the COMMIT block --- sqlitecluster/SQLite.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index e4f63acb2..178ffb035 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -797,7 +797,9 @@ int SQLite::commit(const string& description, function* preCheckpointCal if (_sharedData.outstandingFramesToCheckpoint) { auto start = STimeNow(); int framesCheckpointed = 0; - sqlite3_wal_checkpoint_v2(_db, 0, SQLITE_CHECKPOINT_PASSIVE, NULL, &framesCheckpointed); + sqlite3_busy_timeout(_db, 120'000); // 2 minutes + sqlite3_wal_checkpoint_v2(_db, 0, SQLITE_CHECKPOINT_FULL, NULL, &framesCheckpointed); + sqlite3_busy_timeout(_db, 0); auto end = STimeNow(); SINFO("Checkpointed " << framesCheckpointed << " (total) frames of " << _sharedData.outstandingFramesToCheckpoint << " in " << (end - start) << "us."); From a5792f94e7e7c3f0535987198c2f3bfcfd0aa45a Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 13 Dec 2024 12:35:40 -0800 Subject: [PATCH 02/27] Move delete out of commit lock --- sqlitecluster/SQLite.cpp | 73 +++++++++++++--------------------------- sqlitecluster/SQLite.h | 4 --- 2 files changed, 24 insertions(+), 53 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 1d2fd946a..d1ebf1797 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -164,28 +164,6 @@ vector SQLite::initializeJournal(sqlite3* db, int minJournalTables) { return journalNames; } -uint64_t SQLite::initializeJournalSize(sqlite3* db, const vector& journalNames) { - // We keep track of the number of rows in the journal, so that we can delete old entries when we're over our size - // limit. - // We want the min of all journal tables. - string minQuery = _getJournalQuery(journalNames, {"SELECT MIN(id) AS id FROM"}, true); - minQuery = "SELECT MIN(id) AS id FROM (" + minQuery + ")"; - - // And the max. - string maxQuery = _getJournalQuery(journalNames, {"SELECT MAX(id) AS id FROM"}, true); - maxQuery = "SELECT MAX(id) AS id FROM (" + maxQuery + ")"; - - // Look up the min and max values in the database. - SQResult result; - SASSERT(!SQuery(db, "getting commit min", minQuery, result)); - uint64_t min = SToUInt64(result[0][0]); - SASSERT(!SQuery(db, "getting commit max", maxQuery, result)); - uint64_t max = SToUInt64(result[0][0]); - - // And save the difference as the size of the journal. - return max - min; -} - void SQLite::commonConstructorInitialization(bool hctree) { // Perform sanity checks. SASSERT(!_filename.empty()); @@ -229,7 +207,6 @@ SQLite::SQLite(const string& filename, int cacheSize, int maxJournalSize, _db(initializeDB(_filename, mmapSizeGB, hctree)), _journalNames(initializeJournal(_db, minJournalTables)), _sharedData(initializeSharedData(_db, _filename, _journalNames, hctree)), - _journalSize(initializeJournalSize(_db, _journalNames)), _cacheSize(cacheSize), _mmapSizeGB(mmapSizeGB) { @@ -242,7 +219,6 @@ SQLite::SQLite(const SQLite& from) : _db(initializeDB(_filename, from._mmapSizeGB, false)), // Create a *new* DB handle from the same filename, don't copy the existing handle. _journalNames(from._journalNames), _sharedData(from._sharedData), - _journalSize(from._journalSize), _cacheSize(from._cacheSize), _mmapSizeGB(from._mmapSizeGB) { @@ -665,6 +641,30 @@ bool SQLite::_writeIdempotent(const string& query, SQResult& result, bool always bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { SASSERT(_insideTransaction); + // Pick a journal for this transaction. + const int64_t journalID = _sharedData.nextJournalCount++; + _journalName = _journalNames[journalID % _journalNames.size()]; + + // It's possible to attempt to commit a transaction with no writes. We'll skip truncating the journal in this case to avoid + // Turning a no=op into a write. + if (_uncommittedQuery.size()) { + // Look up the oldest commit in our chosen journal, and compute the oldest commit we intend to keep. + SQResult result; + SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) AS id FROM " + _journalName, result)); + uint64_t minJournalEntry = SToUInt64(result[0][0]); + uint64_t oldestCommitToKeep = _sharedData.commitCount - _maxJournalSize; + + // We limit deletions to a relatively small number to avoid making this extremenly slow for some transactions in the case + // where this journal in particular has accumulated a large backlog. + static const size_t deleteLimit = 10; + if (minJournalEntry < oldestCommitToKeep) { + string query = "DELETE FROM " + _journalName + " WHERE id < " + SQ(oldestCommitToKeep) + " LIMIT " + SQ(deleteLimit); + SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); + size_t deletedCount = sqlite3_changes(_db); + SINFO("Removed " << deletedCount << " rows from journal " << _journalName); + } + } + // We lock this here, so that we can guarantee the order in which commits show up in the database. if (!_mutexLocked) { auto start = STimeNow(); @@ -680,8 +680,6 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { // We pass the journal number selected to the handler so that a caller can utilize the // same method bedrock does for accessing 1 table per thread, in order to attempt to // reduce conflicts on tables that are written to on every command - const int64_t journalID = _sharedData.nextJournalCount++; - _journalName = _journalNames[journalID % _journalNames.size()]; if (_shouldNotifyPluginsOnPrepare) { (*_onPrepareHandler)(*this, journalID); } @@ -738,28 +736,6 @@ int SQLite::commit(const string& description, function* preCheckpointCal SASSERT(!_uncommittedHash.empty()); // Must prepare first int result = 0; - // Do we need to truncate as we go? - uint64_t newJournalSize = _journalSize + 1; - if (newJournalSize > _maxJournalSize) { - // Delete the oldest entry - uint64_t before = STimeNow(); - string query = "DELETE FROM " + _journalName + " " - "WHERE id < (SELECT MAX(id) FROM " + _journalName + ") - " + SQ(_maxJournalSize) + " " - "LIMIT 10"; - SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); - - // Figure out the new journal size. - SQResult result; - SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) AS id FROM " + _journalName, result)); - uint64_t min = SToUInt64(result[0][0]); - SASSERT(!SQuery(_db, "getting commit max", "SELECT MAX(id) AS id FROM " + _journalName, result)); - uint64_t max = SToUInt64(result[0][0]); - newJournalSize = max - min; - - // Log timing info. - _writeElapsed += STimeNow() - before; - } - // Make sure one is ready to commit SDEBUG("Committing transaction"); @@ -798,7 +774,6 @@ int SQLite::commit(const string& description, function* preCheckpointCal } _commitElapsed += STimeNow() - before; - _journalSize = newJournalSize; _sharedData.incrementCommit(_uncommittedHash); _insideTransaction = false; _uncommittedHash.clear(); diff --git a/sqlitecluster/SQLite.h b/sqlitecluster/SQLite.h index 648d0b1a7..15767a7e5 100644 --- a/sqlitecluster/SQLite.h +++ b/sqlitecluster/SQLite.h @@ -354,7 +354,6 @@ class SQLite { static SharedData& initializeSharedData(sqlite3* db, const string& filename, const vector& journalNames, bool hctree); static sqlite3* initializeDB(const string& filename, int64_t mmapSizeGB, bool hctree); static vector initializeJournal(sqlite3* db, int minJournalTables); - static uint64_t initializeJournalSize(sqlite3* db, const vector& journalNames); void commonConstructorInitialization(bool hctree = false); // The filename of this DB, canonicalized to its full path on disk. @@ -375,9 +374,6 @@ class SQLite { // The name of the journal table that this particular DB handle with write to. string _journalName; - // The current size of the journal, in rows. TODO: Why isn't this in SharedData? - uint64_t _journalSize; - // True when we have a transaction in progress. bool _insideTransaction = false; From 09ae2b0f05ce0629824e6bb9d41511e1c4f557b1 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 13 Dec 2024 12:37:09 -0800 Subject: [PATCH 03/27] typo --- sqlitecluster/SQLite.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index d1ebf1797..456156e81 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -646,7 +646,7 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { _journalName = _journalNames[journalID % _journalNames.size()]; // It's possible to attempt to commit a transaction with no writes. We'll skip truncating the journal in this case to avoid - // Turning a no=op into a write. + // Turning a no-op into a write. if (_uncommittedQuery.size()) { // Look up the oldest commit in our chosen journal, and compute the oldest commit we intend to keep. SQResult result; From 85de3d3b1cf9c4821c827eb199b8a91514e69252 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Fri, 13 Dec 2024 13:46:38 -0800 Subject: [PATCH 04/27] Fix integer underflow --- sqlitecluster/SQLite.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 456156e81..49e89c0b1 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -650,9 +650,14 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { if (_uncommittedQuery.size()) { // Look up the oldest commit in our chosen journal, and compute the oldest commit we intend to keep. SQResult result; - SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) AS id FROM " + _journalName, result)); - uint64_t minJournalEntry = SToUInt64(result[0][0]); - uint64_t oldestCommitToKeep = _sharedData.commitCount - _maxJournalSize; + SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) FROM " + _journalName, result)); + uint64_t minJournalEntry = result.size() ? SToUInt64(result[0][0]) : 0; + uint64_t commitCount = _sharedData.commitCount; + + // If the commitCount is less than the max journal size, keep everything. Otherwise, keep everything from + // commitCount - _maxJournalSize forward. We can't just do the last subtraction part because it overflows our unsigned + // int. + uint64_t oldestCommitToKeep = commitCount < _maxJournalSize ? 0 : commitCount - _maxJournalSize; // We limit deletions to a relatively small number to avoid making this extremenly slow for some transactions in the case // where this journal in particular has accumulated a large backlog. @@ -661,7 +666,7 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { string query = "DELETE FROM " + _journalName + " WHERE id < " + SQ(oldestCommitToKeep) + " LIMIT " + SQ(deleteLimit); SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); size_t deletedCount = sqlite3_changes(_db); - SINFO("Removed " << deletedCount << " rows from journal " << _journalName); + SINFO("Removed " << deletedCount << " rows from journal " << _journalName << ", oldestToKeep: " << oldestCommitToKeep << ", count:" << commitCount << ", limit: " << _maxJournalSize); } } From ebcc4a99f77734b46fba2d3bfcd57e1679d18c2a Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Mon, 16 Dec 2024 11:11:28 -0400 Subject: [PATCH 05/27] Update SQLite with more logs, v19 --- libstuff/sqlite3.c | 36 ++++++++++++++++++++++++------------ libstuff/sqlite3.h | 2 +- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/libstuff/sqlite3.c b/libstuff/sqlite3.c index e02dd0087..5381fa9ec 100644 --- a/libstuff/sqlite3.c +++ b/libstuff/sqlite3.c @@ -18,7 +18,7 @@ ** separate file. This file contains only code for the core SQLite library. ** ** The content in this amalgamation comes from Fossil check-in -** 3c25c69c93e55738cdbfdd87fa3c879b8786. +** fa87355f6286be1e92f22a71cbfbfb13d1a4. */ #define SQLITE_CORE 1 #define SQLITE_AMALGAMATION 1 @@ -465,7 +465,7 @@ extern "C" { */ #define SQLITE_VERSION "3.47.0" #define SQLITE_VERSION_NUMBER 3047000 -#define SQLITE_SOURCE_ID "2024-12-12 20:39:56 3c25c69c93e55738cdbfdd87fa3c879b878674973955490770f5e274da1ca9a4" +#define SQLITE_SOURCE_ID "2024-12-13 18:13:51 fa87355f6286be1e92f22a71cbfbfb13d1a478d5fb5b38abedbd78bf903171fa" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -18241,13 +18241,14 @@ struct sqlite3 { #define SCHEMA_TIME_STAT4_Q1_BODY 14 #define SCHEMA_TIME_AFTER_STAT4_Q1 15 #define SCHEMA_TIME_STAT4_Q2_BODY 16 -#define SCHEMA_TIME_AFTER_STAT4_Q2 17 -#define SCHEMA_TIME_AFTER_STAT4 18 +#define SCHEMA_TIME_STAT4_SAMPLE_MALLOC 17 +#define SCHEMA_TIME_AFTER_STAT4_Q2 18 +#define SCHEMA_TIME_AFTER_STAT4 19 -#define SCHEMA_TIME_END_ANALYZE_LOAD 19 -#define SCHEMA_TIME_FINISH 20 +#define SCHEMA_TIME_END_ANALYZE_LOAD 20 +#define SCHEMA_TIME_FINISH 21 -#define SCHEMA_TIME_N 21 +#define SCHEMA_TIME_N 22 #define SCHEMA_TIME_TIMEOUT (500 * 1000) @@ -93491,7 +93492,7 @@ SQLITE_PRIVATE void sqlite3CommitTimeLog(u64 *aCommit){ } zStr = sqlite3_mprintf("%z%s%s%d%s", zStr, (zStr?", ":""),zHash,iVal,zU); } - sqlite3_log(SQLITE_WARNING, "slow commit (v=18): (%s)", zStr); + sqlite3_log(SQLITE_WARNING, "slow commit (v=19): (%s)", zStr); sqlite3_free(zStr); } } @@ -93519,7 +93520,7 @@ SQLITE_PRIVATE void sqlite3PrepareTimeLog(const char *zSql, int nSql, u64 *aPrep } if( nByte<0 ){ nByte = sqlite3Strlen30(zSql); } sqlite3_log(SQLITE_WARNING, - "slow prepare (v=18): (%s) [%.*s]", zStr, nByte, zSql + "slow prepare (v=19): (%s) [%.*s]", zStr, nByte, zSql ); sqlite3_free(zStr); } @@ -93535,12 +93536,13 @@ SQLITE_PRIVATE void sqlite3SchemaTimeLog(u64 *aSchema, const char *zFile){ if( val!=0 && ii!=SCHEMA_TIME_STAT4_Q1_BODY && ii!=SCHEMA_TIME_STAT4_Q2_BODY + && ii!=SCHEMA_TIME_STAT4_SAMPLE_MALLOC ){ val -= i1; } zStr = sqlite3_mprintf("%z%s%d", zStr, (zStr?", ":""), val); } - sqlite3_log(SQLITE_WARNING, "slow schema (%s) (v=18): (%s)", zFile, zStr); + sqlite3_log(SQLITE_WARNING, "slow schema (%s) (v=19): (%s)", zFile, zStr); sqlite3_free(zStr); } } @@ -123568,6 +123570,9 @@ static void decodeIntArray( #endif if( *z==' ' ) z++; } + if( aOut ){ + for(/* no-op */; iaSchemaTime ){ + t = sqlite3STimeNow(); + } + aNew = (IndexSample*)sqlite3DbMallocRaw(db, nByte); if( aNew==0 ) return SQLITE_NOMEM_BKPT; + if( db->aSchemaTime ){ + db->aSchemaTime[SCHEMA_TIME_STAT4_SAMPLE_MALLOC] += (sqlite3STimeNow() - t); + } pPtr = (u8*)aNew; pPtr += ROUND8(nNew*sizeof(pIdx->aSample[0])); @@ -258017,7 +258029,7 @@ static void fts5SourceIdFunc( ){ assert( nArg==0 ); UNUSED_PARAM2(nArg, apUnused); - sqlite3_result_text(pCtx, "fts5: 2024-12-12 20:39:56 3c25c69c93e55738cdbfdd87fa3c879b878674973955490770f5e274da1ca9a4", -1, SQLITE_TRANSIENT); + sqlite3_result_text(pCtx, "fts5: 2024-12-13 18:13:51 fa87355f6286be1e92f22a71cbfbfb13d1a478d5fb5b38abedbd78bf903171fa", -1, SQLITE_TRANSIENT); } /* diff --git a/libstuff/sqlite3.h b/libstuff/sqlite3.h index 2983455f0..fe6bc6016 100644 --- a/libstuff/sqlite3.h +++ b/libstuff/sqlite3.h @@ -148,7 +148,7 @@ extern "C" { */ #define SQLITE_VERSION "3.47.0" #define SQLITE_VERSION_NUMBER 3047000 -#define SQLITE_SOURCE_ID "2024-12-12 20:39:56 3c25c69c93e55738cdbfdd87fa3c879b878674973955490770f5e274da1ca9a4" +#define SQLITE_SOURCE_ID "2024-12-13 18:13:51 fa87355f6286be1e92f22a71cbfbfb13d1a478d5fb5b38abedbd78bf903171fa" /* ** CAPI3REF: Run-Time Library Version Numbers From 5acd465856c55587499396deef2979b75d7e1534 Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Mon, 16 Dec 2024 12:47:10 -0400 Subject: [PATCH 06/27] Allow to pass checkpointMode via CLI to make it dynamically configurable --- BedrockServer.cpp | 13 +++++++++---- main.cpp | 2 ++ sqlitecluster/SQLite.cpp | 32 +++++++++++++++++++++++++------- sqlitecluster/SQLite.h | 5 ++++- sqlitecluster/SQLitePool.cpp | 5 +++-- sqlitecluster/SQLitePool.h | 2 +- 6 files changed, 44 insertions(+), 15 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 98e59414f..cbe7d7e9a 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -97,7 +97,7 @@ void BedrockServer::sync() // We use fewer FDs on test machines that have other resource restrictions in place. SINFO("Setting dbPool size to: " << _dbPoolSize); - _dbPool = make_shared(_dbPoolSize, args["-db"], args.calc("-cacheSize"), args.calc("-maxJournalSize"), journalTables, mmapSizeGB, args.isSet("-hctree")); + _dbPool = make_shared(_dbPoolSize, args["-db"], args.calc("-cacheSize"), args.calc("-maxJournalSize"), journalTables, mmapSizeGB, args.isSet("-hctree"), args["-checkpointMode"]); SQLite& db = _dbPool->getBase(); // Initialize the command processor. @@ -358,7 +358,7 @@ void BedrockServer::sync() committingCommand = true; _syncNode->startCommit(SQLiteNode::QUORUM); _lastQuorumCommandTime = STimeNow(); - + // This interrupts the next poll loop immediately. This prevents a 1-second wait when running as a single server. _notifyDoneSync.push(true); SDEBUG("Finished sending distributed transaction for db upgrade."); @@ -1267,6 +1267,11 @@ BedrockServer::BedrockServer(const SData& args_) sort(versions.begin(), versions.end()); _version = SComposeList(versions, ":"); + const set validCheckpointModes = {"PASSIVE", "FULL", "RESTART", "TRUNCATE"}; + if (validCheckpointModes.find(args["-checkpointMode"]) == validCheckpointModes.end()) { + SERROR("Invalid checkpoint mode " << args["-checkpointMode"]); + } + list pluginString; for (auto& p : plugins) { pluginString.emplace_back(p.first); @@ -1695,14 +1700,14 @@ void BedrockServer::_status(unique_ptr& command) { size_t totalCount = 0; for (const auto& s : _crashCommands) { totalCount += s.second.size(); - + vector paramsArray; for (const STable& params : s.second) { if (!params.empty()) { paramsArray.push_back(SComposeJSONObject(params)); } } - + STable commandObject; commandObject[s.first] = SComposeJSONArray(paramsArray); crashCommandListArray.push_back(SComposeJSONObject(commandObject)); diff --git a/main.cpp b/main.cpp index 240a1da88..160f4262d 100644 --- a/main.cpp +++ b/main.cpp @@ -236,6 +236,7 @@ int main(int argc, char* argv[]) { << endl; cout << "-maxJournalSize <#commits> Number of commits to retain in the historical journal (default 1000000)" << endl; + cout << "-checkpointMode Accepts PASSIVE|FULL|RESTART|TRUNCATE, which is the value passed to https://www.sqlite.org/c3ref/wal_checkpoint_v2.html" << endl; cout << endl; cout << "Quick Start Tips:" << endl; cout << "-----------------" << endl; @@ -299,6 +300,7 @@ int main(int argc, char* argv[]) { SETDEFAULT("-maxJournalSize", "1000000"); SETDEFAULT("-queryLog", "queryLog.csv"); SETDEFAULT("-enableMultiWrite", "true"); + SETDEFAULT("-checkpointMode", "PASSIVE"); args["-plugins"] = SComposeList(loadPlugins(args)); diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 2a5f4e4e0..0f81a685b 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -223,7 +223,7 @@ void SQLite::commonConstructorInitialization(bool hctree) { } SQLite::SQLite(const string& filename, int cacheSize, int maxJournalSize, - int minJournalTables, int64_t mmapSizeGB, bool hctree) : + int minJournalTables, int64_t mmapSizeGB, bool hctree, const string& checkpointMode) : _filename(initializeFilename(filename)), _maxJournalSize(maxJournalSize), _db(initializeDB(_filename, mmapSizeGB, hctree)), @@ -231,7 +231,8 @@ SQLite::SQLite(const string& filename, int cacheSize, int maxJournalSize, _sharedData(initializeSharedData(_db, _filename, _journalNames, hctree)), _journalSize(initializeJournalSize(_db, _journalNames)), _cacheSize(cacheSize), - _mmapSizeGB(mmapSizeGB) + _mmapSizeGB(mmapSizeGB), + _checkpointMode(checkpointMode) { commonConstructorInitialization(hctree); } @@ -244,7 +245,8 @@ SQLite::SQLite(const SQLite& from) : _sharedData(from._sharedData), _journalSize(from._journalSize), _cacheSize(from._cacheSize), - _mmapSizeGB(from._mmapSizeGB) + _mmapSizeGB(from._mmapSizeGB), + _checkpointMode(from._checkpointMode) { // This can always pass "true" because the copy constructor does not need to set the DB to WAL2 mode, it would have been set in the object being copied. commonConstructorInitialization(true); @@ -819,11 +821,27 @@ int SQLite::commit(const string& description, function* preCheckpointCal if (_sharedData.outstandingFramesToCheckpoint) { auto start = STimeNow(); int framesCheckpointed = 0; - sqlite3_busy_timeout(_db, 120'000); // 2 minutes - sqlite3_wal_checkpoint_v2(_db, 0, SQLITE_CHECKPOINT_FULL, NULL, &framesCheckpointed); - sqlite3_busy_timeout(_db, 0); + + // We default to PASSIVE checkpoint everywhere as that has been the value proven to work fine for many years. + if (_checkpointMode != "PASSIVE") { + int checkpointMode = SQLITE_CHECKPOINT_PASSIVE; + if (_checkpointMode == "FULL") { + checkpointMode = SQLITE_CHECKPOINT_FULL; + } else if (_checkpointMode == "RESTART") { + checkpointMode = SQLITE_CHECKPOINT_RESTART; + } else if (_checkpointMode == "TRUNCATE") { + checkpointMode = SQLITE_CHECKPOINT_TRUNCATE; + } + // For non-passive checkpoints, we must set a busy timeout in order to wait on any readers. + // We set it to 2 minutes as the majority of transactions should take less than that. + sqlite3_busy_timeout(_db, 120'000); + sqlite3_wal_checkpoint_v2(_db, 0, checkpointMode, NULL, &framesCheckpointed); + sqlite3_busy_timeout(_db, 0); + } else { + sqlite3_wal_checkpoint_v2(_db, 0, SQLITE_CHECKPOINT_PASSIVE, NULL, &framesCheckpointed); + } auto end = STimeNow(); - SINFO("Checkpointed " << framesCheckpointed << " (total) frames of " << _sharedData.outstandingFramesToCheckpoint << " in " << (end - start) << "us."); + SINFO(_checkpointMode << " checkpoint complete with " << framesCheckpointed << " frames checkpointed of " << _sharedData.outstandingFramesToCheckpoint << " frames outstanding in " << (end - start) << "us."); // It might not actually be 0, but we'll just let sqlite tell us what it is next time _walHookCallback runs. _sharedData.outstandingFramesToCheckpoint = 0; diff --git a/sqlitecluster/SQLite.h b/sqlitecluster/SQLite.h index 648d0b1a7..e11e91e3c 100644 --- a/sqlitecluster/SQLite.h +++ b/sqlitecluster/SQLite.h @@ -57,7 +57,7 @@ class SQLite { // // mmapSizeGB: address space to use for memory-mapped IO, in GB. SQLite(const string& filename, int cacheSize, int maxJournalSize, int minJournalTables, - int64_t mmapSizeGB = 0, bool hctree = false); + int64_t mmapSizeGB = 0, bool hctree = false, const string& checkpointMode = "PASSIVE"); // This constructor is not exactly a copy constructor. It creates an other SQLite object based on the first except // with a *different* journal table. This avoids a lot of locking around creating structures that we know already @@ -529,4 +529,7 @@ class SQLite { // Set to true inside of a write query. bool _currentlyWriting{false}; + + // One of PASSIVE|FULL|RESTART|TRUNCATE, translated to corresponding values to be passed to sqlite3_wal_checkpoint_v2. + string _checkpointMode; }; diff --git a/sqlitecluster/SQLitePool.cpp b/sqlitecluster/SQLitePool.cpp index ca3d7a4ad..75d1a31f2 100644 --- a/sqlitecluster/SQLitePool.cpp +++ b/sqlitecluster/SQLitePool.cpp @@ -8,9 +8,10 @@ SQLitePool::SQLitePool(size_t maxDBs, int maxJournalSize, int minJournalTables, int64_t mmapSizeGB, - bool hctree) + bool hctree, + const string& checkpointMode) : _maxDBs(max(maxDBs, 1ul)), - _baseDB(filename, cacheSize, maxJournalSize, minJournalTables, mmapSizeGB, hctree), + _baseDB(filename, cacheSize, maxJournalSize, minJournalTables, mmapSizeGB, hctree, checkpointMode), _objects(_maxDBs, nullptr) { } diff --git a/sqlitecluster/SQLitePool.h b/sqlitecluster/SQLitePool.h index 8cbc6c92e..cf12e38e2 100644 --- a/sqlitecluster/SQLitePool.h +++ b/sqlitecluster/SQLitePool.h @@ -7,7 +7,7 @@ class SQLitePool { public: // Create a pool of DB handles. SQLitePool(size_t maxDBs, const string& filename, int cacheSize, int maxJournalSize, int minJournalTables, - int64_t mmapSizeGB = 0, bool hctree = false); + int64_t mmapSizeGB = 0, bool hctree = false, const string& checkpointMode = "PASSIVE"); ~SQLitePool(); // Get the base object (the first one created, which uses the `journal` table). Note that if called by multiple From 211d274912f13ca237a0b9dc9229308c75a198a8 Mon Sep 17 00:00:00 2001 From: Daniel Silva Date: Mon, 16 Dec 2024 17:19:16 -0300 Subject: [PATCH 07/27] added metrics to read and write --- sqlitecluster/SQLite.cpp | 17 ++++++++++------- sqlitecluster/SQLite.h | 4 +++- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 1d2fd946a..891f3430c 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -416,7 +416,8 @@ bool SQLite::beginTransaction(TRANSACTION_TYPE type) { _dbCountAtStart = getCommitCount(); _queryCache.clear(); _tablesUsed.clear(); - _queryCount = 0; + _readQueryCount = 0; + _writeQueryCount = 0; _cacheHits = 0; _beginElapsed = STimeNow() - before; _readElapsed = 0; @@ -512,7 +513,7 @@ string SQLite::read(const string& query) const { bool SQLite::read(const string& query, SQResult& result, bool skipInfoWarn) const { uint64_t before = STimeNow(); bool queryResult = false; - _queryCount++; + _readQueryCount++; auto foundQuery = _queryCache.find(query); if (foundQuery != _queryCache.end()) { result = foundQuery->second; @@ -600,7 +601,7 @@ bool SQLite::writeUnmodified(const string& query) { bool SQLite::_writeIdempotent(const string& query, SQResult& result, bool alwaysKeepQueries) { SASSERT(_insideTransaction); _queryCache.clear(); - _queryCount++; + _writeQueryCount++; // Must finish everything with semicolon. SASSERT(query.empty() || SEndsWith(query, ";")); @@ -829,9 +830,10 @@ int SQLite::commit(const string& description, function* preCheckpointCal _sharedData.checkpointInProgress.clear(); } SINFO(description << " COMMIT " << SToStr(_sharedData.commitCount) << " complete in " << time << ". Wrote " << (endPages - startPages) - << " pages. WAL file size is " << sz << " bytes. " << _queryCount << " queries attempted, " << _cacheHits + << " pages. WAL file size is " << sz << " bytes. " << _readQueryCount << " read queries attempted, " << _writeQueryCount << " write queries attempted, " << _cacheHits << " served from cache. Used journal " << _journalName); - _queryCount = 0; + _readQueryCount = 0; + _writeQueryCount = 0; _cacheHits = 0; _dbCountAtStart = 0; _lastConflictPage = 0; @@ -886,8 +888,9 @@ void SQLite::rollback() { SINFO("Rolling back but not inside transaction, ignoring."); } _queryCache.clear(); - SDEBUG("Transaction rollback with " << _queryCount << " queries attempted, " << _cacheHits << " served from cache."); - _queryCount = 0; + SINFO("[performance] Transaction rollback with " << _readQueryCount << " read queries attempted, " << _writeQueryCount << " write queries attempted, " << _cacheHits << " served from cache."); + _readQueryCount = 0; + _writeQueryCount = 0; _cacheHits = 0; _dbCountAtStart = 0; } diff --git a/sqlitecluster/SQLite.h b/sqlitecluster/SQLite.h index 648d0b1a7..d19f858be 100644 --- a/sqlitecluster/SQLite.h +++ b/sqlitecluster/SQLite.h @@ -509,7 +509,9 @@ class SQLite { set _tablesUsed; // Number of queries that have been attempted in this transaction (for metrics only). - mutable int64_t _queryCount = 0; + mutable int64_t _readQueryCount = 0; + + mutable int64_t _writeQueryCount = 0; // Number of queries found in cache in this transaction (for metrics only). mutable int64_t _cacheHits = 0; From 7377779ce973fbdf2b5913434f53d489c441cb2f Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 12:41:26 -0800 Subject: [PATCH 08/27] Code review feedback --- sqlitecluster/SQLite.cpp | 49 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 49e89c0b1..59b1f2757 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -645,29 +645,30 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { const int64_t journalID = _sharedData.nextJournalCount++; _journalName = _journalNames[journalID % _journalNames.size()]; - // It's possible to attempt to commit a transaction with no writes. We'll skip truncating the journal in this case to avoid - // Turning a no-op into a write. - if (_uncommittedQuery.size()) { - // Look up the oldest commit in our chosen journal, and compute the oldest commit we intend to keep. - SQResult result; - SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) FROM " + _journalName, result)); - uint64_t minJournalEntry = result.size() ? SToUInt64(result[0][0]) : 0; - uint64_t commitCount = _sharedData.commitCount; - - // If the commitCount is less than the max journal size, keep everything. Otherwise, keep everything from - // commitCount - _maxJournalSize forward. We can't just do the last subtraction part because it overflows our unsigned - // int. - uint64_t oldestCommitToKeep = commitCount < _maxJournalSize ? 0 : commitCount - _maxJournalSize; - - // We limit deletions to a relatively small number to avoid making this extremenly slow for some transactions in the case - // where this journal in particular has accumulated a large backlog. - static const size_t deleteLimit = 10; - if (minJournalEntry < oldestCommitToKeep) { - string query = "DELETE FROM " + _journalName + " WHERE id < " + SQ(oldestCommitToKeep) + " LIMIT " + SQ(deleteLimit); - SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); - size_t deletedCount = sqlite3_changes(_db); - SINFO("Removed " << deletedCount << " rows from journal " << _journalName << ", oldestToKeep: " << oldestCommitToKeep << ", count:" << commitCount << ", limit: " << _maxJournalSize); - } + // Look up the oldest commit in our chosen journal, and compute the oldest commit we intend to keep. + SQResult journalLookupResult; + SASSERT(!SQuery(_db, "getting commit min", "SELECT MIN(id) FROM " + _journalName, journalLookupResult)); + uint64_t minJournalEntry = journalLookupResult.size() ? SToUInt64(journalLookupResult[0][0]) : 0; + + // Note that this can change before we hold the lock on _sharedData.commitLock, but it doesn't matter yet, as we're only + // using it to truncate the journal. We'll reset this value once we acquire that lock. + uint64_t commitCount = _sharedData.commitCount; + + // If the commitCount is less than the max journal size, keep everything. Otherwise, keep everything from + // commitCount - _maxJournalSize forward. We can't just do the last subtraction part because it overflows our unsigned + // int. + uint64_t oldestCommitToKeep = commitCount < _maxJournalSize ? 0 : commitCount - _maxJournalSize; + + // We limit deletions to a relatively small number to avoid making this extremely slow for some transactions in the case + // where this journal in particular has accumulated a large backlog. + static const size_t deleteLimit = 10; + if (minJournalEntry < oldestCommitToKeep) { + auto startUS = STimeNow(); + string query = "DELETE FROM " + _journalName + " WHERE id < " + SQ(oldestCommitToKeep) + " LIMIT " + SQ(deleteLimit); + SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); + size_t deletedCount = sqlite3_changes(_db); + SINFO("Removed " << deletedCount << " rows from journal " << _journalName << ", oldestToKeep: " << oldestCommitToKeep << ", count:" + << commitCount << ", limit: " << _maxJournalSize << ", in " << (STimeNow() - startUS) << "us."); } // We lock this here, so that we can guarantee the order in which commits show up in the database. @@ -691,7 +692,7 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { // Now that we've locked anybody else from committing, look up the state of the database. We don't need to lock the // SharedData object to get these values as we know it can't currently change. - uint64_t commitCount = _sharedData.commitCount; + commitCount = _sharedData.commitCount; // Queue up the journal entry string lastCommittedHash = getCommittedHash(); // This is why we need the lock. From 451441baf934dcf271c0fdaa1ac1652dabad3b6d Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Mon, 16 Dec 2024 16:50:31 -0400 Subject: [PATCH 09/27] Store _checkpointMode as int --- BedrockServer.cpp | 5 ----- main.cpp | 2 ++ sqlitecluster/SQLite.cpp | 38 +++++++++++++++++++++----------------- sqlitecluster/SQLite.h | 5 +++-- 4 files changed, 26 insertions(+), 24 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index cbe7d7e9a..be9297eab 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -1267,11 +1267,6 @@ BedrockServer::BedrockServer(const SData& args_) sort(versions.begin(), versions.end()); _version = SComposeList(versions, ":"); - const set validCheckpointModes = {"PASSIVE", "FULL", "RESTART", "TRUNCATE"}; - if (validCheckpointModes.find(args["-checkpointMode"]) == validCheckpointModes.end()) { - SERROR("Invalid checkpoint mode " << args["-checkpointMode"]); - } - list pluginString; for (auto& p : plugins) { pluginString.emplace_back(p.first); diff --git a/main.cpp b/main.cpp index 160f4262d..e318e70ca 100644 --- a/main.cpp +++ b/main.cpp @@ -300,6 +300,8 @@ int main(int argc, char* argv[]) { SETDEFAULT("-maxJournalSize", "1000000"); SETDEFAULT("-queryLog", "queryLog.csv"); SETDEFAULT("-enableMultiWrite", "true"); + + // We default to PASSIVE checkpoint everywhere as that has been the value proven to work fine for many years. SETDEFAULT("-checkpointMode", "PASSIVE"); args["-plugins"] = SComposeList(loadPlugins(args)); diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 0f81a685b..3a8d0f9c2 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -232,7 +232,7 @@ SQLite::SQLite(const string& filename, int cacheSize, int maxJournalSize, _journalSize(initializeJournalSize(_db, _journalNames)), _cacheSize(cacheSize), _mmapSizeGB(mmapSizeGB), - _checkpointMode(checkpointMode) + _checkpointMode(getCheckpointModeFromString(checkpointMode)) { commonConstructorInitialization(hctree); } @@ -822,24 +822,12 @@ int SQLite::commit(const string& description, function* preCheckpointCal auto start = STimeNow(); int framesCheckpointed = 0; - // We default to PASSIVE checkpoint everywhere as that has been the value proven to work fine for many years. - if (_checkpointMode != "PASSIVE") { - int checkpointMode = SQLITE_CHECKPOINT_PASSIVE; - if (_checkpointMode == "FULL") { - checkpointMode = SQLITE_CHECKPOINT_FULL; - } else if (_checkpointMode == "RESTART") { - checkpointMode = SQLITE_CHECKPOINT_RESTART; - } else if (_checkpointMode == "TRUNCATE") { - checkpointMode = SQLITE_CHECKPOINT_TRUNCATE; - } - // For non-passive checkpoints, we must set a busy timeout in order to wait on any readers. - // We set it to 2 minutes as the majority of transactions should take less than that. + // For non-passive checkpoints, we must set a busy timeout in order to wait on any readers. + // We set it to 2 minutes as the majority of transactions should take less than that. + if (_checkpointMode != SQLITE_CHECKPOINT_PASSIVE) { sqlite3_busy_timeout(_db, 120'000); - sqlite3_wal_checkpoint_v2(_db, 0, checkpointMode, NULL, &framesCheckpointed); - sqlite3_busy_timeout(_db, 0); - } else { - sqlite3_wal_checkpoint_v2(_db, 0, SQLITE_CHECKPOINT_PASSIVE, NULL, &framesCheckpointed); } + sqlite3_wal_checkpoint_v2(_db, 0, _checkpointMode, NULL, &framesCheckpointed); auto end = STimeNow(); SINFO(_checkpointMode << " checkpoint complete with " << framesCheckpointed << " frames checkpointed of " << _sharedData.outstandingFramesToCheckpoint << " frames outstanding in " << (end - start) << "us."); @@ -865,6 +853,22 @@ int SQLite::commit(const string& description, function* preCheckpointCal return result; } +int SQLite::getCheckpointModeFromString(const string& checkpointModeString) { + if (checkpointModeString == "PASSIVE") { + return SQLITE_CHECKPOINT_PASSIVE; + } + if (checkpointModeString == "FULL") { + return SQLITE_CHECKPOINT_FULL; + } + if (checkpointModeString == "RESTART") { + return SQLITE_CHECKPOINT_RESTART; + } + if (checkpointModeString == "TRUNCATE") { + return SQLITE_CHECKPOINT_TRUNCATE; + } + SERROR("Invalid checkpoint type: " << checkpointModeString); +} + map> SQLite::popCommittedTransactions() { return _sharedData.popCommittedTransactions(); } diff --git a/sqlitecluster/SQLite.h b/sqlitecluster/SQLite.h index e11e91e3c..81254f2c2 100644 --- a/sqlitecluster/SQLite.h +++ b/sqlitecluster/SQLite.h @@ -356,6 +356,7 @@ class SQLite { static vector initializeJournal(sqlite3* db, int minJournalTables); static uint64_t initializeJournalSize(sqlite3* db, const vector& journalNames); void commonConstructorInitialization(bool hctree = false); + static int getCheckpointModeFromString(const string& checkpointModeString); // The filename of this DB, canonicalized to its full path on disk. const string _filename; @@ -530,6 +531,6 @@ class SQLite { // Set to true inside of a write query. bool _currentlyWriting{false}; - // One of PASSIVE|FULL|RESTART|TRUNCATE, translated to corresponding values to be passed to sqlite3_wal_checkpoint_v2. - string _checkpointMode; + // One of 0|1|2|3 (a.k.a. PASSIVE|FULL|RESTART|TRUNCATE), which is the value to be passed to sqlite3_wal_checkpoint_v2. + int _checkpointMode; }; From 69389b56920dcc8473ed51652c23f3ba78093027 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 15:17:41 -0800 Subject: [PATCH 10/27] Attempt to figure out where it gets stuck --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 77c36ec94..ca0615581 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 8 +./clustertest -threads 1 cd ../.. mark_fold end test_bedrock_cluster From d0dfdea3bb8493079cde26fa6ffb75c25594c79e Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 15:49:32 -0800 Subject: [PATCH 11/27] I think ConflictSpam breaks --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index ca0615581..1c9290653 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 +./clustertest -threads 1 -only ConflictSpam cd ../.. mark_fold end test_bedrock_cluster From 4cf5fe79f4d2f1013e1d867e12bdc7889c143910 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 17:02:12 -0800 Subject: [PATCH 12/27] Binary search the tests, first half. --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 1c9290653..34eddbc1a 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only ConflictSpam +./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam,ControlCommand,DoubleDetach,Escalate,FastStandDown,FinishJob,ForkCheck,FutureExecution,GracefulFailover,HTTPS,JobID cd ../.. mark_fold end test_bedrock_cluster From 975de1ad57046def806b4dc85f0686ede6e6149f Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 17:19:08 -0800 Subject: [PATCH 13/27] Binary search the tests, first quarter. --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 34eddbc1a..dbf645fd8 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam,ControlCommand,DoubleDetach,Escalate,FastStandDown,FinishJob,ForkCheck,FutureExecution,GracefulFailover,HTTPS,JobID +./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam,ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From ea89fed5a6599605f32b193446f00363fadf1917 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 17:36:16 -0800 Subject: [PATCH 14/27] Binary search the tests, first 1/8th. --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index dbf645fd8..304009bee 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam,ControlCommand,DoubleDetach,Escalate +./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam cd ../.. mark_fold end test_bedrock_cluster From 9310e263d8fb01c7bd15824703ae9d9eece24f65 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 17:43:46 -0800 Subject: [PATCH 15/27] Binary search the tests, second 1/8th. --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 304009bee..f9ca0cd99 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only BadCommand,BroadcastCommand,ClusterUpgrade,ConflictSpam +./clustertest -threads 1 -only ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From c7d39b053da22b113ef7d74d46d6969c069a5c33 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 17:55:30 -0800 Subject: [PATCH 16/27] Only control --- ci_tests.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index f9ca0cd99..a06f9b06d 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,8 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only ControlCommand,DoubleDetach,Escalate +./clustertest -threads 1 -only ControlCommand +# ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From c16c1859178c2289f40703bf2c36e23398b6e0be Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 18:04:31 -0800 Subject: [PATCH 17/27] Only DoubleDetach --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index a06f9b06d..caa9588eb 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only ControlCommand +./clustertest -threads 1 -only DoubleDetach # ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From aa1019992f6ca18babeb12cf75a327a36818577d Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 18:16:43 -0800 Subject: [PATCH 18/27] Only Escalate --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index caa9588eb..98bda9fff 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only DoubleDetach +./clustertest -threads 1 -only Escalate # ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From 42adccf6baaccc2940ee27b62f01fac96fa473ef Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 16 Dec 2024 19:02:03 -0800 Subject: [PATCH 19/27] Except DoubleDetach --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 98bda9fff..42ded1ee4 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only Escalate +./clustertest -threads 1 -except DoubleDetach # ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From b8fae0c95a93a7096ea27b3496e03fd3c750d166 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 07:52:30 -0800 Subject: [PATCH 20/27] Add cout notes --- test/clustertest/tests/DoubleDetachTest.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/clustertest/tests/DoubleDetachTest.cpp b/test/clustertest/tests/DoubleDetachTest.cpp index f7d5b8b8a..9ac06dfd2 100644 --- a/test/clustertest/tests/DoubleDetachTest.cpp +++ b/test/clustertest/tests/DoubleDetachTest.cpp @@ -23,16 +23,22 @@ struct DoubleDetachTest : tpunit::TestFixture { void testDoubleDetach() { // Test a control command + cout << "A" << endl; BedrockTester& follower = tester->getTester(1); // Detach + cout << "B" << endl; SData detachCommand("detach"); + cout << "C" << endl; follower.executeWaitVerifyContent(detachCommand, "203 DETACHING", true); // Wait for it to detach + cout << "D" << endl; sleep(3); + cout << "E" << endl; follower.executeWaitVerifyContent(detachCommand, "400 Already detached", true); + cout << "F" << endl; } } __DoubleDetachTest; From e1e7c0d474a64cc719e3776e511543b0d21f9bab Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 08:20:43 -0800 Subject: [PATCH 21/27] Needs only, not except --- ci_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci_tests.sh b/ci_tests.sh index 42ded1ee4..caa9588eb 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,7 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -except DoubleDetach +./clustertest -threads 1 -only DoubleDetach # ControlCommand,DoubleDetach,Escalate cd ../.. mark_fold end test_bedrock_cluster From 3df5fdfe869919ea2f5b7b68f5368bc8dc6c7cb9 Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Tue, 17 Dec 2024 12:37:25 -0400 Subject: [PATCH 22/27] Move sqlite3_busy_timeout to commonConstructorInitialization --- sqlitecluster/SQLite.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 3a8d0f9c2..3a831e409 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -220,6 +220,12 @@ void SQLite::commonConstructorInitialization(bool hctree) { // Always set synchronous commits to off for best commit performance in WAL mode. SASSERT(!SQuery(_db, "setting synchronous commits to off", "PRAGMA synchronous = OFF;")); + + // For non-passive checkpoints, we must set a busy timeout in order to wait on any readers. + // We set it to 2 minutes as the majority of transactions should take less than that. + if (_checkpointMode != SQLITE_CHECKPOINT_PASSIVE) { + sqlite3_busy_timeout(_db, 120'000); + } } SQLite::SQLite(const string& filename, int cacheSize, int maxJournalSize, @@ -821,15 +827,9 @@ int SQLite::commit(const string& description, function* preCheckpointCal if (_sharedData.outstandingFramesToCheckpoint) { auto start = STimeNow(); int framesCheckpointed = 0; - - // For non-passive checkpoints, we must set a busy timeout in order to wait on any readers. - // We set it to 2 minutes as the majority of transactions should take less than that. - if (_checkpointMode != SQLITE_CHECKPOINT_PASSIVE) { - sqlite3_busy_timeout(_db, 120'000); - } sqlite3_wal_checkpoint_v2(_db, 0, _checkpointMode, NULL, &framesCheckpointed); auto end = STimeNow(); - SINFO(_checkpointMode << " checkpoint complete with " << framesCheckpointed << " frames checkpointed of " << _sharedData.outstandingFramesToCheckpoint << " frames outstanding in " << (end - start) << "us."); + SINFO("Checkpoint with type=" << _checkpointMode << " complete with " << framesCheckpointed << " frames checkpointed of " << _sharedData.outstandingFramesToCheckpoint << " frames outstanding in " << (end - start) << "us."); // It might not actually be 0, but we'll just let sqlite tell us what it is next time _walHookCallback runs. _sharedData.outstandingFramesToCheckpoint = 0; From 64263b28b1528c0651f1569bbccba0ac646a549c Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 09:08:16 -0800 Subject: [PATCH 23/27] Attemot to shutdown cleanly --- test/clustertest/tests/DoubleDetachTest.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/clustertest/tests/DoubleDetachTest.cpp b/test/clustertest/tests/DoubleDetachTest.cpp index 9ac06dfd2..71b93e140 100644 --- a/test/clustertest/tests/DoubleDetachTest.cpp +++ b/test/clustertest/tests/DoubleDetachTest.cpp @@ -28,7 +28,7 @@ struct DoubleDetachTest : tpunit::TestFixture { // Detach cout << "B" << endl; - SData detachCommand("detach"); + SData detachCommand("Detach"); cout << "C" << endl; follower.executeWaitVerifyContent(detachCommand, "203 DETACHING", true); @@ -39,6 +39,11 @@ struct DoubleDetachTest : tpunit::TestFixture { cout << "E" << endl; follower.executeWaitVerifyContent(detachCommand, "400 Already detached", true); cout << "F" << endl; + + // Re-attach to make shutdown clean. + SData attachCommand("Attach"); + follower.executeWaitVerifyContent(attachCommand, "204 ATTACHING", true); + cout << "G" << endl; } } __DoubleDetachTest; From 92644ade49c73a26afafbb8eab118247a38155fe Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 09:15:33 -0800 Subject: [PATCH 24/27] Remove test changes --- ci_tests.sh | 3 +-- test/clustertest/tests/DoubleDetachTest.cpp | 7 ------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/ci_tests.sh b/ci_tests.sh index caa9588eb..77c36ec94 100755 --- a/ci_tests.sh +++ b/ci_tests.sh @@ -48,8 +48,7 @@ mark_fold end test_bedrock mark_fold start test_bedrock_cluster cd test/clustertest -./clustertest -threads 1 -only DoubleDetach -# ControlCommand,DoubleDetach,Escalate +./clustertest -threads 8 cd ../.. mark_fold end test_bedrock_cluster diff --git a/test/clustertest/tests/DoubleDetachTest.cpp b/test/clustertest/tests/DoubleDetachTest.cpp index 71b93e140..db1edf34f 100644 --- a/test/clustertest/tests/DoubleDetachTest.cpp +++ b/test/clustertest/tests/DoubleDetachTest.cpp @@ -23,27 +23,20 @@ struct DoubleDetachTest : tpunit::TestFixture { void testDoubleDetach() { // Test a control command - cout << "A" << endl; BedrockTester& follower = tester->getTester(1); // Detach - cout << "B" << endl; SData detachCommand("Detach"); - cout << "C" << endl; follower.executeWaitVerifyContent(detachCommand, "203 DETACHING", true); // Wait for it to detach - cout << "D" << endl; sleep(3); - cout << "E" << endl; follower.executeWaitVerifyContent(detachCommand, "400 Already detached", true); - cout << "F" << endl; // Re-attach to make shutdown clean. SData attachCommand("Attach"); follower.executeWaitVerifyContent(attachCommand, "204 ATTACHING", true); - cout << "G" << endl; } } __DoubleDetachTest; From 9da6394f2fce5d87ff7883030e69846a0c2d0da3 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 12:38:55 -0800 Subject: [PATCH 25/27] Add log lines --- BedrockServer.cpp | 7 +++++++ sqlitecluster/SQLite.cpp | 4 ++++ 2 files changed, 11 insertions(+) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 98e59414f..a31c7f40e 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -1823,6 +1823,7 @@ atomic __quiesceShouldUnlock(false); thread* __quiesceThread = nullptr; void BedrockServer::_control(unique_ptr& command) { + SINFO("Received control command: " << command->request.methodLine); SData& response = command->response; string reason = "MANUAL"; response.methodLine = "200 OK"; @@ -1913,7 +1914,9 @@ void BedrockServer::_control(unique_ptr& command) { if (dbPoolCopy) { SQLiteScopedHandle dbScope(*_dbPool, _dbPool->getIndex()); SQLite& db = dbScope.db(); + SINFO("[quiesce] Exclusive locking DB"); db.exclusiveLockDB(); + SINFO("[quiesce] Exclusive locked DB"); locked = true; while (true) { if (__quiesceShouldUnlock) { @@ -1936,12 +1939,16 @@ void BedrockServer::_control(unique_ptr& command) { response.methodLine = "200 Blocked"; } } else if (SIEquals(command->request.methodLine, "UnblockWrites")) { + SINFO("[quiesce] Locking __quiesceLock"); lock_guard lock(__quiesceLock); + SINFO("[quiesce] __quiesceLock locked"); if (!__quiesceThread) { response.methodLine = "200 Not Blocked"; } else { __quiesceShouldUnlock = true; + SINFO("[quiesce] Joining __quiesceThread"); __quiesceThread->join(); + SINFO("[quiesce] __quiesceThread joined"); delete __quiesceThread; __quiesceThread = nullptr; response.methodLine = "200 Unblocked"; diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index f31888cd6..26d3b9a18 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -323,13 +323,17 @@ void SQLite::exclusiveLockDB() { // writes in this case. // So when these are both locked by the same thread at the same time, `commitLock` is always locked first, and we do it the same way here to avoid deadlocks. try { + SINFO("Locking commitLock"); _sharedData.commitLock.lock(); + SINFO("commitLock Locked"); } catch (const system_error& e) { SWARN("Caught system_error calling _sharedData.commitLock, code: " << e.code() << ", message: " << e.what()); throw; } try { + SINFO("Locking writeLock"); _sharedData.writeLock.lock(); + SINFO("writeLock Locked"); } catch(const system_error& e) { SWARN("Caught system_error calling _sharedData.writeLock, code: " << e.code() << ", message: " << e.what()); throw; From 259970629cf337b48f86189c7b50813c0dbb0c00 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 16:04:08 -0800 Subject: [PATCH 26/27] Add fix --- BedrockServer.cpp | 1 + sqlitecluster/SQLite.cpp | 7 +++++++ sqlitecluster/SQLiteNode.cpp | 7 ++++++- sqlitecluster/SQLiteNode.h | 8 ++++---- sqlitecluster/SQLitePeer.cpp | 1 + 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index a31c7f40e..c36298c04 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -1721,6 +1721,7 @@ void BedrockServer::_status(unique_ptr& command) { // Coalesce all of the peer data into one value to return or return // an error message if we timed out getting the peerList data. list peerList; + // This blocks during state change list peerData = getPeerInfo(); for (const STable& peerTable : peerData) { peerList.push_back(SComposeJSONObject(peerTable)); diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 26d3b9a18..1b55cfe7e 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -669,6 +669,7 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { static const size_t deleteLimit = 10; if (minJournalEntry < oldestCommitToKeep) { auto startUS = STimeNow(); + shared_lock lock(_sharedData.writeLock); string query = "DELETE FROM " + _journalName + " WHERE id < " + SQ(oldestCommitToKeep) + " LIMIT " + SQ(deleteLimit); SASSERT(!SQuery(_db, "Deleting oldest journal rows", query)); size_t deletedCount = sqlite3_changes(_db); @@ -676,6 +677,12 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { << commitCount << ", limit: " << _maxJournalSize << ", in " << (STimeNow() - startUS) << "us."); } + // So let's say that a replicate thread is running the above. Neither the write or commit lock is held. + + // Let's say another thread calls `exclusiveLockDB`. It grabs the commit lock. + // We grab the write lock above. Other thread waits. We release writeLock, it acquires it. Writes are now blocked. + // We attempot to grab commitLock below. We block. + // We lock this here, so that we can guarantee the order in which commits show up in the database. if (!_mutexLocked) { auto start = STimeNow(); diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 17db479ba..480b6f3f1 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -493,7 +493,12 @@ void SQLiteNode::_sendOutstandingTransactions(const set& commitOnlyIDs } list SQLiteNode::getPeerInfo() const { - shared_lock sharedLock(_stateMutex); + // This does not lock _stateMutex. It follows the rule in `SQLiteNode.h` that says: + // * Alternatively, a public `const` method that is a simple getter for an atomic property can skip the lock. + // peer->getData is atomic internally, so we can treat `peer->getData()` as a simple getter for an atomic property. + // _peerList is also `const` and so we can iterate this list safely regardless of the lock. + // This makes this function a slightly more complex getter for an atomic property, but it's still safe to skip + // The state lock here. list peerData; for (SQLitePeer* peer : _peerList) { peerData.emplace_back(peer->getData()); diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index fd4ba5f6f..06868daae 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -25,16 +25,16 @@ * Rules for maintaining SQLiteNode methods so that atomicity works as intended. * * No non-const members should be publicly exposed. - * Any public method that is `const` must shared_lock<>(nodeMutex). + * Any public method that is `const` must shared_lock<>(_stateMutex). * Alternatively, a public `const` method that is a simple getter for an atomic property can skip the lock. - * Any public method that is non-const must unique_lock<>(nodeMutex) before changing any internal state, and must hold + * Any public method that is non-const must unique_lock<>(_stateMutex) before changing any internal state, and must hold * this lock until it is done changing state to make this method's changes atomic. * Any private methods must not call public methods. - * Any private methods must not lock nodeMutex (for recursion reasons). + * Any private methods must not lock _stateMutex (for recursion reasons). * Any public methods must not call other public methods. * * `_replicate` is a special exception because it runs in multiple threads internally. It needs to handle locking if it - * changes any internal state (and it calls `changeState`, which does). + * changes any internal state (and it calls `changeState`, which it does). * */ diff --git a/sqlitecluster/SQLitePeer.cpp b/sqlitecluster/SQLitePeer.cpp index c93589520..a9636ca1d 100644 --- a/sqlitecluster/SQLitePeer.cpp +++ b/sqlitecluster/SQLitePeer.cpp @@ -225,6 +225,7 @@ void SQLitePeer::getCommit(uint64_t& count, string& hashString) const { } STable SQLitePeer::getData() const { + lock_guard lock(peerMutex); // Add all of our standard stuff. STable result({ {"name", name}, From fe86e09155b000cb4207d93cc956d74dbf4aa88b Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 17 Dec 2024 16:07:26 -0800 Subject: [PATCH 27/27] Remove test code --- BedrockServer.cpp | 1 - sqlitecluster/SQLite.cpp | 6 ------ sqlitecluster/SQLiteNode.h | 2 +- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 45fdcda4a..c0cad65f3 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -1721,7 +1721,6 @@ void BedrockServer::_status(unique_ptr& command) { // Coalesce all of the peer data into one value to return or return // an error message if we timed out getting the peerList data. list peerList; - // This blocks during state change list peerData = getPeerInfo(); for (const STable& peerTable : peerData) { peerList.push_back(SComposeJSONObject(peerTable)); diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 39cf31044..80a4636fc 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -685,12 +685,6 @@ bool SQLite::prepare(uint64_t* transactionID, string* transactionhash) { << commitCount << ", limit: " << _maxJournalSize << ", in " << (STimeNow() - startUS) << "us."); } - // So let's say that a replicate thread is running the above. Neither the write or commit lock is held. - - // Let's say another thread calls `exclusiveLockDB`. It grabs the commit lock. - // We grab the write lock above. Other thread waits. We release writeLock, it acquires it. Writes are now blocked. - // We attempot to grab commitLock below. We block. - // We lock this here, so that we can guarantee the order in which commits show up in the database. if (!_mutexLocked) { auto start = STimeNow(); diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index 06868daae..817a4e9b9 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -34,7 +34,7 @@ * Any public methods must not call other public methods. * * `_replicate` is a special exception because it runs in multiple threads internally. It needs to handle locking if it - * changes any internal state (and it calls `changeState`, which it does). + * changes any internal state (and it calls `changeState`, which does). * */