diff --git a/.gitignore b/.gitignore index 47d30c595..367c42837 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ libstuff/libstuff.h.gch .nfs* .cache compile_commands.json +.vscode/* diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 0bf9fe777..544d3e5b2 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -778,6 +778,7 @@ void BedrockServer::runCommand(unique_ptr&& _command, bool isBlo } int64_t lastConflictPage = 0; + string lastConflictTable; while (true) { // We just spin until the node looks ready to go. Typically, this doesn't happen expect briefly at startup. @@ -1021,7 +1022,14 @@ void BedrockServer::runCommand(unique_ptr&& _command, bool isBlo } else { SINFO("Conflict or state change committing " << command->request.methodLine << " on worker thread."); if (_enableConflictPageLocks) { - lastConflictPage = db.getLastConflictPage(); + lastConflictTable = db.getLastConflictTable(); + + // Journals are always chosen at the time of commit. So in case there was a conflict on the journal in + // the previous commit, the chances are very low (1/192) that we'll choose the same journal, thus, we + // don't need to lock our next commit on this page conflict. + if (!SStartsWith(lastConflictTable, "journal")) { + lastConflictPage = db.getLastConflictPage(); + } } } } else if (result == BedrockCore::RESULT::NO_COMMIT_REQUIRED) { diff --git a/sqlitecluster/SQLite.cpp b/sqlitecluster/SQLite.cpp index 7fc77fbfc..717d188ca 100644 --- a/sqlitecluster/SQLite.cpp +++ b/sqlitecluster/SQLite.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -17,6 +18,7 @@ sqlite3* SQLite::getDBHandle() { thread_local string SQLite::_mostRecentSQLiteErrorLog; thread_local int64_t SQLite::_conflictPage; +thread_local string SQLite::_conflictTable; const string SQLite::getMostRecentSQLiteErrorLog() const { return _mostRecentSQLiteErrorLog; @@ -282,8 +284,27 @@ void SQLite::_sqliteLogCallback(void* pArg, int iErrCode, const char* zMsg) { // This is sort of hacky to parse this from the logging info. If it works we could ask sqlite for a better interface to get this info. if (SStartsWith(zMsg, "cannot commit")) { // 17 is the length of "conflict at page" and the following space. - const char* offset = strstr(zMsg, "conflict at page") + 17; - _conflictPage = atol(offset); + const char* pageOffset = strstr(zMsg, "conflict at page") + 17; + _conflictPage = atol(pageOffset); + + // Check if the tableOffset exists since not all conflicts are on tables + const char* tableOffset = strstr(pageOffset, "part of db table"); + if (tableOffset) { + // 17 is the length of "part of db table" and the following space. + tableOffset += 17; + + // Based on the SQLite log line, we should always have ';' after the table name, + // so let's find it and use it to limit the size of the substring we need + const char* semicolonOffset = strstr(tableOffset, ";"); + + // Let's add this check in case the SQLite log changes and we don't notice it + // since this would generate a runtime error. + if (semicolonOffset) { + _conflictTable = string(tableOffset, semicolonOffset - tableOffset); + } else { + _conflictTable = string(tableOffset); + } + } } } @@ -401,6 +422,7 @@ bool SQLite::beginTransaction(TRANSACTION_TYPE type) { _commitElapsed = 0; _rollbackElapsed = 0; _lastConflictPage = 0; + _lastConflictTable = ""; return _insideTransaction; } @@ -726,12 +748,14 @@ int SQLite::commit(const string& description, function* preCheckpointCal sqlite3_db_status(_db, SQLITE_DBSTATUS_CACHE_WRITE, &startPages, &dummy, 0); _conflictPage = 0; + _conflictTable = ""; uint64_t before = STimeNow(); uint64_t beforeCommit = STimeNow(); result = SQuery(_db, "committing db transaction", "COMMIT"); _lastConflictPage = _conflictPage; + _lastConflictTable = _conflictTable; if (_lastConflictPage) { - SINFO("part of last conflict page: " << _lastConflictPage); + SINFO(format("part of last conflict page: {}, conflict table: {}", _conflictPage, _conflictTable)); } // If there were conflicting commits, will return SQLITE_BUSY_SNAPSHOT @@ -789,6 +813,7 @@ int SQLite::commit(const string& description, function* preCheckpointCal _cacheHits = 0; _dbCountAtStart = 0; _lastConflictPage = 0; + _lastConflictTable = ""; } else { SINFO("Commit failed, waiting for rollback."); } @@ -1136,6 +1161,10 @@ int64_t SQLite::getLastConflictPage() const { return _lastConflictPage; } +string SQLite::getLastConflictTable() const { + return _lastConflictTable; +} + SQLite::SharedData::SharedData() : nextJournalCount(0), _commitEnabled(true), diff --git a/sqlitecluster/SQLite.h b/sqlitecluster/SQLite.h index 1dc6d1ad6..246f4dd66 100644 --- a/sqlitecluster/SQLite.h +++ b/sqlitecluster/SQLite.h @@ -248,6 +248,8 @@ class SQLite { int64_t getLastConflictPage() const; + string getLastConflictTable() const; + // This is the callback function we use to log SQLite's internal errors. static void _sqliteLogCallback(void* pArg, int iErrCode, const char* zMsg); @@ -398,7 +400,9 @@ class SQLite { bool _mutexLocked = false; atomic _lastConflictPage = 0; + atomic _lastConflictTable; static thread_local int64_t _conflictPage; + static thread_local string _conflictTable; bool _writeIdempotent(const string& query, bool alwaysKeepQueries = false); diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 2070b657a..2003f57cc 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -592,6 +592,7 @@ bool SQLiteNode::update() { // How many peers have we logged in to? size_t numFullPeers = 0; size_t numLoggedInFullPeers = 0; + list loggedInPeers; SQLitePeer* freshestPeer = nullptr; for (const auto& peer : _peerList) { // Count how many full peers (non-permafollowers) we have, and how many are logged in. @@ -605,6 +606,7 @@ bool SQLiteNode::update() { // Find the freshest non-broken peer (including permafollowers). if (peer->loggedIn) { + loggedInPeers.push_back(peer->name); if (_forkedFrom.count(peer->name)) { SWARN("Hash mismatch. Forked from peer " << peer->name << " so not considering it." << _getLostQuorumLogMessage()); continue; @@ -617,7 +619,7 @@ bool SQLiteNode::update() { } } - SINFO("Signed in to " << numLoggedInFullPeers << " of " << numFullPeers << " full peers (plus " << (_peerList.size() - numFullPeers) << " permafollowers)."); + SINFO("Signed in to " << numLoggedInFullPeers << " of " << numFullPeers << " full peers (plus " << (_peerList.size() - numFullPeers) << " permafollowers): " << SComposeList(loggedInPeers)); // We just keep searching until we are connected to at least half the full peers. // Note that `numLoggedInFullPeers == numFullPeers` is adequate to satisfy the cluster size, because we do not include ourselves in the cluster size. @@ -1343,6 +1345,11 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->version = message["Version"]; peer->state = stateFromName(message["State"]); + // If the peer is already standing up, go ahead and approve or deny immediately. + if (peer->state == SQLiteNodeState::STANDINGUP) { + _sendStandupResponse(peer, message); + } + // Let the server know that a peer has logged in. _server.onNodeLogin(peer); } else if (!peer->loggedIn) { @@ -1418,97 +1425,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { peer->transactionResponse = SQLitePeer::Response::NONE; peer->subscribed = false; } else if (to == SQLiteNodeState::STANDINGUP) { - // STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or - // denial. We determine this by checking to see if there is any other peer who is already leader or - // also trying to stand up. - SData response("STANDUP_RESPONSE"); - - // Parrot back the node's attempt count so that it can differentiate stale responses. - response["StateChangeCount"] = message["StateChangeCount"]; - - // Reason we would deny, if we do. - if (peer->permaFollower) { - // We think it's a permafollower, deny - PHMMM("Permafollower trying to stand up, denying."); - response["Response"] = "deny"; - response["Reason"] = "You're a permafollower"; - _sendToPeer(peer, response); - return; - } - - if (_forkedFrom.count(peer->name)) { - PHMMM("Forked from peer, can't approve standup."); - response["Response"] = "abstain"; - response["Reason"] = "We are forked"; - _sendToPeer(peer, response); - return; - } - - // What's our state - if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) { - // Oh crap, it's trying to stand up while we're leading. Who is higher priority? - if (peer->priority > _priority) { - // The other peer is a higher priority than us, so we should stand down (maybe it crashed, we - // came up as leader, and now it's been brought back up). We'll want to stand down here, but we - // do it gracefully so that we won't lose any transactions in progress. - if (_state == SQLiteNodeState::STANDINGUP) { - PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING."); - _changeState(SQLiteNodeState::SEARCHING); - } else if (_state == SQLiteNodeState::LEADING) { - PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN."); - _changeState(SQLiteNodeState::STANDINGDOWN); - } else { - PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing."); - } - } else { - // Deny because we're currently in the process of leading and we're higher priority. - response["Response"] = "deny"; - response["Reason"] = "I am leading"; - - // Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in - // control of the cluster? Let's see how many nodes are subscribed. - if (_majoritySubscribed()) { - // we have a majority of the cluster, so ignore this oddity. - PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state) - << " with a majority of the cluster; denying and ignoring."); - } else { - // We don't have a majority of the cluster -- maybe it knows something we don't? For - // example, it could be that the rest of the cluster has forked away from us. This can - // happen if the leader hangs while processing a command: by the time it finishes, the - // cluster might have elected a new leader, forked, and be a thousand commits in the future. - // In this case, let's just reset everything anyway to be safe. - PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state) - << ", but we don't have a majority of the cluster so reconnecting and SEARCHING."); - _reconnectAll(); - // TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING, - // without going through the STANDDOWN process. We'll need to handle it better, but it's - // unclear if this can ever happen at all. exit() may be a reasonable strategy here. - _changeState(SQLiteNodeState::SEARCHING); - } - } - } else { - // Approve if nobody else is trying to stand up - response["Response"] = "approve"; // Optimistic; will override - for (auto otherPeer : _peerList) { - if (otherPeer != peer) { - // See if it's trying to be leader - if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) { - // We need to contest this standup - response["Response"] = "deny"; - response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'"; - break; - } - } - } - } - - // Send the response - if (SIEquals(response["Response"], "approve")) { - PINFO("Approving standup request"); - } else { - PHMMM("Not approving standup request because " << response["Reason"]); - } - _sendToPeer(peer, response); + _sendStandupResponse(peer, message); } else if (from == SQLiteNodeState::STANDINGDOWN) { // STANDINGDOWN: When a peer stands down we double-check to make sure we don't have any outstanding // transaction (and if we do, we warn and rollback). @@ -1603,7 +1520,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { uint64_t commitNum = SToUInt64(message["hashMismatchNumber"]); _db.getCommits(commitNum, commitNum, result); _forkedFrom.insert(peer->name); - + SALERT("Hash mismatch. Peer " << peer->name << " and I have forked at commit " << message["hashMismatchNumber"] << ". I have forked from " << _forkedFrom.size() << " other nodes. I am " << stateName(_state) << " and have hash " << result[0][0] << " for that commit. Peer has hash " << message["hashMismatchValue"] << "." @@ -1818,16 +1735,6 @@ void SQLiteNode::_onConnect(SQLitePeer* peer) { login["Version"] = _version; login["Permafollower"] = _originalPriority ? "false" : "true"; _sendToPeer(peer, login); - - // If we're STANDINGUP when a peer connects, send them a STATE message so they know they need to APPROVE or DENY the standup. - // Otherwise we will wait for their response that's not coming,and can eventually time out the standup. - if (_state == SQLiteNodeState::STANDINGUP) { - SData state("STATE"); - state["StateChangeCount"] = to_string(_stateChangeCount); - state["State"] = stateName(_state); - state["Priority"] = SToStr(_priority); - _sendToPeer(peer, state); - } } // -------------------------------------------------------------------------- @@ -2812,3 +2719,97 @@ string SQLiteNode::_getLostQuorumLogMessage() const { return lostQuorumMessage; } + +void SQLiteNode::_sendStandupResponse(SQLitePeer* peer, const SData& message) { + // STANDINGUP: When a peer announces it intends to stand up, we immediately respond with approval or + // denial. We determine this by checking to see if there is any other peer who is already leader or + // also trying to stand up. + SData response("STANDUP_RESPONSE"); + + // Parrot back the node's attempt count so that it can differentiate stale responses. + response["StateChangeCount"] = message["StateChangeCount"]; + + // Reason we would deny, if we do. + if (peer->permaFollower) { + // We think it's a permafollower, deny + PHMMM("Permafollower trying to stand up, denying."); + response["Response"] = "deny"; + response["Reason"] = "You're a permafollower"; + _sendToPeer(peer, response); + return; + } + + if (_forkedFrom.count(peer->name)) { + PHMMM("Forked from peer, can't approve standup."); + response["Response"] = "abstain"; + response["Reason"] = "We are forked"; + _sendToPeer(peer, response); + return; + } + + // What's our state + if (SWITHIN(SQLiteNodeState::STANDINGUP, _state, SQLiteNodeState::STANDINGDOWN)) { + // Oh crap, it's trying to stand up while we're leading. Who is higher priority? + if (peer->priority > _priority) { + // The other peer is a higher priority than us, so we should stand down (maybe it crashed, we + // came up as leader, and now it's been brought back up). We'll want to stand down here, but we + // do it gracefully so that we won't lose any transactions in progress. + if (_state == SQLiteNodeState::STANDINGUP) { + PWARN("Higher-priority peer is trying to stand up while we are STANDINGUP, SEARCHING."); + _changeState(SQLiteNodeState::SEARCHING); + } else if (_state == SQLiteNodeState::LEADING) { + PINFO("Higher-priority peer is trying to stand up while we are LEADING, STANDINGDOWN."); + _changeState(SQLiteNodeState::STANDINGDOWN); + } else { + PWARN("Higher-priority peer is trying to stand up while we are STANDINGDOWN, continuing."); + } + } else { + // Deny because we're currently in the process of leading and we're higher priority. + response["Response"] = "deny"; + response["Reason"] = "I am leading"; + + // Hmm, why is a lower priority peer trying to stand up? Is it possible we're no longer in + // control of the cluster? Let's see how many nodes are subscribed. + if (_majoritySubscribed()) { + // we have a majority of the cluster, so ignore this oddity. + PHMMM("Lower-priority peer is trying to stand up while we are " << stateName(_state) + << " with a majority of the cluster; denying and ignoring."); + } else { + // We don't have a majority of the cluster -- maybe it knows something we don't? For + // example, it could be that the rest of the cluster has forked away from us. This can + // happen if the leader hangs while processing a command: by the time it finishes, the + // cluster might have elected a new leader, forked, and be a thousand commits in the future. + // In this case, let's just reset everything anyway to be safe. + PWARN("Lower-priority peer is trying to stand up while we are " << stateName(_state) + << ", but we don't have a majority of the cluster so reconnecting and SEARCHING."); + _reconnectAll(); + // TODO: This puts us in an ambiguous state if we switch to SEARCHING from LEADING, + // without going through the STANDDOWN process. We'll need to handle it better, but it's + // unclear if this can ever happen at all. exit() may be a reasonable strategy here. + _changeState(SQLiteNodeState::SEARCHING); + } + } + } else { + // Approve if nobody else is trying to stand up + response["Response"] = "approve"; // Optimistic; will override + for (auto otherPeer : _peerList) { + if (otherPeer != peer) { + // See if it's trying to be leader + if (otherPeer->state == SQLiteNodeState::STANDINGUP || otherPeer->state == SQLiteNodeState::LEADING || otherPeer->state == SQLiteNodeState::STANDINGDOWN) { + // We need to contest this standup + response["Response"] = "deny"; + response["Reason"] = "peer '" + otherPeer->name + "' is '" + stateName(otherPeer->state) + "'"; + break; + } + } + } + } + + // Send the response + if (SIEquals(response["Response"], "approve")) { + PINFO("Approving standup request"); + } else { + PHMMM("Not approving standup request because " << response["Reason"]); + } + _sendToPeer(peer, response); +} diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index 5f9e6e3e0..c820d1569 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -267,6 +267,7 @@ class SQLiteNode : public STCPManager { // Replicates any transactions that have been made on our database by other threads to peers. void _sendOutstandingTransactions(const set& commitOnlyIDs = {}); + void _sendStandupResponse(SQLitePeer* peer, const SData& message); void _sendPING(SQLitePeer* peer); void _sendToAllPeers(const SData& message, bool subscribedOnly = false); void _sendToPeer(SQLitePeer* peer, const SData& message); diff --git a/test/clustertest/BedrockClusterTester.h b/test/clustertest/BedrockClusterTester.h index d8d11bc4a..fb8dbe541 100644 --- a/test/clustertest/BedrockClusterTester.h +++ b/test/clustertest/BedrockClusterTester.h @@ -129,6 +129,7 @@ ClusterTester::ClusterTester(ClusterSize size, {"-peerList", peerString}, {"-plugins", pluginsToLoad}, {"-overrideProcessName", "bedrock" + to_string(nodePort)}, + {"-enableConflictPageLocks", "true"}, }; // Now, if any args were supplied, we'll use those instead of these ones (note that this overwrites our