From a27c258f8933dcb7eaf333f91bbce247051d75ac Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 9 Dec 2024 16:57:55 -0800 Subject: [PATCH 01/10] A bunch of notes --- sqlitecluster/SQLiteNode.cpp | 26 +++++++++++++++++----- sqlitecluster/SQLiteSequentialNotifier.cpp | 8 ++++++- 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 5a75611cd..e789bde1f 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -187,6 +187,10 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn } _replicateStartCV.notify_all(); + if (_replicationThreadsShouldExit) { + SINFO("Late replicate start, just exiting."); + } + // Initialize each new thread with a new number. SInitialize("replicate" + to_string(currentReplicateThreadID.fetch_add(1))); @@ -214,12 +218,18 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn uint64_t waitForCount = SStartsWith(command["ID"], "ASYNC") ? command.calcU64("dbCountAtStart") : currentCount; SINFO("[performance] BEGIN_TRANSACTION replicate thread for commit " << newCount << " waiting on DB count " << waitForCount << " (" << (quorum ? "QUORUM" : "ASYNC") << ")"); while (true) { + // Ok, why doesn't this get counted? + // It's waiting on: commit 26056807239 waiting on DB count 26056807238 + // It seems like this should return immediately. + // Ok, is it possible we got past here and waited somewhere else? We were either stuck here, or... SQLiteSequentialNotifier::RESULT result = _localCommitNotifier.waitFor(waitForCount, false); + // My current inclination is that maybe we reset the commit notifier before the thread really starts. if (result == SQLiteSequentialNotifier::RESULT::UNKNOWN) { // This should be impossible. SERROR("Got UNKNOWN result from waitFor, which shouldn't happen"); } else if (result == SQLiteSequentialNotifier::RESULT::COMPLETED) { // Success case. + // If we didn't get stuck above, we must have hit here, because otherwise we would have logged. break; } else if (result == SQLiteSequentialNotifier::RESULT::CANCELED) { SINFO("_localCommitNotifier.waitFor canceled early, returning."); @@ -237,6 +247,7 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn if (commitAttemptCount > 1) { SINFO("Commit attempt number " << commitAttemptCount << " for concurrent replication."); } + // We never log this line, so we can't have gotten to here. SINFO("[performance] BEGIN for commit " << newCount); bool uniqueContraintsError = false; try { @@ -1640,14 +1651,20 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { throw e; } } else if (SIEquals(message.methodLine, "BEGIN_TRANSACTION") || SIEquals(message.methodLine, "COMMIT_TRANSACTION") || SIEquals(message.methodLine, "ROLLBACK_TRANSACTION")) { + // Race condition here. What if _replicationThreadsShouldExit changes after this check? if (_replicationThreadsShouldExit) { SINFO("Discarding replication message, stopping FOLLOWING"); } else { + // Ok, so the race condition could be here, right? + // Right this instance, the thread count is 0, so we can move past the check that + // Waits for it to be 0. + // But then this thread starts. Can that happen? auto threadID = _replicationThreadCount.fetch_add(1); SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID); try { uint64_t threadAttemptStartTimestamp = STimeNow(); _replicateThreadStarted = false; + // Either here. 
thread(&SQLiteNode::_replicate, this, peer, message, _dbPool->getIndex(false), threadAttemptStartTimestamp).detach(); { unique_lock lock(_replicateStartMutex); @@ -1917,12 +1934,8 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance // Polling wait for threads to quit. This could use a notification model such as with a condition_variable, // which would probably be "better" but introduces yet more state variables for a state that we're rarely // in, and so I've left it out for the time being. - size_t infoCount = 1; while (_replicationThreadCount) { - if (infoCount % 100 == 0) { - SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); - } - infoCount++; + SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); usleep(10'000); } @@ -1930,8 +1943,11 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance _replicationThreadsShouldExit = false; // Guaranteed to be done right now. + // I bet this is wrong when these get reset. If we no threads, these get reset before they check? + // That doesn't make sense to me for when we should increment _replicationThreadCount _localCommitNotifier.reset(); _leaderCommitNotifier.reset(); + // If the above completed, we should immediately see `Switching from... ` logged. // We have no leader anymore. _leadPeer = nullptr; diff --git a/sqlitecluster/SQLiteSequentialNotifier.cpp b/sqlitecluster/SQLiteSequentialNotifier.cpp index 2af9c731c..69d30f1ac 100644 --- a/sqlitecluster/SQLiteSequentialNotifier.cpp +++ b/sqlitecluster/SQLiteSequentialNotifier.cpp @@ -31,6 +31,8 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu SINFO("Canceled after " << _cancelAfter << ", but waiting for " << value << " so not returning yet."); } else { // Canceled and we're not before the cancellation cutoff. + // I don't see how we don't return here. Maybe we never acquire `waitingThreadMutex`? + SINFO("Returning canceled because _cancelAfter=" << _cancelAfter << " and value=" << value); return RESULT::CANCELED; } } else if (_globalResult != RESULT::UNKNOWN) { @@ -52,6 +54,8 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu // We should investigate any instances of thew below logline to see if they're same as for the success cases mentioned above (i.e., the timeout happens simultaneously as the // cancellation) or if the log line is delayed by up to a second (indicating a problem). if (_globalResult == RESULT::CANCELED || state->result == RESULT::CANCELED) { + // I bet removing a 1 second delay means we don't hit this. we get to calling `reset` on this sooner, possibly while there are threads waiting here. + // It's possible that we hit the timeout here after `cancel()` has set the global value, but before we received the notification. // This isn't a problem, and we can jump back to the top of the loop and check again. If there's some problem, we'll see it there. SINFO("Hit 1s timeout while global cancel " << (_globalResult == RESULT::CANCELED) << " or " << " specific cancel " << (state->result == RESULT::CANCELED)); @@ -114,10 +118,12 @@ void SQLiteSequentialNotifier::cancel(uint64_t cancelAfter) { auto& valueThreadMap = *valueThreadMapPtr; // If cancelAfter is specified, start from that value. Otherwise, we start from the beginning. auto start = _cancelAfter ? 
valueThreadMap.upper_bound(_cancelAfter) : valueThreadMap.begin(); - SINFO("[performance] Next value to cancel after " << cancelAfter << " is " << start->first); if (start == valueThreadMap.end()) { // There's nothing to remove. + SINFO("[performance] Next value to cancel after " << cancelAfter << " is N/A"); return; + } else { + SINFO("[performance] Next value to cancel after " << cancelAfter << " is " << start->first); } // Now iterate across whatever's remaining and mark it canceled. From d9ec8964aa99d695601c335dbc458e71053fb361 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Mon, 9 Dec 2024 17:47:49 -0800 Subject: [PATCH 02/10] More notes --- sqlitecluster/SQLiteNode.cpp | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index e789bde1f..920f03e10 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -199,6 +199,19 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn // Allow the DB handle to be returned regardless of how this function exits. SQLiteScopedHandle dbScope(*_dbPool, sqlitePoolIndex); + // In dev, we sometimes crash on the destructor for the above after `detach` which implies to me that the DB Pool could have been deleted before we exited. + // This is a different manifestation of what could be the same issue. + // IN dev, we get: + // 2024-12-09T21:12:45.030255+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:215) _replicate [replicate2065] [info] {cluster_node_4/SEARCHING} [performance] BEGIN_TRANSACTION replicate thread for commit 15003 waiting on DB count 15001 (ASYNC) + // 2024-12-09T21:12:49.023666+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1913) _changeState [sync] [info] {cluster_node_4/FOLLOWING} Replication threads should exit, canceling commits after current leader commit 0 + // 2024-12-09T21:12:49.030957+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:57) waitFor [replicate2065] [info] Hit 1s timeout while global cancel 1 or specific cancel 0 + // 2024-12-09T21:12:49.030963+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:225) _replicate [replicate2065] [info] {cluster_node_4/FOLLOWING} _localCommitNotifier.waitFor canceled early, returning. + // 2024-12-09T21:12:49.030977+00:00 expensidev2004 bedrock10013: xxxxxx (SSignal.cpp:193) _SSignal_StackTrace [replicate2065] [warn] Signal Segmentation fault(11) caused crash, logging stack trace. + // Why doesn't this get cancelled as well? + // I'm not sure why dev thinks leader has commit 0. + // 2024-12-09T21:12:49.023613+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1902) _changeState [sync] [info] {cluster_node_4/FOLLOWING} [NOTIFY] setting commit count to: 17001 + // 2024-12-09T21:12:49.023674+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:105) cancel [sync] [info] Canceling all pending transactions after 0 + SQLite& db = dbScope.db(); bool goSearchingOnExit = false; @@ -224,6 +237,10 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn // Ok, is it possible we got past here and waited somewhere else? We were either stuck here, or... SQLiteSequentialNotifier::RESULT result = _localCommitNotifier.waitFor(waitForCount, false); // My current inclination is that maybe we reset the commit notifier before the thread really starts. 
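To pin down the reset hypothesis above, here is a minimal compilable sketch of the waitFor/notify/reset shape in play. It is a deliberately simplified stand-in, not the real SQLiteSequentialNotifier (the real class tracks per-value waiter state, a global result, and a cancelAfter cutoff); the 1s re-check mirrors the "Hit 1s timeout" loglines quoted in these notes.

#include <chrono>
#include <condition_variable>
#include <cstdint>
#include <mutex>

class SequentialNotifierSketch {
  public:
    enum class Result { COMPLETED, CANCELED };

    // Block until notifyThrough() reaches `value`, or cancellation. The 1s
    // timeout forces a re-check even if a notification was missed, matching
    // the timeout-and-loop behavior in the real waitFor.
    Result waitFor(uint64_t value) {
        std::unique_lock<std::mutex> lock(_m);
        while (true) {
            if (_canceled) {
                return Result::CANCELED;
            }
            if (_notifiedThrough >= value) {
                return Result::COMPLETED;
            }
            _cv.wait_for(lock, std::chrono::seconds(1));
        }
    }

    void notifyThrough(uint64_t value) {
        std::lock_guard<std::mutex> lock(_m);
        if (value > _notifiedThrough) {
            _notifiedThrough = value;
        }
        _cv.notify_all();
    }

    void cancel() {
        std::lock_guard<std::mutex> lock(_m);
        _canceled = true;
        _cv.notify_all();
    }

    // The hazard the notes describe: run this while some thread sits in
    // waitFor(N), and that thread now needs the count to climb from 0 back to
    // N -- effectively "waiting for every commit from the beginning of time"
    // until something cancels it.
    void reset() {
        std::lock_guard<std::mutex> lock(_m);
        _notifiedThrough = 0;
        _canceled = false;
    }

  private:
    std::mutex _m;
    std::condition_variable _cv;
    uint64_t _notifiedThrough = 0;
    bool _canceled = false;
};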
+ + // I think we get stuck in `waitFor` because it's been reset to 0 and we're waiting for every commit from the + // Beginning of time. I'm not acutally sure why it ever returns, though. + if (result == SQLiteSequentialNotifier::RESULT::UNKNOWN) { // This should be impossible. SERROR("Got UNKNOWN result from waitFor, which shouldn't happen"); @@ -1653,12 +1670,17 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { } else if (SIEquals(message.methodLine, "BEGIN_TRANSACTION") || SIEquals(message.methodLine, "COMMIT_TRANSACTION") || SIEquals(message.methodLine, "ROLLBACK_TRANSACTION")) { // Race condition here. What if _replicationThreadsShouldExit changes after this check? if (_replicationThreadsShouldExit) { + // Interestingly, this doesn't happen. + // I think load is light when this issue occurs. SINFO("Discarding replication message, stopping FOLLOWING"); } else { // Ok, so the race condition could be here, right? // Right this instance, the thread count is 0, so we can move past the check that // Waits for it to be 0. // But then this thread starts. Can that happen? + // So the sync thread does the state switch. + // Who is running this though, it should also be the sync thread? + // I don't see how anyone else would be able to do this. auto threadID = _replicationThreadCount.fetch_add(1); SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID); try { @@ -1938,6 +1960,9 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); usleep(10'000); } + // How can the above fail???? + // We only increment _replicationThreadCount in the sync thread and we are reading it here in the sync thread. + // It is feasible to call `_changeState` from another thread but that's not what's happening in the issue we're seeing. // Done exiting. Reset so that we can resume FOLLOWING in the future. _replicationThreadsShouldExit = false; From 7784a9700a72a68dd969809bbbae764bd42b290b Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Tue, 10 Dec 2024 10:11:12 -0400 Subject: [PATCH 03/10] Update SQLite with a fix for non passive checkpoints --- libstuff/sqlite3.c | 8 ++++---- libstuff/sqlite3.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libstuff/sqlite3.c b/libstuff/sqlite3.c index b3c5eebf7..727617327 100644 --- a/libstuff/sqlite3.c +++ b/libstuff/sqlite3.c @@ -18,7 +18,7 @@ ** separate file. This file contains only code for the core SQLite library. ** ** The content in this amalgamation comes from Fossil check-in -** 65b753735b8e8fb70d2b522d527426f1eb5c. +** 1a59cae3c31aea25cef3705cce2477e26515. */ #define SQLITE_CORE 1 #define SQLITE_AMALGAMATION 1 @@ -465,7 +465,7 @@ extern "C" { */ #define SQLITE_VERSION "3.47.0" #define SQLITE_VERSION_NUMBER 3047000 -#define SQLITE_SOURCE_ID "2024-12-06 17:52:38 65b753735b8e8fb70d2b522d527426f1eb5c09339fb4b15cf69cbd2e595b160f" +#define SQLITE_SOURCE_ID "2024-12-09 21:26:21 1a59cae3c31aea25cef3705cce2477e26515a0463cf9094bd29951899b758767" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -71575,7 +71575,7 @@ SQLITE_PRIVATE int sqlite3WalCheckpoint( ** writer lock retried until either the busy-handler returns 0 or the ** lock is successfully obtained. 
*/ - if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){ + if( eMode!=SQLITE_CHECKPOINT_PASSIVE && isWalMode2(pWal)==0 ){ rc = walBusyLock(pWal, xBusy2, pBusyArg, WAL_WRITE_LOCK, 1); if( rc==SQLITE_OK ){ pWal->writeLock = 1; @@ -257917,7 +257917,7 @@ static void fts5SourceIdFunc( ){ assert( nArg==0 ); UNUSED_PARAM2(nArg, apUnused); - sqlite3_result_text(pCtx, "fts5: 2024-12-06 17:52:38 65b753735b8e8fb70d2b522d527426f1eb5c09339fb4b15cf69cbd2e595b160f", -1, SQLITE_TRANSIENT); + sqlite3_result_text(pCtx, "fts5: 2024-12-09 21:26:21 1a59cae3c31aea25cef3705cce2477e26515a0463cf9094bd29951899b758767", -1, SQLITE_TRANSIENT); } /* diff --git a/libstuff/sqlite3.h b/libstuff/sqlite3.h index ff4f7b3f8..77aaa8cbd 100644 --- a/libstuff/sqlite3.h +++ b/libstuff/sqlite3.h @@ -148,7 +148,7 @@ extern "C" { */ #define SQLITE_VERSION "3.47.0" #define SQLITE_VERSION_NUMBER 3047000 -#define SQLITE_SOURCE_ID "2024-12-06 17:52:38 65b753735b8e8fb70d2b522d527426f1eb5c09339fb4b15cf69cbd2e595b160f" +#define SQLITE_SOURCE_ID "2024-12-09 21:26:21 1a59cae3c31aea25cef3705cce2477e26515a0463cf9094bd29951899b758767" /* ** CAPI3REF: Run-Time Library Version Numbers From c8524918cdb87c597aca669d5fb1fd0a034930b9 Mon Sep 17 00:00:00 2001 From: Florent De Neve Date: Tue, 10 Dec 2024 14:40:20 -0400 Subject: [PATCH 04/10] Update SQLite with more logs, v17 --- libstuff/sqlite3.c | 20 +++++++++++--------- libstuff/sqlite3.h | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/libstuff/sqlite3.c b/libstuff/sqlite3.c index 727617327..a7111ef6c 100644 --- a/libstuff/sqlite3.c +++ b/libstuff/sqlite3.c @@ -18,7 +18,7 @@ ** separate file. This file contains only code for the core SQLite library. ** ** The content in this amalgamation comes from Fossil check-in -** 1a59cae3c31aea25cef3705cce2477e26515. +** df4183ace93b788b798b258274bf6b651906. 
*/ #define SQLITE_CORE 1 #define SQLITE_AMALGAMATION 1 @@ -465,7 +465,7 @@ extern "C" { */ #define SQLITE_VERSION "3.47.0" #define SQLITE_VERSION_NUMBER 3047000 -#define SQLITE_SOURCE_ID "2024-12-09 21:26:21 1a59cae3c31aea25cef3705cce2477e26515a0463cf9094bd29951899b758767" +#define SQLITE_SOURCE_ID "2024-12-10 14:56:20 df4183ace93b788b798b258274bf6b651906c9f1cf2af4983e447cdf52904523" /* ** CAPI3REF: Run-Time Library Version Numbers @@ -93474,7 +93474,7 @@ SQLITE_PRIVATE void sqlite3CommitTimeLog(u64 *aCommit){ } zStr = sqlite3_mprintf("%z%s%s%d%s", zStr, (zStr?", ":""),zHash,iVal,zU); } - sqlite3_log(SQLITE_WARNING, "slow commit (v=16): (%s)", zStr); + sqlite3_log(SQLITE_WARNING, "slow commit (v=17): (%s)", zStr); sqlite3_free(zStr); } } @@ -93502,7 +93502,7 @@ SQLITE_PRIVATE void sqlite3PrepareTimeLog(const char *zSql, int nSql, u64 *aPrep } if( nByte<0 ){ nByte = sqlite3Strlen30(zSql); } sqlite3_log(SQLITE_WARNING, - "slow prepare (v=16): (%s) [%.*s]", zStr, nByte, zSql + "slow prepare (v=17): (%s) [%.*s]", zStr, nByte, zSql ); sqlite3_free(zStr); } @@ -93514,11 +93514,13 @@ SQLITE_PRIVATE void sqlite3SchemaTimeLog(u64 *aSchema, const char *zFile){ char *zStr = 0; int ii; for(ii=1; ii Date: Tue, 10 Dec 2024 10:44:49 -0800 Subject: [PATCH 05/10] Fix but maybe not the right fix --- BedrockServer.cpp | 3 ++- sqlitecluster/SQLiteNode.cpp | 12 +++++++++--- sqlitecluster/SQLiteNode.h | 3 ++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/BedrockServer.cpp b/BedrockServer.cpp index 55bb30ae6..98e59414f 100644 --- a/BedrockServer.cpp +++ b/BedrockServer.cpp @@ -213,8 +213,9 @@ void BedrockServer::sync() // we're leading, then the next update() loop will set us to standing down, and then we won't accept any new // commands, and we'll shortly run through the existing queue. if (_shutdownState.load() == COMMANDS_FINISHED) { - SINFO("All clients responded to, " << BedrockCommand::getCommandCount() << " commands remaining. Shutting down sync node."); + SINFO("All clients responded to, " << BedrockCommand::getCommandCount() << " commands remaining."); if (_syncNode->beginShutdown()) { + SINFO("Beginning shuttdown of sync node."); // This will cause us to skip the next `poll` iteration which avoids a 1 second wait. _notifyDoneSync.push(true); } diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 920f03e10..b641614ca 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -145,6 +145,7 @@ SQLiteNode::SQLiteNode(SQLiteServer& server, shared_ptr dbPool, cons _stateTimeout(STimeNow() + firstTimeout), _syncPeer(nullptr) { + SINFO("TYLER _replicationThreadCount reset to : " << _replicationThreadCount); KILLABLE_SQLITE_NODE = this; SASSERT(_originalPriority >= 0); onPrepareHandlerEnabled = false; @@ -1669,7 +1670,8 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { } } else if (SIEquals(message.methodLine, "BEGIN_TRANSACTION") || SIEquals(message.methodLine, "COMMIT_TRANSACTION") || SIEquals(message.methodLine, "ROLLBACK_TRANSACTION")) { // Race condition here. What if _replicationThreadsShouldExit changes after this check? - if (_replicationThreadsShouldExit) { + if (_replicationThreadsShouldExit || _state == SQLiteNodeState::SEARCHING) { + // So this fix probably works, but maybe we don't even want to call _onMESSAGE when we're detaching? // Interestingly, this doesn't happen. // I think load is light when this issue occurs. 
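The race these notes keep circling, reduced to its two actors (an assumed shape for illustration; the fix this series actually ships is the FOLLOWING-state guard in a later commit, not this): the message handler checks the exit flag and only then increments the counter, so the sync thread can run its entire set-flag / wait-for-zero / reset sequence in between. Publishing the increment before re-checking the flag is one standard way to close a check-then-act window like that:

#include <atomic>

std::atomic<bool> shouldExit{false}; // stands in for _replicationThreadsShouldExit
std::atomic<int> threadCount{0};     // stands in for _replicationThreadCount

// Returns true if it is safe to spawn a replicate thread. Because the count
// is published before the flag is re-read, the sync thread (which sets the
// flag and then polls the count down to zero) can no longer slip its whole
// shutdown sequence between the check and the increment.
bool tryStartReplicateThread() {
    threadCount.fetch_add(1);
    if (shouldExit.load()) {
        threadCount.fetch_sub(1); // back out; a state change is in progress
        return false;
    }
    return true;
}

This narrows the window but does not close it completely, because _changeState resets the flag to false at the end of its sequence -- which is part of why the series ultimately gates on _state != FOLLOWING instead.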
SINFO("Discarding replication message, stopping FOLLOWING"); @@ -1682,6 +1684,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // Who is running this though, it should also be the sync thread? // I don't see how anyone else would be able to do this. auto threadID = _replicationThreadCount.fetch_add(1); + SINFO("TYLER _replicationThreadCount incremented to : " << threadID + 1); SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID); try { uint64_t threadAttemptStartTimestamp = STimeNow(); @@ -1703,7 +1706,8 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // and waiting for the transaction that failed will be stuck in an infinite loop. To prevent that // we're changing the state to SEARCHING and sending the cancelAfter property to drop all threads // that depend on the transaction that failed to be threaded. - _replicationThreadCount.fetch_sub(1); + auto was = _replicationThreadCount.fetch_sub(1); + SINFO("TYLER _replicationThreadCount decremented to : " << was - 1); SWARN("Caught system_error starting _replicate thread with " << _replicationThreadCount.load() << " threads. e.what()=" << e.what()); _changeState(SQLiteNodeState::SEARCHING, message.calcU64("NewCount") - 1); STHROW("Error starting replicate thread so giving up and reconnecting."); @@ -1956,10 +1960,12 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance // Polling wait for threads to quit. This could use a notification model such as with a condition_variable, // which would probably be "better" but introduces yet more state variables for a state that we're rarely // in, and so I've left it out for the time being. - while (_replicationThreadCount) { + SINFO("TYLER _replicationThreadCount before state change: " << _replicationThreadCount); + while (_replicationThreadCount.load()) { SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); usleep(10'000); } + SINFO("TYLER _replicationThreadCount after state change: " << _replicationThreadCount); // How can the above fail???? // We only increment _replicationThreadCount in the sync thread and we are reading it here in the sync thread. // It is feasible to call `_changeState` from another thread but that's not what's happening in the issue we're seeing. 
diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index 57c5c1056..ab476b653 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -198,7 +198,8 @@ class SQLiteNode : public STCPManager { public: ScopedDecrement(CounterType& counter) : _counter(counter) {} ~ScopedDecrement() { - --_counter; + auto decrementedTo = --_counter; + SINFO("TYLER _counter decremented to : " << decrementedTo); } private: CounterType& _counter; From 39c0d877e27385aa87cc75989dfbec17a4c2960c Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 10 Dec 2024 11:31:48 -0800 Subject: [PATCH 06/10] Add explanatory message --- sqlitecluster/SQLiteNode.cpp | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index b641614ca..2c1500240 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -145,7 +145,6 @@ SQLiteNode::SQLiteNode(SQLiteServer& server, shared_ptr dbPool, cons _stateTimeout(STimeNow() + firstTimeout), _syncPeer(nullptr) { - SINFO("TYLER _replicationThreadCount reset to : " << _replicationThreadCount); KILLABLE_SQLITE_NODE = this; SASSERT(_originalPriority >= 0); onPrepareHandlerEnabled = false; @@ -1669,11 +1668,15 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { throw e; } } else if (SIEquals(message.methodLine, "BEGIN_TRANSACTION") || SIEquals(message.methodLine, "COMMIT_TRANSACTION") || SIEquals(message.methodLine, "ROLLBACK_TRANSACTION")) { - // Race condition here. What if _replicationThreadsShouldExit changes after this check? - if (_replicationThreadsShouldExit || _state == SQLiteNodeState::SEARCHING) { - // So this fix probably works, but maybe we don't even want to call _onMESSAGE when we're detaching? - // Interestingly, this doesn't happen. - // I think load is light when this issue occurs. + if (_state != SQLiteNodeState::FOLLOWING) { + // These messages are only valid while following, but we do not throw if we receive them in other states, as + // it's not neccesarily an error. Specifically, as we switch away from FOLLOWING, there may still be a stream + // of transactions being broadcast. We do not attempt to handle these, as we keep careful count of which + // replication threads are currently running, and reset the replication state tracking when we're not following. + // Attempting to handle replication messages in some other state will break that tracking. + return; + } + if (_replicationThreadsShouldExit) { SINFO("Discarding replication message, stopping FOLLOWING"); } else { // Ok, so the race condition could be here, right? @@ -1684,7 +1687,6 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // Who is running this though, it should also be the sync thread? // I don't see how anyone else would be able to do this. auto threadID = _replicationThreadCount.fetch_add(1); - SINFO("TYLER _replicationThreadCount incremented to : " << threadID + 1); SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID); try { uint64_t threadAttemptStartTimestamp = STimeNow(); @@ -1706,8 +1708,6 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // and waiting for the transaction that failed will be stuck in an infinite loop. To prevent that // we're changing the state to SEARCHING and sending the cancelAfter property to drop all threads // that depend on the transaction that failed to be threaded. 
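An aside on the decrement being adjusted just below: the codebase already uses ScopedDecrement (above) so the replicate thread itself cannot leak a count, but the spawn-failure path still does manual bookkeeping in the catch block. A hypothetical spawner-side guard (CountGuard is illustrative, not part of this series) would make every exit path balance by construction:

#include <atomic>

// Increments on construction; decrements on destruction unless dismissed.
// dismiss() is called once the spawned thread has taken ownership of the
// count (its own ScopedDecrement then does the eventual fetch_sub).
class CountGuard {
  public:
    explicit CountGuard(std::atomic<int>& counter) : _counter(counter) {
        _counter.fetch_add(1);
    }
    ~CountGuard() {
        if (!_dismissed) {
            _counter.fetch_sub(1);
        }
    }
    void dismiss() { _dismissed = true; }
  private:
    std::atomic<int>& _counter;
    bool _dismissed = false;
};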
- auto was = _replicationThreadCount.fetch_sub(1); - SINFO("TYLER _replicationThreadCount decremented to : " << was - 1); SWARN("Caught system_error starting _replicate thread with " << _replicationThreadCount.load() << " threads. e.what()=" << e.what()); _changeState(SQLiteNodeState::SEARCHING, message.calcU64("NewCount") - 1); STHROW("Error starting replicate thread so giving up and reconnecting."); @@ -1960,12 +1960,10 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance // Polling wait for threads to quit. This could use a notification model such as with a condition_variable, // which would probably be "better" but introduces yet more state variables for a state that we're rarely // in, and so I've left it out for the time being. - SINFO("TYLER _replicationThreadCount before state change: " << _replicationThreadCount); - while (_replicationThreadCount.load()) { + while (_replicationThreadCount) { SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); usleep(10'000); } - SINFO("TYLER _replicationThreadCount after state change: " << _replicationThreadCount); // How can the above fail???? // We only increment _replicationThreadCount in the sync thread and we are reading it here in the sync thread. // It is feasible to call `_changeState` from another thread but that's not what's happening in the issue we're seeing. From 5d29d00945558a0bff37842bc6cceb217d5f2b77 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 10 Dec 2024 11:38:00 -0800 Subject: [PATCH 07/10] Cleanup --- sqlitecluster/SQLiteNode.cpp | 49 +++------------------- sqlitecluster/SQLiteNode.h | 3 +- sqlitecluster/SQLiteSequentialNotifier.cpp | 8 +--- 3 files changed, 8 insertions(+), 52 deletions(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 2c1500240..70bf83dab 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -187,10 +187,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn } _replicateStartCV.notify_all(); - if (_replicationThreadsShouldExit) { - SINFO("Late replicate start, just exiting."); - } - // Initialize each new thread with a new number. SInitialize("replicate" + to_string(currentReplicateThreadID.fetch_add(1))); @@ -199,19 +195,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn // Allow the DB handle to be returned regardless of how this function exits. SQLiteScopedHandle dbScope(*_dbPool, sqlitePoolIndex); - // In dev, we sometimes crash on the destructor for the above after `detach` which implies to me that the DB Pool could have been deleted before we exited. - // This is a different manifestation of what could be the same issue. 
- // IN dev, we get: - // 2024-12-09T21:12:45.030255+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:215) _replicate [replicate2065] [info] {cluster_node_4/SEARCHING} [performance] BEGIN_TRANSACTION replicate thread for commit 15003 waiting on DB count 15001 (ASYNC) - // 2024-12-09T21:12:49.023666+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1913) _changeState [sync] [info] {cluster_node_4/FOLLOWING} Replication threads should exit, canceling commits after current leader commit 0 - // 2024-12-09T21:12:49.030957+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:57) waitFor [replicate2065] [info] Hit 1s timeout while global cancel 1 or specific cancel 0 - // 2024-12-09T21:12:49.030963+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:225) _replicate [replicate2065] [info] {cluster_node_4/FOLLOWING} _localCommitNotifier.waitFor canceled early, returning. - // 2024-12-09T21:12:49.030977+00:00 expensidev2004 bedrock10013: xxxxxx (SSignal.cpp:193) _SSignal_StackTrace [replicate2065] [warn] Signal Segmentation fault(11) caused crash, logging stack trace. - // Why doesn't this get cancelled as well? - // I'm not sure why dev thinks leader has commit 0. - // 2024-12-09T21:12:49.023613+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1902) _changeState [sync] [info] {cluster_node_4/FOLLOWING} [NOTIFY] setting commit count to: 17001 - // 2024-12-09T21:12:49.023674+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:105) cancel [sync] [info] Canceling all pending transactions after 0 - SQLite& db = dbScope.db(); bool goSearchingOnExit = false; @@ -231,22 +214,12 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn uint64_t waitForCount = SStartsWith(command["ID"], "ASYNC") ? command.calcU64("dbCountAtStart") : currentCount; SINFO("[performance] BEGIN_TRANSACTION replicate thread for commit " << newCount << " waiting on DB count " << waitForCount << " (" << (quorum ? "QUORUM" : "ASYNC") << ")"); while (true) { - // Ok, why doesn't this get counted? - // It's waiting on: commit 26056807239 waiting on DB count 26056807238 - // It seems like this should return immediately. - // Ok, is it possible we got past here and waited somewhere else? We were either stuck here, or... SQLiteSequentialNotifier::RESULT result = _localCommitNotifier.waitFor(waitForCount, false); - // My current inclination is that maybe we reset the commit notifier before the thread really starts. - - // I think we get stuck in `waitFor` because it's been reset to 0 and we're waiting for every commit from the - // Beginning of time. I'm not acutally sure why it ever returns, though. - if (result == SQLiteSequentialNotifier::RESULT::UNKNOWN) { // This should be impossible. SERROR("Got UNKNOWN result from waitFor, which shouldn't happen"); } else if (result == SQLiteSequentialNotifier::RESULT::COMPLETED) { // Success case. - // If we didn't get stuck above, we must have hit here, because otherwise we would have logged. break; } else if (result == SQLiteSequentialNotifier::RESULT::CANCELED) { SINFO("_localCommitNotifier.waitFor canceled early, returning."); @@ -264,7 +237,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn if (commitAttemptCount > 1) { SINFO("Commit attempt number " << commitAttemptCount << " for concurrent replication."); } - // We never log this line, so we can't have gotten to here. 
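The notes being deleted above suspected a use-after-free: a detach()ed replicate thread crashing in SQLiteScopedHandle's destructor, which would fit the DB pool being destroyed before the thread finished. One conventional way to make that impossible, sketched with illustrative names (shared ownership captured into the detached thread; not what this series ships):

#include <memory>
#include <thread>

struct DBPool {
    // Stands in for the real pool; destroyed only when the last shared_ptr drops.
};

void spawnReplicate(std::shared_ptr<DBPool> pool) {
    // The lambda copies the shared_ptr, so even if the caller (and everything
    // else) releases the pool while this detached thread is still running,
    // the pool outlives the thread's last use of it.
    std::thread([pool] {
        // ... acquire a handle from *pool and replicate ...
    }).detach();
}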
SINFO("[performance] BEGIN for commit " << newCount); bool uniqueContraintsError = false; try { @@ -1679,19 +1651,11 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { if (_replicationThreadsShouldExit) { SINFO("Discarding replication message, stopping FOLLOWING"); } else { - // Ok, so the race condition could be here, right? - // Right this instance, the thread count is 0, so we can move past the check that - // Waits for it to be 0. - // But then this thread starts. Can that happen? - // So the sync thread does the state switch. - // Who is running this though, it should also be the sync thread? - // I don't see how anyone else would be able to do this. auto threadID = _replicationThreadCount.fetch_add(1); SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID); try { uint64_t threadAttemptStartTimestamp = STimeNow(); _replicateThreadStarted = false; - // Either here. thread(&SQLiteNode::_replicate, this, peer, message, _dbPool->getIndex(false), threadAttemptStartTimestamp).detach(); { unique_lock lock(_replicateStartMutex); @@ -1708,6 +1672,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // and waiting for the transaction that failed will be stuck in an infinite loop. To prevent that // we're changing the state to SEARCHING and sending the cancelAfter property to drop all threads // that depend on the transaction that failed to be threaded. + _replicationThreadCount.fetch_sub(1); SWARN("Caught system_error starting _replicate thread with " << _replicationThreadCount.load() << " threads. e.what()=" << e.what()); _changeState(SQLiteNodeState::SEARCHING, message.calcU64("NewCount") - 1); STHROW("Error starting replicate thread so giving up and reconnecting."); @@ -1960,23 +1925,21 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance // Polling wait for threads to quit. This could use a notification model such as with a condition_variable, // which would probably be "better" but introduces yet more state variables for a state that we're rarely // in, and so I've left it out for the time being. + size_t infoCount = 1; while (_replicationThreadCount) { - SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); + if (infoCount % 100 == 0) { + SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); + } usleep(10'000); + infoCount++; } - // How can the above fail???? - // We only increment _replicationThreadCount in the sync thread and we are reading it here in the sync thread. - // It is feasible to call `_changeState` from another thread but that's not what's happening in the issue we're seeing. // Done exiting. Reset so that we can resume FOLLOWING in the future. _replicationThreadsShouldExit = false; // Guaranteed to be done right now. - // I bet this is wrong when these get reset. If we no threads, these get reset before they check? - // That doesn't make sense to me for when we should increment _replicationThreadCount _localCommitNotifier.reset(); _leaderCommitNotifier.reset(); - // If the above completed, we should immediately see `Switching from... ` logged. // We have no leader anymore. 
_leadPeer = nullptr; diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h index ab476b653..57c5c1056 100644 --- a/sqlitecluster/SQLiteNode.h +++ b/sqlitecluster/SQLiteNode.h @@ -198,8 +198,7 @@ class SQLiteNode : public STCPManager { public: ScopedDecrement(CounterType& counter) : _counter(counter) {} ~ScopedDecrement() { - auto decrementedTo = --_counter; - SINFO("TYLER _counter decremented to : " << decrementedTo); + --_counter; } private: CounterType& _counter; diff --git a/sqlitecluster/SQLiteSequentialNotifier.cpp b/sqlitecluster/SQLiteSequentialNotifier.cpp index 69d30f1ac..c7814d344 100644 --- a/sqlitecluster/SQLiteSequentialNotifier.cpp +++ b/sqlitecluster/SQLiteSequentialNotifier.cpp @@ -31,8 +31,6 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu SINFO("Canceled after " << _cancelAfter << ", but waiting for " << value << " so not returning yet."); } else { // Canceled and we're not before the cancellation cutoff. - // I don't see how we don't return here. Maybe we never acquire `waitingThreadMutex`? - SINFO("Returning canceled because _cancelAfter=" << _cancelAfter << " and value=" << value); return RESULT::CANCELED; } } else if (_globalResult != RESULT::UNKNOWN) { @@ -54,10 +52,9 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu // We should investigate any instances of thew below logline to see if they're same as for the success cases mentioned above (i.e., the timeout happens simultaneously as the // cancellation) or if the log line is delayed by up to a second (indicating a problem). if (_globalResult == RESULT::CANCELED || state->result == RESULT::CANCELED) { - // I bet removing a 1 second delay means we don't hit this. we get to calling `reset` on this sooner, possibly while there are threads waiting here. - // It's possible that we hit the timeout here after `cancel()` has set the global value, but before we received the notification. // This isn't a problem, and we can jump back to the top of the loop and check again. If there's some problem, we'll see it there. + // Does this still happen?? Might be fixed. SINFO("Hit 1s timeout while global cancel " << (_globalResult == RESULT::CANCELED) << " or " << " specific cancel " << (state->result == RESULT::CANCELED)); continue; } @@ -120,10 +117,7 @@ void SQLiteSequentialNotifier::cancel(uint64_t cancelAfter) { auto start = _cancelAfter ? valueThreadMap.upper_bound(_cancelAfter) : valueThreadMap.begin(); if (start == valueThreadMap.end()) { // There's nothing to remove. - SINFO("[performance] Next value to cancel after " << cancelAfter << " is N/A"); return; - } else { - SINFO("[performance] Next value to cancel after " << cancelAfter << " is " << start->first); } // Now iterate across whatever's remaining and mark it canceled. 
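The cancel() hunks above keep reshuffling the same two steps, and the order is the whole point: upper_bound returns the first entry strictly greater than the cutoff, that iterator may be end(), and it is only safe to dereference after the end() check -- the pre-series code logged start->first before checking. The pattern in isolation, with erase standing in for the real mark-canceled-and-notify loop:

#include <cstdint>
#include <cstdio>
#include <map>

int main() {
    // Keys stand in for commit counts with waiting threads, as in valueThreadMap.
    std::map<uint64_t, int> valueThreadMap{{5, 0}, {9, 0}};
    const uint64_t cancelAfter = 4;

    // First waiter strictly after the cutoff; begin() if no cutoff was given.
    auto start = cancelAfter ? valueThreadMap.upper_bound(cancelAfter)
                             : valueThreadMap.begin();
    if (start == valueThreadMap.end()) {
        std::printf("Nothing to cancel after %llu\n", (unsigned long long)cancelAfter);
        return 0;
    }
    std::printf("Next value to cancel after %llu is %llu\n",
                (unsigned long long)cancelAfter, (unsigned long long)start->first);
    valueThreadMap.erase(start, valueThreadMap.end());
    return 0;
}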
From 9e2094597bf29c04233e48ba4484fb0bcd0059dd Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 10 Dec 2024 11:41:09 -0800 Subject: [PATCH 08/10] Cleanup 2 --- sqlitecluster/SQLiteNode.cpp | 2 +- sqlitecluster/SQLiteSequentialNotifier.cpp | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 70bf83dab..34881cdf4 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1930,8 +1930,8 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance if (infoCount % 100 == 0) { SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads."); } - usleep(10'000); infoCount++; + usleep(10'000); } // Done exiting. Reset so that we can resume FOLLOWING in the future. diff --git a/sqlitecluster/SQLiteSequentialNotifier.cpp b/sqlitecluster/SQLiteSequentialNotifier.cpp index c7814d344..a0f84a156 100644 --- a/sqlitecluster/SQLiteSequentialNotifier.cpp +++ b/sqlitecluster/SQLiteSequentialNotifier.cpp @@ -117,8 +117,10 @@ void SQLiteSequentialNotifier::cancel(uint64_t cancelAfter) { auto start = _cancelAfter ? valueThreadMap.upper_bound(_cancelAfter) : valueThreadMap.begin(); if (start == valueThreadMap.end()) { // There's nothing to remove. + SINFO("[performance] No available values to cancel after " << cancelAfter); return; } + SINFO("[performance] Next value to cancel after " << cancelAfter << " is " << start->first); // Now iterate across whatever's remaining and mark it canceled. auto current = start; From 6b211c5f16c8a5d2d633ed417d2b01e0bc00bb96 Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 10 Dec 2024 12:18:03 -0800 Subject: [PATCH 09/10] Fix logline --- sqlitecluster/SQLiteSequentialNotifier.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sqlitecluster/SQLiteSequentialNotifier.cpp b/sqlitecluster/SQLiteSequentialNotifier.cpp index a0f84a156..e12505ecd 100644 --- a/sqlitecluster/SQLiteSequentialNotifier.cpp +++ b/sqlitecluster/SQLiteSequentialNotifier.cpp @@ -54,8 +54,7 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu if (_globalResult == RESULT::CANCELED || state->result == RESULT::CANCELED) { // It's possible that we hit the timeout here after `cancel()` has set the global value, but before we received the notification. // This isn't a problem, and we can jump back to the top of the loop and check again. If there's some problem, we'll see it there. - // Does this still happen?? Might be fixed. - SINFO("Hit 1s timeout while global cancel " << (_globalResult == RESULT::CANCELED) << " or " << " specific cancel " << (state->result == RESULT::CANCELED)); + SINFO("Hit 1s timeout while global cancel " << (_globalResult == RESULT::CANCELED) << " or specific cancel " << (state->result == RESULT::CANCELED)); continue; } } From 1723cfb3e28e364840efbba4b3453a9e18a1a79f Mon Sep 17 00:00:00 2001 From: Tyler Karaszewski Date: Tue, 10 Dec 2024 13:03:15 -0800 Subject: [PATCH 10/10] Add extra log line --- sqlitecluster/SQLiteNode.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp index 34881cdf4..9395cf6a0 100644 --- a/sqlitecluster/SQLiteNode.cpp +++ b/sqlitecluster/SQLiteNode.cpp @@ -1646,6 +1646,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) { // of transactions being broadcast. 
We do not attempt to handle these, as we keep careful count of which // replication threads are currently running, and reset the replication state tracking when we're not following. // Attempting to handle replication messages in some other state will break that tracking. + SINFO("Ignoring " << message.methodLine << " in state " << stateName(_state)); return; } if (_replicationThreadsShouldExit) {
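Where the series lands, paraphrased as a self-contained sketch rather than a verbatim excerpt (the enum, atomics, and function name here are illustrative): replication messages are handled only while FOLLOWING, so the thread-count and notifier bookkeeping that _changeState resets can no longer race with a late BEGIN_TRANSACTION, and the new SINFO makes any dropped message visible in the logs.

#include <atomic>

enum class NodeState { SEARCHING, FOLLOWING };

std::atomic<NodeState> state{NodeState::SEARCHING};
std::atomic<bool> replicationThreadsShouldExit{false};
std::atomic<int> replicationThreadCount{0};

void onReplicationMessage() {
    if (state.load() != NodeState::FOLLOWING) {
        // Transactions may still stream in as we leave FOLLOWING; log and
        // drop rather than touch the tracking that gets reset elsewhere.
        return;
    }
    if (replicationThreadsShouldExit.load()) {
        return; // stopping FOLLOWING; discard
    }
    replicationThreadCount.fetch_add(1);
    // ... spawn the detached replicate thread; its ScopedDecrement restores the count.
}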