From bbaca47e65b721019e33e77007c3a0f76301666d Mon Sep 17 00:00:00 2001 From: Navin Agarwal <45832642+agarwal-navin@users.noreply.github.com> Date: Mon, 22 Jan 2024 11:09:44 -0800 Subject: [PATCH] Fix flaky GC test where a data store may be deleted depending on how long certain steps run (#19309) ## Bug The GC test "Trailing op [afterSweepTimeout] transitions data store from [ref -> unref] without deleting it" some times fails because the following error telemetry is logged - "fluid:telemetry:FluidDataStoreContext:GC_Deleted_DataStore_Unexpected_Delete". ## Root cause In the last summary in this test, a data store may be sweep ready depending on how much elapsed since the previous summary. The test used sweep timeout as 100 ms so if that much time has passed, a GC sweep op will be sent containing the id of the sweep ready data store. If this op is processed by the time the test completes, "fluid:telemetry:FluidDataStoreContext:GC_Deleted_DataStore_Unexpected_Delete" will be logged. Since the sweep timeout is low, it is possible that in certain cases, that times elapses between the last 2 summaries in the test resulting in the data store being deleted. ## Fix It's fine for the data store to be deleted in the scenario described above. However, the container that created the data store should be closed to emulate session expiry before sweep deletes it. The fix is to close this container before running the last summary. AB#6784 --- .../src/test/gc/gcTrailingOps.spec.ts | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/packages/test/test-end-to-end-tests/src/test/gc/gcTrailingOps.spec.ts b/packages/test/test-end-to-end-tests/src/test/gc/gcTrailingOps.spec.ts index 348ff558ddc6..d6b63c54daad 100644 --- a/packages/test/test-end-to-end-tests/src/test/gc/gcTrailingOps.spec.ts +++ b/packages/test/test-end-to-end-tests/src/test/gc/gcTrailingOps.spec.ts @@ -163,12 +163,12 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider mainDataObject._root.set(newDataStoreKey, newDataStore.entryPoint); // Update the initial reference state of the data store. - let referenced = transition === "ref -> unref" ? true : false; + let isReferenced = transition === "ref -> unref" ? true : false; updateDataStoreReferenceState( mainDataObject, newDataObject, newDataStoreKey, - referenced, + isReferenced, ); // Create a summarizer @@ -178,18 +178,18 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider summarizerContainerConfig, ); - // Summarize and verify that the datastore reference state is in the opposite state of "transitionToReference". + // Summarize and verify that the datastore reference state is as per "isReferenced". await provider.ensureSynchronized(); const summary1 = await summarizeNow(mainSummarizer); validateDataStoreStateInSummary( summary1.summaryTree, newDataStore.entryPoint.absolutePath, false /* expectGCStateHandle */, - referenced, + isReferenced, ); // The reference state will transition now due to the trailing op sent next. - referenced = !referenced; + isReferenced = !isReferenced; // If beforeSweepTimeout, send the trailing op that transitions the data store's reference state first // and then wait for sweep timeout. @@ -202,7 +202,7 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider mainDataObject, newDataObject, newDataStoreKey, - referenced, + isReferenced, ); await delay(sweepTimeoutMs); } else if (when === "afterSweepTimeout") { @@ -211,7 +211,7 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider mainDataObject, newDataObject, newDataStoreKey, - referenced, + isReferenced, ); } @@ -235,10 +235,18 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider summary2.summaryTree, newDataStore.entryPoint.absolutePath, false /* expectGCStateHandle */, - referenced, + isReferenced, ); - // Summarize again to ensure that GC sweep op (if any) is now processed. + // Close the main container before running GC which may generate a GC op. Otherwise, it will hit this error + // "GC_Deleted_DataStore_Unexpected_Delete". We don't expect local data stores to be deleted because + // their session expires before deletion. This mimics that behavior. + mainContainer.close(); + + // Summarize again. This summary may generate a GC op if "sweep timeout" time has passed. However, the + // data store will not be deleted in this summary. This is done to validate that even though trailing + // ops that can change reference state may have been generated after sweep timeout expired, the state + // change doesn't happen until next GC run. So, nothing can be deleted in this GC run. await provider.ensureSynchronized(); const summary3 = await summarizeNow(summarizer); @@ -247,7 +255,7 @@ describeCompat("GC trailing ops tests", "2.0.0-rc.1.0.0", (getTestObjectProvider summary3.summaryTree, newDataStore.entryPoint.absolutePath, true /* expectGCStateHandle */, - referenced, + isReferenced, ); }); };