Latest results for thesis manuscript

atlarge-research · May 29, 2024 · 6eeb7a2
1 parent 5f784d1
commit 6eeb7a2
Show file tree

Hide file tree

Showing 43 changed files with 25,662 additions and 10,246 deletions.
diff --git a/lib/src/main/scala/benchmark/Benchmark.scala b/lib/src/main/scala/benchmark/Benchmark.scala
@@ -88,9 +88,10 @@ object Benchmark {
       CaptureFilter(
         provenanceFilter = ProvenancePredicate(
           nodePredicate = ProvenanceGraph.allNodes,
-          edgePredicate = provenanceFilter(description.setup)
+          edgePredicate =
+            provenanceFilter(description.setup, description.algorithm)
         ),
-        dataFilter = dataFilter(gl, description.setup, description.algorithm)
+        dataFilter = dataFilter(description.setup, description.algorithm)
       )
     )
 
@@ -182,7 +183,7 @@ object Benchmark {
     )
 
     // Clean up lineage folder after being done with it
-//    fs.delete(lineagePath, true)
+    fs.delete(lineagePath, true)
   }
 
   def computeFlags(expSetup: ExperimentSetup): (Boolean, Boolean) = {
@@ -201,7 +202,6 @@ object Benchmark {
   }
 
   def dataFilter(
-      gl: GraphLineage[Unit, Double],
       experimentSetup: ExperimentSetup,
       algorithm: GraphAlgorithm
   ): DataPredicate = {
@@ -239,14 +239,20 @@ object Benchmark {
     }
   }
 
-  def provenanceFilter(expSetup: ExperimentSetup): Relation => Boolean = {
+  def provenanceFilter(
+      expSetup: ExperimentSetup,
+      algorithm: GraphAlgorithm
+  ): Relation => Boolean = {
     expSetup match {
       case ExperimentSetup.ProvenanceGraphPruning |
           ExperimentSetup.CombinedPruning =>
         (r: ProvenanceGraph.Relation) => {
           r.edge.event match {
-            case Operation("joinVertices") => true
-            case _                         => false
+            case Operation("outerJoinVertices") =>
+              algorithm == GraphAlgorithm.fromString("pr")
+            case Operation("joinVertices") =>
+              algorithm != GraphAlgorithm.fromString("pr")
+            case _ => false
           }
         }
       case _ =>

diff --git a/lib/src/test/resources/runner-config-example.conf b/lib/src/test/resources/runner-config-example.conf
@@ -1,61 +1,68 @@
 runner {
-        # Inputs
-        repetitions = 1
-        algorithms = [
-                BFS
-                PageRank
-                WCC
-                SSSP
-        ]
-
-        graphs = [
-                kgs
-                wiki-Talk
-                #cit-Patents
-                # S graphs
-                #datagen-7_5-fb
-                #datagen-7_6-fb
-                #datagen-7_7-zf
-                #datagen-7_8-zf
-                #datagen-7_9-fb
-                #dota-league
-                #graph500-22
-                # M graphs
-                datagen-8_4-fb
-                # L graphs
-                #datagen-8_8-zf
-        ]
-
-        storageFormats = [
-                TextFile()
-                ObjectFile()
-                ParquetFile()
-                AvroFile()
-                ORCFile()
-                CSVFile()
-                JSONFormat()
-                TextFile(true)
-                CSVFile(true)
-                JSONFormat(true)
-        ]
-
-        jar = "invalid-path"
-        datasetPath = "./src/test/resources"
-        experimentsPath = "/var/scratch/gmo520/thesis/experiments"
-        setups = [
-                Baseline
-                StorageFormats
-                # Compression
-                # Storage
-                # Tracing
-                # SmartPruning
-                # AlgorithmOpOnly
-                # JoinVerticesOpOnly
-                # Combined
-        ]
-
-        # Outputs
-        lineagePath = "file:///tmp/lineage"
-        outputPath = "file:///tmp/output"
-        sparkLogs = "file:///tmp/spark-logs"
+    // Inputs
+    repetitions = 1
+    algorithms = [
+        "BFS",
+        "PageRank",
+        "WCC",
+        "SSSP",
+    ]
+
+    setups = [
+        "Baseline",
+        "StorageFormats",
+        // "Compression",
+        // "Storage",
+        // "Tracing",
+        // "SmartPruning",
+        // "AlgorithmOpOnly",
+        // "JoinVerticesOpOnly",
+        // "Combined",
+    ]
+
+    graphs = [
+        // XS graphs
+        "kgs"
+        "wiki-Talk"
+        // "cit-Patents",
+
+        // S graphs
+        // "datagen-7_5-fb",
+        // "datagen-7_6-fb",
+        // "datagen-7_7-zf",
+        // "datagen-7_8-zf",
+        // "datagen-7_9-fb",
+        // "dota-league",
+        // "graph500-22",
+
+        // M graphs
+        "datagen-8_4-fb",
+
+        // L graphs
+        // "datagen-8_8-zf",
+    ]
+
+    storageFormats = [
+        "TextFile()",
+        "ObjectFile()",
+        "ParquetFile()",
+        "AvroFile()",
+        "ORCFile()",
+        "CSVFile()",
+        "JSONFormat()",
+        "TextFile(true)",
+        "CSVFile(true)",
+        "JSONFormat(true)",
+    ]
+
+    jar = "invalid-path"
+    datasetPath = "./src/test/resources"
+    experimentsPath = "/var/scratch/gmo520/thesis/experiments"
+
+    // Outputs
+    lineagePath = "file:///tmp/lineage"
+    outputPath = "file:///tmp/output"
+    sparkLogs = "file:///tmp/spark-logs"
+
+    timeoutMinutes = 10
 }
diff --git a/lib/src/test/scala/benchmark/BenchmarkTests.scala b/lib/src/test/scala/benchmark/BenchmarkTests.scala
@@ -11,6 +11,10 @@ import provenance.{ProvenanceGraph, ProvenanceGraphNode}
 import provenance.events.{BFS, Operation}
 import provenance.metrics.ObservationSet
 
+import lu.magalhaes.gilles.provxlib.provenance.query.{
+  DeltaPredicate,
+  GraphPredicate
+}
 import lu.magalhaes.gilles.provxlib.provenance.storage.TextFile
 import lu.magalhaes.gilles.provxlib.utils.LocalSparkSession.withSparkSession
 import org.apache.spark.graphx.{Edge, Graph}
@@ -61,7 +65,8 @@ class BenchmarkTests extends AnyFunSuite {
           outputDir = outputDir,
           graphalyticsConfigPath = graphalyticsConfigPath,
           lineageDir = runnerConfig.runner.lineagePath,
-          setup = ExperimentSetup.Baseline
+          setup = ExperimentSetup.Baseline,
+          numExecutors = 7
         )
       )
       Benchmark.run(sc, config)
@@ -70,32 +75,24 @@ class BenchmarkTests extends AnyFunSuite {
 
   test("Benchmark flags computation") {
     assert(
-      Benchmark.computeFlags(ExperimentSetup.Compression) == (true, true)
-    )
-    assert(
-      Benchmark.computeFlags(ExperimentSetup.Storage) == (true, true)
+      Benchmark.computeFlags(ExperimentSetup.CompleteProvenance) == (true, true)
     )
     assert(
       Benchmark.computeFlags(ExperimentSetup.Tracing) == (true, false)
     )
     assert(
       Benchmark.computeFlags(
-        ExperimentSetup.SmartPruning
-      ) == (true, true)
-    )
-    assert(
-      Benchmark.computeFlags(
-        ExperimentSetup.AlgorithmOpOnly
+        ExperimentSetup.DataGraphPruning
       ) == (true, true)
     )
     assert(
       Benchmark.computeFlags(
-        ExperimentSetup.JoinVerticesOpOnly
+        ExperimentSetup.ProvenanceGraphPruning
       ) == (true, true)
     )
     assert(
       Benchmark.computeFlags(
-        ExperimentSetup.Combined
+        ExperimentSetup.CombinedPruning
       ) == (true, true)
     )
     assert(
@@ -131,32 +128,61 @@ class BenchmarkTests extends AnyFunSuite {
 
       val g = Graph(longVertices, edges)
 
-      assert(
-        g.subgraph(vpred =
-          Benchmark
-            .dataFilter(ExperimentSetup.SmartPruning, GraphAlgorithm.WCC)
-        ).vertices
-          .collect()
-          .length == 1
-      )
+      {
+        val filter = Benchmark.dataFilter(
+          ExperimentSetup.DataGraphPruning,
+          GraphAlgorithm.WCC
+        ) match {
+          case GraphPredicate(nodePredicate, _) => nodePredicate
+          case DeltaPredicate(_)                => ???
+          case _                                => ???
+        }
 
-      assert(
-        g.subgraph(vpred =
-          Benchmark.dataFilter(ExperimentSetup.Baseline, GraphAlgorithm.WCC)
-        ).vertices
-          .collect()
-          .length == 3
-      )
+        assert(
+          g.subgraph(vpred = filter)
+            .vertices
+            .collect()
+            .length == 1
+        )
+      }
+
+      {
+        val filter = Benchmark.dataFilter(
+          ExperimentSetup.Baseline,
+          GraphAlgorithm.WCC
+        ) match {
+          case GraphPredicate(nodePredicate, _) => nodePredicate
+          case DeltaPredicate(_)                => ???
+          case _                                => ???
+        }
+        assert(
+          g.subgraph(vpred = filter)
+            .vertices
+            .collect()
+            .length == 3
+        )
+      }
+
+      {
+        val filter = Benchmark
+          .dataFilter(
+            ExperimentSetup.DataGraphPruning,
+            GraphAlgorithm.SSSP
+          ) match {
+          case DeltaPredicate(_)                => ???
+          case GraphPredicate(nodePredicate, _) => nodePredicate
+          case _                                => ???
+        }
 
-      val g2 = Graph(doubleVertices, edges)
-      assert(
-        g2.subgraph(vpred =
-          Benchmark
-            .dataFilter(ExperimentSetup.SmartPruning, GraphAlgorithm.SSSP)
-        ).vertices
-          .collect()
-          .length == 1
-      )
+        val g2 = Graph(doubleVertices, edges)
+        assert(
+          g2.subgraph(vpred = filter)
+            .vertices
+            .collect()
+            .length == 1
+        )
+
+      }
     }
   }
 
@@ -185,17 +211,8 @@ class BenchmarkTests extends AnyFunSuite {
       ProvenanceGraph.Edge(BFS(3), ObservationSet())
     )
 
-    val algOpFilter =
-      Benchmark.provenanceFilter(ExperimentSetup.AlgorithmOpOnly)
-
-    val res = pg.filter(nodeP = ProvenanceGraph.allNodes, edgeP = algOpFilter)
-
-    assert(res.graph.edges.count((e: ProvenanceGraph.Type#EdgeT) => {
-      algOpFilter(e.outer)
-    }) == 1)
-
     val joinVerticesFilter =
-      Benchmark.provenanceFilter(ExperimentSetup.JoinVerticesOpOnly)
+      Benchmark.provenanceFilter(ExperimentSetup.ProvenanceGraphPruning, GraphAlgorithm.WCC)
 
     val res2 =
       pg.filter(nodeP = ProvenanceGraph.allNodes, edgeP = joinVerticesFilter)

diff --git a/lib/src/test/scala/benchmark/ConfigFilesTest.scala b/lib/src/test/scala/benchmark/ConfigFilesTest.scala
@@ -101,7 +101,8 @@ class ConfigFilesTest extends AnyFunSuite {
       outputDir = outputDir,
       graphalyticsConfigPath = graphalyticsConfigPath,
       lineageDir = runnerConfig.runner.lineagePath,
-      setup = ExperimentSetup.Baseline
+      setup = ExperimentSetup.Baseline,
+      numExecutors = 7
     )
 
     println(BenchmarkAppConfig.write(config))

diff --git a/results/plots/das6/20240521-010312-baseline/duration-pagerank.pdf b/results/plots/das6/20240521-010312-baseline/duration-pagerank.pdf
diff --git a/results/plots/das6/20240521-010312-baseline/duration-sssp.pdf b/results/plots/das6/20240521-010312-baseline/duration-sssp.pdf
diff --git a/results/plots/das6/20240521-010312-baseline/duration-wcc.pdf b/results/plots/das6/20240521-010312-baseline/duration-wcc.pdf
diff --git a/results/plots/das6/20240521-022009-tracing/desc.csv b/results/plots/das6/20240521-022009-tracing/desc.csv
diff --git a/results/plots/das6/20240521-022009-tracing/overhead.pdf b/results/plots/das6/20240521-022009-tracing/overhead.pdf
diff --git a/results/plots/das6/20240521-034221-completeprovenance/duration-pagerank.pdf b/results/plots/das6/20240521-034221-completeprovenance/duration-pagerank.pdf
diff --git a/results/plots/das6/20240521-034221-completeprovenance/duration-sssp.pdf b/results/plots/das6/20240521-034221-completeprovenance/duration-sssp.pdf
diff --git a/results/plots/das6/20240521-034221-completeprovenance/duration-wcc.pdf b/results/plots/das6/20240521-034221-completeprovenance/duration-wcc.pdf
diff --git a/results/plots/das6/20240521-034221-completeprovenance/size-sssp.pdf b/results/plots/das6/20240521-034221-completeprovenance/size-sssp.pdf
diff --git a/results/plots/das6/20240521-034221-completeprovenance/size-wcc.pdf b/results/plots/das6/20240521-034221-completeprovenance/size-wcc.pdf
diff --git a/results/plots/das6/20240521-111351-combinedpruning/overhead-duration.pdf b/results/plots/das6/20240521-111351-combinedpruning/overhead-duration.pdf
diff --git a/results/plots/das6/20240521-111351-combinedpruning/overhead-size.pdf b/results/plots/das6/20240521-111351-combinedpruning/overhead-size.pdf
diff --git a/results/plots/das6/conclusion/factor.pdf b/results/plots/das6/conclusion/factor.pdf
diff --git a/results/plots/das6/final/csv/es01-duration.csv b/results/plots/das6/final/csv/es01-duration.csv
@@ -0,0 +1,22 @@
+algorithm,dataset,min,mean,max,std
+BFS,graph500-22,30.44,32.87,35.11,2.00
+BFS,datagen-7\_5-fb,33.20,34.32,36.05,1.20
+BFS,datagen-7\_9-fb,62.32,69.31,80.59,8.00
+BFS,cit-Patents,79.37,82.97,88.44,4.02
+BFS,datagen-8\_8-zf,184.12,218.72,247.36,26.50
+BFS,datagen-8\_4-fb,224.23,241.79,251.48,9.34
+PageRank,datagen-7\_5-fb,38.60,39.98,43.88,2.02
+PageRank,datagen-7\_9-fb,67.77,69.88,71.44,1.65
+PageRank,graph500-22,75.87,78.38,81.21,2.04
+PageRank,cit-Patents,76.20,85.10,88.41,4.45
+PageRank,datagen-8\_4-fb,205.04,215.87,227.36,7.12
+PageRank,datagen-8\_8-zf,223.81,245.95,258.35,11.81
+SSSP,datagen-7\_5-fb,34.57,38.12,45.16,3.77
+SSSP,datagen-7\_9-fb,60.92,76.50,94.05,14.17
+SSSP,datagen-8\_8-zf,162.00,209.25,248.77,30.70
+SSSP,datagen-8\_4-fb,234.95,255.83,264.24,11.53
+WCC,datagen-7\_5-fb,33.54,36.77,38.80,1.94
+WCC,datagen-7\_9-fb,62.89,66.34,72.11,3.28
+WCC,graph500-22,66.38,72.05,82.57,7.97
+WCC,cit-Patents,152.93,157.94,165.29,4.65
+WCC,datagen-8\_4-fb,230.89,239.02,243.93,5.38
diff --git a/results/plots/das6/final/csv/es01-size.csv b/results/plots/das6/final/csv/es01-size.csv
@@ -0,0 +1,7 @@
+dataset,size
+cit-Patents,280 MB
+datagen-7\_5-fb,1014 MB
+datagen-7\_9-fb,2 GB
+datagen-8\_4-fb,7 GB
+datagen-8\_8-zf,13 GB
+graph500-22,963 MB