Merge pull request #261 from MaLoefUDS/dev

Allow time-based data-splitting with multiple datasources as 'split.basis' Reviewed-by: Christian Hechtl <[email protected]> Reviewed-by: Thomas Bock <[email protected]>
se-sic · May 27, 2024 · a87ff24 · a87ff24
2 parents 4d35daa + 06d4d6a
commit a87ff24
Show file tree

Hide file tree

Showing 4 changed files with 103 additions and 17 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -10,6 +10,7 @@
 - Add commit-interaction networks that can be created with `create.author.network` and `create.artifact.network` if the `artifact.relation` and `author.relation` is configured to be `commit.interaction` (PR #252, d82857fbebd1111bb16588a4223bb24a8dcd07de, 329d97ec3de36a9e1bcadc0c7a53c1d92e8b481c) as well as tests for these features (PR #252, 07e7ed744209b0251217fa8f7f35d9b9875face2, 7068cfa10d993dcae3f5e3f76f8cafa99fa8b350)
 - Add helper function for prefixing function names with file names in `util-read.R` (PR #252, f8ea987b138173cf0509c7910e0572d8ee1b3f1f)
 - Add line-based code coverage reports into CI pipeline. Coverage reports are generated by `coverage.R` (PR #262, 10cac49d005e87c3964cc61711e7f5acef749626, b3b9f4ac7a9911bd00293c68fac88e0f9033bdfb, c815d18dc6266d620a7a145493417b87ac08679e, e8093525fdaf46e54f2f7fcc6358ca7892e795e5, 32d04823e2007c63d2a43ce59bea3057327c19a7)
+- Add the possibility to split data time-based by multiple data sources (PR #261, 1088395f46b84028c8d7c463ca86b5dc38500c26, e1f79fc9e40cd6f41c946be42db364b2101cfe10, 0bb187fec0fd801d7634bf8d5180525770f6ab0b, 371a97ac6ebf3de4fe9360dea79d62e2ed3ef585)
 
 ### Changed/Improved
 

diff --git a/showcase.R b/showcase.R
@@ -281,6 +281,7 @@ run.lapply(data, "get.data.path.callgraph")
 
 ## * Data-based splitting --------------------------------------------------
 
+## split time-based using commits as the data source to split by (with sliding windows)
 cf.data = split.data.time.based(x.data, time.period = "18 months", split.basis = "commits", sliding.window = TRUE)
 for (range in names(cf.data)) {
     y.data = cf.data[[range]]
@@ -289,6 +290,15 @@ for (range in names(cf.data)) {
 }
 print(run.lapply(cf.data, "get.class.name"))
 
+## split time-based using commits and issues as the data sources to split by (without sliding windows)
+cf.data = split.data.time.based(x.data, time.period = "18 month", split.basis = c("commits", "issues"))
+for (range in names(cf.data)) {
+    y.data = cf.data[[range]]
+    y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf)
+    plot.network(y$get.bipartite.network())
+}
+print(run.lapply(cf.data, "get.class.name"))
+
 mybins = c("2012-07-10 15:58:00", "2012-07-15 16:02:00", "2012-07-20 16:04:00", "2012-07-25 16:06:30")
 cf.data = split.data.time.based(x.data, bins = mybins)
 for (range in names(cf.data)) {

diff --git a/tests/test-split-data-time-based.R b/tests/test-split-data-time-based.R
@@ -770,6 +770,65 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis
     "pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
 ))
 
+
+##
+## Tests for split.data.time.based(..., split.basis = c('mails', 'issues')), with and without sliding windows
+##
+
+patrick::with_parameters_test_that("Split a data object time-based (split.basis = c('mails', 'issues'))", {
+
+    ## configuration objects
+    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
+    net.conf = NetworkConf$new()
+
+    ## data object
+    project.data = ProjectData$new(proj.conf)
+
+    # remove really old mail data
+    mail.data = project.data$get.mails()
+    mail.data = mail.data[-(1:12), ]
+    project.data$set.mails(mail.data)
+
+    # check mail date bounds
+    expect_equal(min(mail.data$date), as.POSIXct("2016-07-12 15:58:40"))
+    expect_equal(max(mail.data$date), as.POSIXct("2016-07-12 16:05:37"))
+
+    # keep issue data that roughly overlaps the mail data
+    issue.data = project.data$get.issues()
+    issue.data = issue.data[-(1:12), ]
+    issue.data = issue.data[-(8:12), ]
+    project.data$set.issues(issue.data)
+
+    # check issue date bounds
+    expect_equal(min(issue.data$date), as.POSIXct("2016-07-12 15:59:25"))
+    expect_equal(max(issue.data$date), as.POSIXct("2016-07-12 16:06:01"))
+
+    # split by 'mails' and 'issues'
+    results = split.data.time.based(project.data, time.period = "1 min",
+                                    split.basis = c("mails", "issues"), sliding.window = test.sliding.window)
+
+    # define bins for 'test.sliding.window' = TRUE
+    expected.bins = get.date.from.string(c("2016-07-12 15:58:40", "2016-07-12 15:59:10", "2016-07-12 15:59:40",
+                                           "2016-07-12 16:00:10", "2016-07-12 16:00:40", "2016-07-12 16:01:10",
+                                           "2016-07-12 16:01:40", "2016-07-12 16:02:10", "2016-07-12 16:02:40",
+                                           "2016-07-12 16:03:10", "2016-07-12 16:03:40", "2016-07-12 16:04:10",
+                                           "2016-07-12 16:04:40", "2016-07-12 16:05:10", "2016-07-12 16:05:40",
+                                           "2016-07-12 16:06:02"))
+
+    if (!test.sliding.window) {
+        # define bins for 'test.sliding.window' = FALSE
+        # remove every second sliding bin but the last one
+        expected.bins = expected.bins[c(seq(1, length(expected.bins), by = 2), length(expected.bins))]
+    }
+
+    expect_equal(attr(results, "bins"), expected.bins)
+
+}, patrick::cases(
+    "sliding.windows: FALSE" = list(test.sliding.window = FALSE),
+    "sliding.windows: TRUE" = list(test.sliding.window = TRUE)
+))
+
+
 ## * * bins ----------------------------------------------------------------
 
 ##

diff --git a/util-split.R b/util-split.R
@@ -52,7 +52,8 @@ requireNamespace("lubridate") # for date conversion
 #'                       time-sized windows for all ranges. If set, the \code{time.period} and \code{bins} parameters are ignored;
 #'                       consequently, \code{sliding.window} does not make sense then either.
 #'                       [default: NULL]
-#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
+#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', 'issues',
+#'                    or an arbitrary combination of them
 #'                    [default: "commits"]
 #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
 #'                       [default: FALSE]
@@ -65,6 +66,14 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
                                  number.windows = NULL, split.basis = c("commits", "mails", "issues"),
                                  sliding.window = FALSE, project.conf.new = NULL) {
 
+    # ensure 'split.basis' defaults to 'commits' if not defined
+    # and allow it to contain multiple data sources if explicitly wanted
+    if (!hasArg("split.basis")) {
+        split.basis = match.arg.or.default(split.basis, several.ok = FALSE, default = "commits")
+    } else {
+        split.basis = match.arg.or.default(split.basis, several.ok = TRUE)
+    }
+
     # validate existence and type of the 'bins' parameter
     if (!is.null(bins) && !lubridate::is.POSIXct(bins)) {
         dates = parallel::mclapply(unlist(bins), get.date.from.string)
@@ -89,7 +98,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
 #'             \code{bins}: Dates defining the start of bins (the last date defines the end of the last bin, in an
 #'             *exclusive* manner).
 #'             The expected format of \code{bins} is produced by \code{split.get.bins.activity.based}.
-#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
+#' @param split.basis the data source that was used to obtain \code{bins} from \code{split.get.bins.activity.based},
+#'                    either 'commits', 'mails', or 'issues'. \code{split.basis} is necessary to associate
+#'                    \code{bins$vector} with the correct data elements.
 #'                    [default: "commits"]
 #' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}.
 #'
@@ -99,6 +110,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
 split.data.by.bins = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"),
                                      sliding.window) {
 
+    ## get basis for splitting process
+    split.basis = match.arg(split.basis)
+
     # validate type of the 'bins' parameter
     if (is.null(bins) || !is.list(bins)) {
         logging::logerror("The bins parameter needs to be of type list, (is %s)", class(bins))
@@ -183,7 +197,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
                                      activity.amount = 5000, number.windows = NULL,
                                      sliding.window = FALSE, project.conf.new = NULL) {
 
-    ## get basis for splitting process
+    ## get activity type for splitting process
     activity.type = match.arg(activity.type)
 
     ## get actual raw data
@@ -195,13 +209,13 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
     })
     names(data) = data.sources
 
-    ## if the data used by the split basis is not present, load it automatically
+    ## if the data used by the splitting activity type is not present, load it automatically
     if (!(activity.type %in% project.data$get.cached.data.sources("only.unfiltered"))) {
         function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]
         project.data[[function.name]]()
     }
 
-    ## define ID columns for mails and commits
+    ## define ID columns for commits, mails, and issues
     id.column = list(
         commits = "hash",
         mails = "message.id",
@@ -252,7 +266,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
     logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).",
                      project.data$get.class.name(), activity.amount, activity.type, number.windows)
 
-    ## get bins based on 'split.basis'. Here the 'include.duplicate.ids' parameter flag must be set, to
+    ## get bins based on 'activity.type'. Here the 'include.duplicate.ids' parameter flag must be set, to
     ## retrieve bins which map every event to a bin including events with non-unique ids. This is important
     ## to ensure that every range really has 'activity.amount' many entries after splitting
     logging::logdebug("Getting activity-based bins.")
@@ -887,8 +901,8 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
 #' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based
 #' @param number.windows see \code{number.windows} from \code{split.data.time.based}
 #'                       [default: NULL]
-#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues'
-#'                    [default: "commits"]
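+#' @param split.basis the data source(s) to use as the basis for split bins, formatted either like the
+#'                    \code{split.basis} of \code{split.data.time.based} (one or more data sources) or like
+#'                    that of \code{split.data.by.bins} (exactly one data source).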
 #' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
 #'                       [default: FALSE]
 #' @param project.conf.new the new project config to construct the \code{RangeData} objects.
@@ -900,16 +914,16 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
 #' @seealso split.data.time.based
 #' @seealso split.data.by.bins
 split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time,
-                                      number.windows = NULL, split.basis = c("commits", "mails", "issues"),
-                                      sliding.window = FALSE, project.conf.new = NULL) {
-
-    ## get basis for splitting process
-    split.basis = match.arg(split.basis)
+                                      number.windows = NULL, split.basis, sliding.window = FALSE,
+                                      project.conf.new = NULL) {
 
     ## if the data used by the split basis is not present, load it automatically
-    if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) {
-        function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]]
-        project.data[[function.name]]()
+    for (i in seq_along(split.basis)) {
+        data.source = split.basis[i]
+        if (!(data.source %in% project.data$get.cached.data.sources("only.unfiltered"))) {
+            function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[data.source]]
+            project.data[[function.name]]()
+        }
     }
 
     ## get actual raw data
@@ -945,7 +959,9 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli
     ## if bins are NOT given explicitly
     if (is.null(bins)) {
         ## get bins based on split.basis
-        bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins
+        dates = project.data$get.data.timestamps(split.basis)
+        dates = get.date.from.unix.timestamp(unname(unlist(dates)))
+        bins = split.get.bins.time.based(dates, splitting.length, number.windows)[["bins"]]
         bins.labels = head(bins, -1)
         ## logging
         logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.",