Skip to content

Commit

Permalink
Merge pull request #261 from MaLoefUDS/dev
Browse files Browse the repository at this point in the history
Allow time-based data-splitting with multiple datasources as 'split.basis'

Reviewed-by: Christian Hechtl <[email protected]>
Reviewed-by: Thomas Bock <[email protected]>
  • Loading branch information
bockthom authored May 27, 2024
2 parents 4d35daa + 06d4d6a commit a87ff24
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 17 deletions.
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
- Add commit-interaction networks that can be created with `create.author.network` and `create.artifact.network` if the `artifact.relation` and `author.relation` is configured to be `commit.interaction` (PR #252, d82857fbebd1111bb16588a4223bb24a8dcd07de, 329d97ec3de36a9e1bcadc0c7a53c1d92e8b481c) as well as tests for these features (PR #252, 07e7ed744209b0251217fa8f7f35d9b9875face2, 7068cfa10d993dcae3f5e3f76f8cafa99fa8b350)
- Add helper function for prefixing function names with file names in `util-read.R` (PR #252, f8ea987b138173cf0509c7910e0572d8ee1b3f1f)
- Add line-based code coverage reports into CI pipeline. Coverage reports are generated by `coverage.R` (PR #262, 10cac49d005e87c3964cc61711e7f5acef749626, b3b9f4ac7a9911bd00293c68fac88e0f9033bdfb, c815d18dc6266d620a7a145493417b87ac08679e, e8093525fdaf46e54f2f7fcc6358ca7892e795e5, 32d04823e2007c63d2a43ce59bea3057327c19a7)
- Add the possibility to split data time-based by multiple data sources (PR #261, 1088395f46b84028c8d7c463ca86b5dc38500c26, e1f79fc9e40cd6f41c946be42db364b2101cfe10, 0bb187fec0fd801d7634bf8d5180525770f6ab0b, 371a97ac6ebf3de4fe9360dea79d62e2ed3ef585)

### Changed/Improved

Expand Down
10 changes: 10 additions & 0 deletions showcase.R
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,7 @@ run.lapply(data, "get.data.path.callgraph")

## * Data-based splitting --------------------------------------------------

## split time-based using commits as the data source to split by (with sliding windows)
cf.data = split.data.time.based(x.data, time.period = "18 months", split.basis = "commits", sliding.window = TRUE)
for (range in names(cf.data)) {
y.data = cf.data[[range]]
Expand All @@ -289,6 +290,15 @@ for (range in names(cf.data)) {
}
print(run.lapply(cf.data, "get.class.name"))

## split time-based using commits and issues as the data sources to split by (without sliding windows);
## passing several data sources as 'split.basis' derives the bins from the timestamps of all of them
cf.data = split.data.time.based(x.data, time.period = "18 months", split.basis = c("commits", "issues"))
## construct and plot the bipartite network for the data of every range
for (range in names(cf.data)) {
    y.data = cf.data[[range]]
    y = NetworkBuilder$new(project.data = y.data, network.conf = net.conf)
    plot.network(y$get.bipartite.network())
}
## run 'get.class.name' on the data object of each range and print the results
print(run.lapply(cf.data, "get.class.name"))

mybins = c("2012-07-10 15:58:00", "2012-07-15 16:02:00", "2012-07-20 16:04:00", "2012-07-25 16:06:30")
cf.data = split.data.time.based(x.data, bins = mybins)
for (range in names(cf.data)) {
Expand Down
59 changes: 59 additions & 0 deletions tests/test-split-data-time-based.R
Original file line number Diff line number Diff line change
Expand Up @@ -770,6 +770,65 @@ patrick::with_parameters_test_that("Split a data object time-based (split.basis
"pasta, synchronicity: TRUE" = list(test.pasta = TRUE, test.synchronicity = TRUE)
))


##
## Tests for split.data.time.based(..., split.basis = c('mails', 'issues')), with and without sliding windows
##

## Parameterized test: split a data object time-based using both 'mails' and 'issues' as 'split.basis'
## and verify the resulting bins. The test is run once without and once with sliding windows
## (see parameter 'test.sliding.window' in the cases at the bottom).
patrick::with_parameters_test_that("Split a data object time-based (split.basis = c('mails', 'issues'))", {

    ## configuration objects
    proj.conf = ProjectConf$new(CF.DATA, CF.SELECTION.PROCESS, CASESTUDY, ARTIFACT)
    net.conf = NetworkConf$new()

    ## data object
    project.data = ProjectData$new(proj.conf)

    ## remove really old mail data
    mail.data = project.data$get.mails()
    mail.data = mail.data[-(1:12), ]
    project.data$set.mails(mail.data)

    ## check mail date bounds
    expect_equal(min(mail.data$date), as.POSIXct("2016-07-12 15:58:40"))
    expect_equal(max(mail.data$date), as.POSIXct("2016-07-12 16:05:37"))

    ## keep issue data that roughly overlaps the mail data
    ## (the second removal indexes into the already reduced data frame)
    issue.data = project.data$get.issues()
    issue.data = issue.data[-(1:12), ]
    issue.data = issue.data[-(8:12), ]
    project.data$set.issues(issue.data)

    ## check issue date bounds
    expect_equal(min(issue.data$date), as.POSIXct("2016-07-12 15:59:25"))
    expect_equal(max(issue.data$date), as.POSIXct("2016-07-12 16:06:01"))

    ## split by 'mails' and 'issues'
    results = split.data.time.based(project.data, time.period = "1 min",
                                    split.basis = c("mails", "issues"), sliding.window = test.sliding.window)

    ## define bins for 'test.sliding.window' = TRUE:
    ## the first bin starts at the earliest mail date checked above, consecutive bounds are 30 seconds apart,
    ## and the last bound lies one second after the latest issue date checked above
    expected.bins = get.date.from.string(c("2016-07-12 15:58:40", "2016-07-12 15:59:10", "2016-07-12 15:59:40",
                                           "2016-07-12 16:00:10", "2016-07-12 16:00:40", "2016-07-12 16:01:10",
                                           "2016-07-12 16:01:40", "2016-07-12 16:02:10", "2016-07-12 16:02:40",
                                           "2016-07-12 16:03:10", "2016-07-12 16:03:40", "2016-07-12 16:04:10",
                                           "2016-07-12 16:04:40", "2016-07-12 16:05:10", "2016-07-12 16:05:40",
                                           "2016-07-12 16:06:02"))

    if (!test.sliding.window) {
        ## define bins for 'test.sliding.window' = FALSE:
        ## remove every second sliding bin but the last one, i.e., keep the bounds at odd positions
        ## (one minute apart) and append the final bound
        expected.bins = expected.bins[c(seq(1, length(expected.bins), by = 2), length(expected.bins))]
    }

    ## the bins used for splitting are attached to the result as attribute 'bins'
    expect_equal(attr(results, "bins"), expected.bins)

}, patrick::cases(
    "sliding.window: FALSE" = list(test.sliding.window = FALSE),
    "sliding.window: TRUE" = list(test.sliding.window = TRUE)
))


## * * bins ----------------------------------------------------------------

##
Expand Down
50 changes: 33 additions & 17 deletions util-split.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,8 @@ requireNamespace("lubridate") # for date conversion
#' time-sized windows for all ranges. If set, the \code{time.period} and \code{bins} parameters are ignored;
#' consequently, \code{sliding.window} does not make sense then either.
#' [default: NULL]
#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', 'issues',
#' or an arbitrary combination of them
#' [default: "commits"]
#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
#' [default: FALSE]
Expand All @@ -65,6 +66,14 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
number.windows = NULL, split.basis = c("commits", "mails", "issues"),
sliding.window = FALSE, project.conf.new = NULL) {

# ensure 'split.basis' defaults to 'commits' if not defined
# and allow it to contain multiple data sources if explicitly wanted
if (!hasArg("split.basis")) {
split.basis = match.arg.or.default(split.basis, several.ok = FALSE, default = "commits")
} else {
split.basis = match.arg.or.default(split.basis, several.ok = TRUE)
}

# validate existence and type of the 'bins' parameter
if (!is.null(bins) && !lubridate::is.POSIXct(bins)) {
dates = parallel::mclapply(unlist(bins), get.date.from.string)
Expand All @@ -89,7 +98,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
#' \code{bins}: Dates defining the start of bins (the last date defines the end of the last bin, in an
#' *exclusive* manner).
#' The expected format of \code{bins} is produced by \code{split.get.bins.activity.based}.
#' @param split.basis the data name to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' @param split.basis the data source that was used to obtain \code{bins} from \code{split.get.bins.activity.based},
#' either 'commits', 'mails', or 'issues'. \code{split.basis} is necessary to associate
#' \code{bins$vector} with the correct data elements.
#' [default: "commits"]
#' @param sliding.window logical indicating whether a sliding-window approach was used when obtaining the \code{bins}.
#'
Expand All @@ -99,6 +110,9 @@ split.data.time.based = function(project.data, time.period = "3 months", bins =
split.data.by.bins = function(project.data, activity.amount, bins, split.basis = c("commits", "mails", "issues"),
sliding.window) {

## get basis for splitting process
split.basis = match.arg(split.basis)

# validate type of the 'bins' parameter
if (is.null(bins) || !is.list(bins)) {
logging::logerror("The bins parameter needs to be of type list, (is %s)", class(bins))
Expand Down Expand Up @@ -183,7 +197,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
activity.amount = 5000, number.windows = NULL,
sliding.window = FALSE, project.conf.new = NULL) {

## get basis for splitting process
## get activity type for splitting process
activity.type = match.arg(activity.type)

## get actual raw data
Expand All @@ -195,13 +209,13 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
})
names(data) = data.sources

## if the data used by the split basis is not present, load it automatically
## if the data used by the splitting activity type is not present, load it automatically
if (!(activity.type %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[activity.type]]
project.data[[function.name]]()
}

## define ID columns for mails and commits
## define ID columns for commits, mails, and issues
id.column = list(
commits = "hash",
mails = "message.id",
Expand Down Expand Up @@ -252,7 +266,7 @@ split.data.activity.based = function(project.data, activity.type = c("commits",
logging::loginfo("Splitting data '%s' into activity ranges of %s %s (%s windows).",
project.data$get.class.name(), activity.amount, activity.type, number.windows)

## get bins based on 'split.basis'. Here the 'include.duplicate.ids' parameter flag must be set, to
## get bins based on 'activity.type'. Here the 'include.duplicate.ids' parameter flag must be set, to
## retrieve bins which map every event to a bin including events with non-unique ids. This is important
## to ensure that every range really has 'activity.amount' many entries after splitting
logging::logdebug("Getting activity-based bins.")
Expand Down Expand Up @@ -887,8 +901,8 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
#' @param split.by.time logical indicating whether splitting is done time-based or activity-bins-based
#' @param number.windows see \code{number.windows} from \code{split.data.time.based}
#' [default: NULL]
#' @param split.basis the data source to use as the basis for split bins, either 'commits', 'mails', or 'issues'
#' [default: "commits"]
#' @param split.basis either formatted as the \code{split.basis} from \code{split.data.time.based}
#' or from \code{split.data.by.bins}.
#' @param sliding.window logical indicating whether the splitting should be performed using a sliding-window approach
#' [default: FALSE]
#' @param project.conf.new the new project config to construct the \code{RangeData} objects.
Expand All @@ -900,16 +914,16 @@ split.network.by.bins = function(network, bins, bins.vector, bins.date = NULL, r
#' @seealso split.data.time.based
#' @seealso split.data.by.bins
split.data.by.time.or.bins = function(project.data, splitting.length, bins, split.by.time,
number.windows = NULL, split.basis = c("commits", "mails", "issues"),
sliding.window = FALSE, project.conf.new = NULL) {

## get basis for splitting process
split.basis = match.arg(split.basis)
number.windows = NULL, split.basis, sliding.window = FALSE,
project.conf.new = NULL) {

## if the data used by the split basis is not present, load it automatically
if (!(split.basis %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[split.basis]]
project.data[[function.name]]()
for (i in seq_along(split.basis)) {
data.source = split.basis[i]
if (!(data.source %in% project.data$get.cached.data.sources("only.unfiltered"))) {
function.name = DATASOURCE.TO.UNFILTERED.ARTIFACT.FUNCTION[[data.source]]
project.data[[function.name]]()
}
}

## get actual raw data
Expand Down Expand Up @@ -945,7 +959,9 @@ split.data.by.time.or.bins = function(project.data, splitting.length, bins, spli
## if bins are NOT given explicitly
if (is.null(bins)) {
## get bins based on split.basis
bins = split.get.bins.time.based(data[[split.basis]][["date"]], splitting.length, number.windows)$bins
dates = project.data$get.data.timestamps(split.basis)
dates = get.date.from.unix.timestamp(unname(unlist(dates)))
bins = split.get.bins.time.based(dates, splitting.length, number.windows)[["bins"]]
bins.labels = head(bins, -1)
## logging
logging::loginfo("Splitting data '%s' into time ranges of %s based on '%s' data.",
Expand Down

0 comments on commit a87ff24

Please sign in to comment.