From ed0f26565c33fc6eda5ea4ed4d8cb2f41fea55e7 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Thu, 4 Apr 2024 13:16:28 +0200 Subject: [PATCH] Changes to adress comments by @bockthom Fix style issues, modify README.md, add small test and add some comments for clarity Signed-off-by: Leo Sendelbach --- README.md | 5 ++ install.R | 4 +- tests/README.md | 1 + tests/test-data.R | 8 +++ tests/test-networks-artifact.R | 8 +-- tests/test-networks-author.R | 4 +- util-conf.R | 4 +- util-data.R | 119 ++++++++++++++++++--------------- util-networks.R | 14 ++-- util-read.R | 29 +++++--- 10 files changed, 116 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index 62c029b33..c39c35069 100644 --- a/README.md +++ b/README.md @@ -142,6 +142,8 @@ Alternatively, you can run `Rscript install.R` to install the packages. - `jsonlite`: For parsing the issue data - `rTensor`: For calculating EDCPTD centrality - `Matrix`: For sparse matrix representation of large adjacency matrices +- `fastmap`: For fast implementation of a map +- `purrr`: For fast implementtion of a mapping function ### Submodule @@ -597,6 +599,9 @@ There is no way to update the entries, except for the revision-based parameters. - `custom.event.timestamps.locked`: * Lock custom event timestamps to prevent them from being read if empty or not yet present when calling the getter. * [`TRUE`, *`FALSE`*] +- `commit.interactions`: + * Alloow construction of author and artifact networks using commit interaction data + * [`TRUE`, *`FALSE`*] ### NetworkConf diff --git a/install.R b/install.R index 99f047ccc..94d403d9c 100644 --- a/install.R +++ b/install.R @@ -44,7 +44,9 @@ packages = c( "viridis", "jsonlite", "rTensor", - "Matrix" + "Matrix", + "fastmap", + "purrr" ) diff --git a/tests/README.md b/tests/README.md index 6eb557919..cfe453fbf 100644 --- a/tests/README.md +++ b/tests/README.md @@ -16,6 +16,7 @@ We have two test projects you can use when writing your tests: * Commit messages * Pasta * Synchronicity + * Commit Interactions * Custom event timestamps in `custom-events.list` * Revisions 2. - Casestudy: `test_empty` diff --git a/tests/test-data.R b/tests/test-data.R index 9049e3ce8..984568468 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -99,6 +99,13 @@ test_that("Compare two ProjectData objects on empty data", { proj.data.two$set.project.conf.entry("commit.messages", "message") proj.data.two$get.commit.messages() expect_true(proj.data.one$equals(proj.data.two), "Two identical ProjectData objects (commit.messages).") + + proj.data.one$set.project.conf.entry("commit.interactions", TRUE) + proj.data.one$get.commit.interactions() + expect_false(proj.data.one$equals(proj.data.two), "Two non-identical ProjectData objects (commit.interactions).") + proj.data.two$set.project.conf.entry("commit.interactions", TRUE) + proj.data.two$get.commit.interactions() + expect_true(proj.data.one$equals(proj.data.two), "Two identical ProjectData objects (commit.interactions).") }) test_that("Compare two ProjectData objects on non-empty data", { @@ -540,6 +547,7 @@ test_that("Compare two ProjectData Objects with commit.interactions", { commit.data[["hash"]][[5]] = 1 proj.data.one$set.commits(commit.data) + ## use isTRUE to compress result of all.equal into a single boolean expect_false(isTRUE(all.equal(proj.data.one$get.commit.interactions(), proj.data.two$get.commit.interactions()))) diff --git a/tests/test-networks-artifact.R b/tests/test-networks-artifact.R index 13fad5f1b..e52dd973f 100644 --- a/tests/test-networks-artifact.R +++ b/tests/test-networks-artifact.R @@ -222,7 +222,7 @@ test_that("Network construction with commit-interactions as relation, artifact t proj.data = ProjectData$new(project.conf = proj.conf) net.conf = NetworkConf$new() - net.conf$update.value("artifact.relation", "interaction") + net.conf$update.value("artifact.relation", "commit.interaction") network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) network.built = network.builder$get.artifact.network() @@ -249,7 +249,7 @@ test_that("Network construction with commit-interactions as relation, artifact t interacting.author = c("Thomas", "Karl", "Olaf", "Thomas"), weight = c(1, 1, 1, 1), type = c(TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA), - relation = c("interaction", "interaction", "interaction", "interaction") + relation = c("commit.interaction", "commit.interaction", "commit.interaction", "commit.interaction") ) network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) @@ -265,7 +265,7 @@ test_that("Network construction with commit-interactions as relation, artifact t proj.data = ProjectData$new(project.conf = proj.conf) net.conf = NetworkConf$new() - net.conf$update.value("artifact.relation", "interaction") + net.conf$update.value("artifact.relation", "commit.interaction") network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) network.built = network.builder$get.artifact.network() @@ -292,7 +292,7 @@ test_that("Network construction with commit-interactions as relation, artifact t interacting.author = c("Thomas", "Karl", "Olaf", "Thomas"), weight = c(1, 1, 1, 1), type = c(TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA), - relation = c("interaction", "interaction", "interaction", "interaction") + relation = c("commit.interaction", "commit.interaction", "commit.interaction", "commit.interaction") ) network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) diff --git a/tests/test-networks-author.R b/tests/test-networks-author.R index cabb598e5..4f580ef29 100644 --- a/tests/test-networks-author.R +++ b/tests/test-networks-author.R @@ -688,7 +688,7 @@ test_that("Network construction with commit-interactions as relation", { proj.data = ProjectData$new(project.conf = proj.conf) net.conf = NetworkConf$new() - net.conf$update.value("author.relation", "interaction") + net.conf$update.value("author.relation", "commit.interaction") network.builder = NetworkBuilder$new(project.data = proj.data, network.conf = net.conf) network.built = network.builder$get.author.network() @@ -716,7 +716,7 @@ test_that("Network construction with commit-interactions as relation", { base.file = c("test2.c", "test2.c", "test3.c", "test2.c"), weight = c(1, 1, 1, 1), type = c(TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA, TYPE.EDGES.INTRA), - relation = c("interaction", "interaction", "interaction", "interaction") + relation = c("commit.interaction", "commit.interaction", "commit.interaction", "commit.interaction") ) network = igraph::graph.data.frame(edges, directed = FALSE, vertices = vertices) diff --git a/util-conf.R b/util-conf.R index 434fbf963..64a9ed799 100644 --- a/util-conf.R +++ b/util-conf.R @@ -790,7 +790,7 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, author.relation = list( default = "mail", type = "character", - allowed = c("mail", "cochange", "issue", "interaction"), + allowed = c("mail", "cochange", "issue", "commit.interaction"), allowed.number = Inf ), author.directed = list( @@ -821,7 +821,7 @@ NetworkConf = R6::R6Class("NetworkConf", inherit = Conf, artifact.relation = list( default = "cochange", type = "character", - allowed = c("cochange", "callgraph", "mail", "issue", "interaction"), + allowed = c("cochange", "callgraph", "mail", "issue", "commit.interaction"), allowed.number = Inf ), artifact.directed = list( diff --git a/util-data.R b/util-data.R index 32fa890ae..c1e46e454 100644 --- a/util-data.R +++ b/util-data.R @@ -78,8 +78,8 @@ DATASOURCE.TO.ADDITIONAL.ARTIFACT.FUNCTION = list( "synchronicity" = "get.synchronicity", "pasta" = "get.pasta", "gender" = "get.gender", - "custom.event.timestamps" = "get.custom.event.timestamps", - "commit.interactions" = "get.commit.interactions" + "commit.interactions" = "get.commit.interactions", + "custom.event.timestamps" = "get.custom.event.timestamps" ) #' Applies a function to list keys @@ -125,7 +125,8 @@ CONF.PARAMETERS.NO.RESET.ENVIRONMENT = c("commit.messages", "issues.locked", "mails.locked", "custom.event.timestamps", - "custom.event.timestamps.locked") + "custom.event.timestamps.locked", + "commit.interactions") ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / @@ -164,7 +165,7 @@ ProjectData = R6::R6Class("ProjectData", commits = create.empty.commits.list(), # data.frame commits.unfiltered = create.empty.commits.list(), # data.frame commit.messages = create.empty.commit.message.list(), # data.frame - commit.interactions = create.empty.commit.interaction.list(), + commit.interactions = create.empty.commit.interaction.list(), # data.frame ## mails mails.unfiltered = create.empty.mails.list(), # data.frame mails = create.empty.mails.list(), # data.frame @@ -414,46 +415,49 @@ ProjectData = R6::R6Class("ProjectData", #' #' This method should be called whenever the field \code{commit.interactions} is changed. update.commit.interactions = function() { - if (!self$is.data.source.cached("commits.unfiltered")) { - self$get.commits() - } + if (self$is.data.source.cached("commit.interactions")) { + if (!self$is.data.source.cached("commits.unfiltered")) { + self$get.commits() + } - ## remove existing columns named 'base.author' and 'interaction.author' - indices.to.remove = which("base.author" == colnames(private$commit.interactions)) - if (length(indices.to.remove) > 0) { - private$commit.interactions = private$commit.interactions[, -indices.to.remove] - } - indices.to.remove = which("interacting.author" == colnames(private$commit.interactions)) - if (length(indices.to.remove) > 0) { - private$commit.interactions = private$commit.interactions[, -indices.to.remove] - } + ## remove existing columns named 'base.author' and 'interaction.author' + indices.to.remove = which("base.author" == colnames(private$commit.interactions)) + if (length(indices.to.remove) > 0) { + private$commit.interactions = private$commit.interactions[, -indices.to.remove] + } + indices.to.remove = which("interacting.author" == colnames(private$commit.interactions)) + if (length(indices.to.remove) > 0) { + private$commit.interactions = private$commit.interactions[, -indices.to.remove] + } - ## get relevant data from commits - commit.data.subset = data.frame(hash = private$commits.unfiltered[["hash"]], - author.name = private$commits.unfiltered[["author.name"]]) - commit.data.subset = commit.data.subset[!duplicated(commit.data.subset[["hash"]]),] + ## get relevant data from commits + commit.data.subset = data.frame(hash = private$commits.unfiltered[["hash"]], + author.name = private$commits.unfiltered[["author.name"]]) + commit.data.subset = commit.data.subset[!duplicated(commit.data.subset[["hash"]]),] - ## merge commit interactions with commits and change colnames to avoid duplicates - commit.interaction.data = merge(private$commit.interactions, commit.data.subset, - by.x = "base.hash", by.y = "hash", all.x = TRUE) + ## merge commit interactions with commits and change colnames to avoid duplicates + commit.interaction.data = merge(private$commit.interactions, commit.data.subset, + by.x = "base.hash", by.y = "hash", all.x = TRUE) - author.index = match("author.name", colnames(commit.interaction.data)) - colnames(commit.interaction.data)[[author.index]] = "base.author" + author.index = match("author.name", colnames(commit.interaction.data)) + colnames(commit.interaction.data)[[author.index]] = "base.author" - commit.interaction.data = merge(commit.interaction.data, commit.data.subset, - by.x = "commit.hash", by.y = "hash", all.x = TRUE) + commit.interaction.data = merge(commit.interaction.data, commit.data.subset, + by.x = "commit.hash", by.y = "hash", all.x = TRUE) - author.index = match("author.name", colnames(commit.interaction.data)) - colnames(commit.interaction.data)[[author.index]] = "interacting.author" + author.index = match("author.name", colnames(commit.interaction.data)) + colnames(commit.interaction.data)[[author.index]] = "interacting.author" - ## warning if we have interactions without authors - if (anyNA(commit.interaction.data[["base.author"]]) || - anyNA(commit.interaction.data[["interacting.author"]])) { - logging::logwarn("There are authors in the commit-interactions that are not in the commit data! - This results in the commit-interactions having empty entries. - To clean up these entries, call cleanup.commit.interactions.") + ## warning if we have interactions without authors + if (anyNA(commit.interaction.data[["base.author"]]) || + anyNA(commit.interaction.data[["interacting.author"]])) { + logging::logwarn("There are commits in the commit-interactions that are not in + the commit data, possibly due to incomplete commit data or deleted users. + This results in the commit-interactions having empty entries. + To clean up these entries, call cleanup.commit.interactions.") + } + private$commit.interactions = commit.interaction.data } - private$commit.interactions = commit.interaction.data }, ## * * Gender data -------------------------------------------------- @@ -856,6 +860,7 @@ ProjectData = R6::R6Class("ProjectData", private$pasta.commits = create.empty.pasta.list() private$gender = create.empty.gender.list() private$synchronicity = create.empty.synchronicity.list() + private$commit.interactions = create.empty.commit.interaction.list() }, ## * * configuration ----------------------------------------------- @@ -1256,19 +1261,26 @@ ProjectData = R6::R6Class("ProjectData", get.commit.interactions = function(data.path = NULL) { logging::loginfo("Getting commit interactions.") - ## if the commit-interaction data have not yet been read do this - if (!self$is.data.source.cached("commit.interactions")) { - if (is.null(data.path)) { - commit.interaction.data = read.commit.interactions(self$get.data.path()) - } else { - commit.interaction.data = read.commit.interactions(data.path) - } + ## if commit-interaction data are to be read, do this + if (private$project.conf$get.value("commit.interactions")) { + ## if the commit-interaction data have not yet been read do this + if (!self$is.data.source.cached("commit.interactions")) { + if (is.null(data.path)) { + commit.interaction.data = read.commit.interactions(self$get.data.path()) + } else { + commit.interaction.data = read.commit.interactions(data.path) + } - ## cache the result - private$commit.interactions = commit.interaction.data - private$update.commit.interactions() + ## cache the result + private$commit.interactions = commit.interaction.data + private$update.commit.interactions() + } + } else { + logging::logwarn("You have not set the ProjectConf parameter + 'commit.interactions' to 'TRUE'! Ignoring...") + ## mark commit-interaction data as empty + private$commit.interactions = NULL } - return(private$commit.interactions) }, @@ -1289,9 +1301,10 @@ ProjectData = R6::R6Class("ProjectData", private$commit.interactions = data }, - #' Remove lines in the commit-interactions data that do not contain authors. - #' This should only be called AFTER 'update.commit.interactions' has already been called, as otherwise - #' all commit-interactions data will be removed + #' Remove lines in the commit-interaction data for which the corresponding commit is missing in the + #' commit data, indicated by a missing author in the commit-interaction data. + #' This should only be called AFTER \code{update.commit.interactions} has already been called, as otherwise + #' all commit-interactions data will be removed. cleanup.commit.interactions = function() { logging::loginfo("Cleaning up commit-interactions") @@ -1877,8 +1890,8 @@ ProjectData = R6::R6Class("ProjectData", "commit.messages" = "commit.messages", "synchronicity" = "synchronicity", "pasta" = "pasta", - "custom.event.timestamps" = "custom.event.timestamps", - "commit.interactions" = "commit.interactions" + "commit.interactions" = "commit.interactions", + "custom.event.timestamps" = "custom.event.timestamps" ) ) sources = self$get.cached.data.sources.internal(source.type) @@ -1910,7 +1923,7 @@ ProjectData = R6::R6Class("ProjectData", ## define the data sources unfiltered.data.sources = c("commits.unfiltered", "mails.unfiltered", "issues.unfiltered") additional.data.sources = c("authors", "commit.messages", "synchronicity", "pasta", - "gender", "custom.event.timestamps", "commit.interactions") + "gender", "commit.interactions", "custom.event.timestamps") main.data.sources = c("issues", "commits", "mails") ## set the right data sources to look for according to the argument diff --git a/util-networks.R b/util-networks.R index 0500d3fd6..00d339ea6 100644 --- a/util-networks.R +++ b/util-networks.R @@ -138,7 +138,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", callgraph = private$proj.data$get.project.conf.entry("artifact.codeface"), mail = "MailThread", issue = "Issue", - interaction = private$proj.data$get.project.conf.entry("artifact.codeface") + commit.interaction = private$proj.data$get.project.conf.entry("artifact.codeface") ) return(vertex.kind) @@ -377,7 +377,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", return(artifacts.net) }, - #' Build and get the the commit-interaction based artifact network. + #' Build and get the commit-interaction based artifact network. #' #' @return the commit-interaction based artifact network get.artifact.network.commit.interaction = function() { @@ -397,7 +397,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", edges = edges[, c("file", "base.file", "func", "commit.hash", "base.hash", "base.func", "base.author", "interacting.author")] - colnames(edges)[4] = "hash" + colnames(edges)[colnames(edges)=="commit.hash"] = "hash" } else if (proj.conf.artifact == "function") { ## change the vertices to the functions from the commit-interaction data vertices = unique(c(private$proj.data$get.commit.interactions()[["base.func"]], @@ -406,12 +406,12 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", edges = edges[, c("func", "base.func", "commit.hash", "file", "base.hash", "base.file", "base.author", "interacting.author")] - colnames(edges)[3] = "hash" + colnames(edges)[colnames(edges)=="commit.hash"] = "hash" } else { ## If neither 'function' nor 'file' was configured, send a warning ## and return an empty network logging::logwarn("when creating a commit-interaction artifact network, - the artifact relation should be either 'file' or 'function'!") + the artifact should be either 'file' or 'function'!") return(create.empty.network(directed = private$network.conf$get.value("artifact.directed"))) } colnames(edges)[1] = "to" @@ -826,7 +826,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", network = switch( relation, cochange = private$get.author.network.cochange(), - interaction = private$get.author.network.commit.interaction(), + commit.interaction = private$get.author.network.commit.interaction(), mail = private$get.author.network.mail(), issue = private$get.author.network.issue(), stop(sprintf("The author relation '%s' does not exist.", rel)) @@ -888,7 +888,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", callgraph = private$get.artifact.network.callgraph(), mail = private$get.artifact.network.mail(), issue = private$get.artifact.network.issue(), - interaction = private$get.artifact.network.commit.interaction(), + commit.interaction = private$get.artifact.network.commit.interaction(), stop(sprintf("The artifact relation '%s' does not exist.", relation)) ) diff --git a/util-read.R b/util-read.R index c83cdadf8..8af4f243b 100644 --- a/util-read.R +++ b/util-read.R @@ -44,6 +44,8 @@ requireNamespace("digest") # for sha1 hashing of IDs requireNamespace("sqldf") # for SQL-selections on data.frames requireNamespace("data.table") # for faster data.frame processing requireNamespace("yaml") # for reading commit interaction data +requireNamespace("fastmap") # for fast implementation of a map +requireNamespace("purrr") # for fast mapping function ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Helper functions -------------------------------------------------------- @@ -866,7 +868,6 @@ COMMIT.INTERACTION.LIST.DATA.TYPES = c( #' commit (hash) gets mapped to all commits it interacts with and the file/function because of #' which they interact. #' -#' #' @param data.path the path to the commit-interaction data #' #' @return the read and parsed commit-interaction data @@ -888,27 +889,31 @@ read.commit.interactions = function(data.path = NULL) { } ## extract the top level list of the yaml file which is called 'result-map' - result.map = commit.interaction.base$`result-map` + result.map = commit.interaction.base[["result-map"]] ## extract a mapping of functions to files to be able to determine what file the current interaction is ## based on + ## 1) create an empty map file.name.map = fastmap::fastmap() + ## 2) create a mapping between functions and files as a list function.file.list = purrr::map(result.map, 2) + ## 3) set the map using the list file.name.map$mset(.list = function.file.list) list.names = names(result.map) ## build the result dataframe by iterating over the 'result-map' list - commit.interaction.data = data.table::setDF(data.table::rbindlist(parallel::mcmapply(result.map, list.names, - SIMPLIFY = FALSE, - FUN = function(current.interaction, - function.name) { + commit.interaction.data = data.table::setDF(data.table::rbindlist( + parallel::mcmapply(result.map, + list.names, + SIMPLIFY = FALSE, + FUN = function(current.interaction, function.name) { ## get all commits that interact with the current one insts = current.interaction[[4]] interactions = data.table::setDF(data.table::rbindlist(lapply(insts, function(current.inst) { - base.hash = current.inst[[1]]$`commit` + base.hash = current.inst[[1]][["commit"]] interacting.hashes = current.inst[[2]] interacting.hashes.df = data.table::setDF(data.table::rbindlist(lapply(interacting.hashes, function(hash) { - ## if there is no function name in the current interaction we set the function name to 'GLOBAL' + ## if there is no function name in the current interaction, we set the function name to 'GLOBAL' ## as this is most likely code outside of functions, else we set the function name if (!"function" %in% names(hash)) { return(data.frame(func = "GLOBAL", commit.hash = hash[["commit"]], file = "GLOBAL")) @@ -921,11 +926,13 @@ read.commit.interactions = function(data.path = NULL) { file = file.name.map$get(hash[["function"]]))) } }))) - interacting.hashes.df$base.hash = base.hash - interacting.hashes.df$base.func = function.name - interacting.hashes.df$base.file = file.name.map$get(function.name) + interacting.hashes.df[["base.hash"]] = base.hash + interacting.hashes.df[["base.func"]] = function.name + interacting.hashes.df[["base.file"]] = file.name.map$get(function.name) return(interacting.hashes.df) }))) + ## Initialize author data as 'NA', since it is not available from the commit-interaction data. + ## Author data will be merged from commit data in \code{update.commit.interactions}. interactions["base.author"] = NA_character_ interactions["interacting.author"] = NA_character_ return(interactions)