From 3fb7437b68950303916b62984fa449732c70353e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Thu, 25 Jul 2024 14:22:03 +0200 Subject: [PATCH] Fix endless recursion problem Add check for calling function in the beginning of 'update.commit.interactions'. Also contains minor fixes to address PR comments and updates tests to reflect changes made in previous commit. Signed-off-by: Leo Sendelbach --- README.md | 7 +++ showcase.R | 4 +- tests/test-data.R | 10 +++-- tests/test-networks-commit.R | 3 ++ tests/test-read.R | 14 +++--- util-data.R | 12 ++--- util-networks-covariates.R | 5 +-- util-networks.R | 86 +++++++++++++++++++++--------------- 8 files changed, 84 insertions(+), 57 deletions(-) diff --git a/README.md b/README.md index 86b2671c..58b2c82e 100644 --- a/README.md +++ b/README.md @@ -234,6 +234,11 @@ There are four types of networks that can be built using this library: author ne * The vertices in an artifact network denote any kind of artifact, e.g., source-code artifact (such as features or files) or communication artifact (such as mail threads or issues). All artifact-type vertices are uniquely identifiable by their name. There are only unipartite edges among artifacts in this type of network. * The relations (i.e., the edges' meaning and source) can be configured using the [`NetworkConf`](#networkconf) attribute `artifact.relation`. The relation also describes which kinds of artifacts are represented as vertices in the network. (For example, if "mail" is selected as `artifact.relation`, only mail-thread vertices are included in the network.) +- Commit networks + * The vertices in a commit network denote any commits in the data. All vertices + are uniquely identifiable by the hash of the commit. There are only unipartite edges among commits in this type of network. + * The relations (i.e., the edges' meaning and source) can be configured using the [`NetworkConf`](#networkconf) attribute `commit.relation`.
The relation also describes the type of data used for network construction (`cochange` uses commit data, `commit.interaction` uses commit interaction data). + - Bipartite networks * The vertices in a bipartite network denote both authors and artifacts. There are only bipartite edges from authors to artifacts in this type of network. * The relations (i.e., the edges' meaning and source) can be configured using the [`NetworkConf`](#networkconf) attribute `artifact.relation`. @@ -249,6 +254,7 @@ Relations determine which information is used to construct edges among the verti - `cochange` * For author networks (configured via `author.relation` in the [`NetworkConf`](#networkconf)), authors who change the same source-code artifact are connected with an edge. * For artifact networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), source-code artifacts that are concurrently changed in the same commit are connected with an edge. + * For commit networks (configured via `commit.relation` in the [`NetworkConf`](#networkconf)), commits are connected if they change the same artifact. * For bipartite networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), authors get linked to all source-code artifacts they have changed in their respective commits. - `mail` @@ -269,6 +275,7 @@ Relations determine which information is used to construct edges among the verti - `commit.interaction` * For author networks (configured via `author.relation` in the [`NetworkConf`](#networkconf)), authors who contribute to interacting commits are connected with an edge. * For artifact networks (configured via `artifact.relation` in the [`NetworkConf`](#networkconf)), artifacts are connected when there is an interaction between two commits that occur in the artifacts. + * For commit networks (configured via `commit.relation` in the [`NetworkConf`](#networkconf)), commits are connected when they interact in the commit interaction data.
* This relation does not apply for bipartite networks. #### Edge-construction algorithms for author networks diff --git a/showcase.R b/showcase.R index 3d2aece7..4cb95d4a 100644 --- a/showcase.R +++ b/showcase.R @@ -239,8 +239,8 @@ sample.pull.requests = add.vertex.attribute.author.issue.count(my.networks, x.da ## add vertex attributes for the project-level network x.net.as.list = list("1970-01-01 00:00:00-2030-01-01 00:00:00" = x$get.author.network()) sample.entire = add.vertex.attribute.author.commit.count(x.net.as.list, x.data, aggregation.level = "complete") -## add vertex attributes to commit network -add.vertex.attribute.commit.network(x$get.commit.network(), x.data, "author.name", "NO_AUTHOR") +## add vertex attributes to commit network. Default value 'NO_AUTHOR' is used if vertex is not in commit data +add.vertex.attribute.commit.network(x$get.commit.network(), x.data, attr.name = "author.name", default.value = "NO_AUTHOR") ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / diff --git a/tests/test-data.R b/tests/test-data.R index 88ce0e42..c983946d 100644 --- a/tests/test-data.R +++ b/tests/test-data.R @@ -564,15 +564,15 @@ test_that("Compare two ProjectData Objects with commit.interactions", { proj.data.two$set.commits(create.empty.commits.list()) ## create empty data frame of correct size - commit.interactions.data.expected = data.frame(matrix(nrow = 4, ncol = 8)) + commit.interactions.data.expected = data.frame(matrix(nrow = 4, ncol = 9)) ## assure that the correct type is used - for(i in seq_len(8)) { + for(i in seq_len(9)) { commit.interactions.data.expected[[i]] = as.character(commit.interactions.data.expected[[i]]) } ## set everything except for authors as expected colnames(commit.interactions.data.expected) = c("commit.hash", "base.hash", "func", "file", - "base.func", "base.file", "base.author", - "interacting.author") + "base.func", "base.file","artifact.type", + "base.author", "interacting.author") 
commit.interactions.data.expected[["commit.hash"]] = c("0a1a5c523d835459c42f33e863623138555e2526", "418d1dc4929ad1df251d2aeb833dd45757b04a6f", @@ -588,6 +588,8 @@ test_that("Compare two ProjectData Objects with commit.interactions", { commit.interactions.data.expected[["base.func"]] = c("test2.c::test2", "test2.c::test2", "test3.c::test_function", "test2.c::test2") commit.interactions.data.expected[["base.file"]] = c("test2.c", "test2.c", "test3.c", "test2.c") + commit.interactions.data.expected[["artifact.type"]] = c("CommitInteraction", "CommitInteraction", + "CommitInteraction", "CommitInteraction") expect_equal(proj.data.two$get.commit.interactions(), commit.interactions.data.expected) diff --git a/tests/test-networks-commit.R b/tests/test-networks-commit.R index 8ddb87db..7de34eed 100644 --- a/tests/test-networks-commit.R +++ b/tests/test-networks-commit.R @@ -83,6 +83,9 @@ patrick::with_parameters_test_that("Network construction with commit-interaction ) network = igraph::graph.data.frame(edges, directed = test.directed, vertices = vertices) expect_true(igraph::identical_graphs(network.built, network)) + + network.new.attr = add.vertex.attribute.commit.network(network.built, proj.data, "deleted.lines", "NO_DATA") + expect_identical(igraph::V(network.new.attr)$deleted.lines, c("0", "0","0", "NO_DATA", "0", "NO_DATA")) }, patrick::cases( "directed: FALSE" = list(test.directed = FALSE), "directed: TRUE" = list(test.directed = TRUE) diff --git a/tests/test-read.R b/tests/test-read.R index c617e091..f01d16c1 100644 --- a/tests/test-read.R +++ b/tests/test-read.R @@ -505,15 +505,15 @@ test_that("Read the commit-interactions data.", { commit.interactions.data.read = read.commit.interactions(proj.conf$get.value("datapath")) ## build the expected data.frame - commit.interactions.data.expected = data.frame(matrix(nrow = 4, ncol = 8)) + commit.interactions.data.expected = data.frame(matrix(nrow = 4, ncol = 9)) ## assure that the correct type is used - for(i in 
seq_len(8)) { + for(i in seq_len(ncol(commit.interactions.data.expected))) { commit.interactions.data.expected[[i]] = as.character(commit.interactions.data.expected[[i]]) } ## set everything except for authors as expected colnames(commit.interactions.data.expected) = c("func", "commit.hash", "file", "base.hash", "base.func", "base.file", "base.author", - "interacting.author") + "interacting.author", "artifact.type") commit.interactions.data.expected[["commit.hash"]] = c("5a5ec9675e98187e1e92561e1888aa6f04faa338", "0a1a5c523d835459c42f33e863623138555e2526", @@ -529,6 +529,8 @@ test_that("Read the commit-interactions data.", { commit.interactions.data.expected[["base.func"]] = c("test3.c::test_function", "test2.c::test2", "test2.c::test2", "test2.c::test2") commit.interactions.data.expected[["base.file"]] = c("test3.c", "test2.c", "test2.c", "test2.c") + commit.interactions.data.expected[["artifact.type"]] = c("CommitInteraction", "CommitInteraction", + "CommitInteraction", "CommitInteraction") ## check the results expect_identical(commit.interactions.data.read, commit.interactions.data.expected, info = "commit interaction data.") @@ -543,11 +545,11 @@ test_that("Read the empty commit-interactions data.", { commit.interactions.data.read = read.commit.interactions("./codeface-data/results/testing/ test_empty_proximity/proximity") ## build the expected data.frame - commit.interactions.data.expected = data.frame(matrix(nrow = 0, ncol = 8)) + commit.interactions.data.expected = data.frame(matrix(nrow = 0, ncol = 9)) colnames(commit.interactions.data.expected) = c("func", "commit.hash", "file", "base.hash", "base.func", "base.file", - "base.author", "interacting.author") - for(i in seq_len(8)) { + "base.author", "interacting.author", "artifact.type") + for(i in seq_len(ncol(commit.interactions.data.expected))) { commit.interactions.data.expected[[i]] = as.character(commit.interactions.data.expected[[i]]) } ## check the results diff --git a/util-data.R b/util-data.R index 
8d68765f..7f2a971a 100644 --- a/util-data.R +++ b/util-data.R @@ -415,7 +415,10 @@ ProjectData = R6::R6Class("ProjectData", #' #' This method should be called whenever the field \code{commit.interactions} is changed. update.commit.interactions = function() { - if (self$is.data.source.cached("commit.interactions")) { + stacktrace = get.stacktrace(sys.calls()) + caller = get.second.last.element(stacktrace) + if (self$is.data.source.cached("commit.interactions") && + (is.na(caller)|| paste(caller, collapse = " ") != "self$set.commits(commit.data)")) { if (!self$is.data.source.cached("commits.unfiltered")) { self$get.commits() } @@ -2143,8 +2146,6 @@ ProjectData = R6::R6Class("ProjectData", return(mylist) }, - ## * * processed data ---------------------------------------------- - #' Group the commits of the given \code{data.source} by the given \code{group.column}. #' For each group, the column \code{"hash"} is duplicated and prepended to each #' group's data as first column (see below for details). 
@@ -2162,12 +2163,11 @@ ProjectData = R6::R6Class("ProjectData", #' as first column (with name \code{"data.vertices"}) #' #' @seealso ProjectData$group.data.by.column - group.commits.by.data.column = function(data.source = c("commits", "mails", "issues"), - group.column = "artifact") { + group.commits.by.data.column = function(group.column = "artifact") { logging::loginfo("Grouping commits by data column.") ## store the commits per group that is determined by 'group.column' - mylist = self$group.data.by.column(data.source, group.column, "hash") + mylist = self$group.data.by.column("commits", group.column, "hash") return(mylist) }, diff --git a/util-networks-covariates.R b/util-networks-covariates.R index 5709126a..700b5e9f 100644 --- a/util-networks-covariates.R +++ b/util-networks-covariates.R @@ -149,8 +149,8 @@ add.vertex.attribute = function(net.to.range.list, attr.name, default.value, com #' @param network the commit network #' @param project.data the project data from which to extract the values #' @param attr.name the name of the attribute -#' @param default.value the dafault value of the attribute -#' if it does not occur in the commit data +#' @param default.value the default value that is used if the current hash +#' is not contained in the commit data at all #' #' @return a network with new vertex attribute add.vertex.attribute.commit.network = function(network, project.data, @@ -174,7 +174,6 @@ add.vertex.attribute.commit.network = function(network, project.data, attribute.values = c(attribute.values, value) } net.with.attr = igraph::set.vertex.attribute(network, attr.name, value = attribute.values) - } diff --git a/util-networks.R b/util-networks.R index 352794ad..dd27f36f 100644 --- a/util-networks.R +++ b/util-networks.R @@ -123,8 +123,8 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", artifacts.network.callgraph = NULL, # igraph artifacts.network.mail = NULL, # igraph artifacts.network.issue = NULL, # igraph - commit.network.commit.interaction = 
NULL, #igraph - commit.network.cochange = NULL, #igraph + commits.network.commit.interaction = NULL, #igraph + commits.network.cochange = NULL, #igraph ## * * relation-to-vertex-kind mapping ----------------------------- @@ -248,7 +248,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", colnames(edges)[1] = "to" colnames(edges)[2] = "from" colnames(edges)[4] = "hash" - edges[["artifact.type"]] = "CommitInteraction" + if (nrow(edges) > 0) { + edges[["artifact.type"]] = "CommitInteraction" + } author.net.data = list(vertices = vertices, edges = edges) ## construct the network author.net = construct.network.from.edge.list( @@ -402,7 +404,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", edges = edges[, c("file", "base.file", "func", "commit.hash", "base.hash", "base.func", "base.author", "interacting.author")] - edges[["artifact.type"]] = "File" + if (nrow(edges) > 0) { + edges[["artifact.type"]] = ARTIFACT.CODEFACE[[proj.conf.artifact]] + } colnames(edges)[colnames(edges) == "commit.hash"] = "hash" } else if (proj.conf.artifact == "function") { ## change the vertices to the functions from the commit-interaction data @@ -412,7 +416,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", edges = edges[, c("func", "base.func", "commit.hash", "file", "base.hash", "base.file", "base.author", "interacting.author")] - edges[["artifact.type"]] = "Function" + if (nrow(edges) > 0) { + edges[["artifact.type"]] = ARTIFACT.CODEFACE[[proj.conf.artifact]] + } colnames(edges)[colnames(edges) == "commit.hash"] = "hash" } else { ## If neither 'function' nor 'file' was configured, send a warning @@ -693,9 +699,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", logging::logdebug("get.commit.network.commit.interaction: starting.") ## do not compute anything more than once - if (!is.null(private$commit.network.commit.interaction)) { + if (!is.null(private$commits.network.commit.interaction)) { logging::logdebug("get.commit.network.commit.interaction: finished. 
(already existing)") - return(private$commit.network.commit.interaction) + return(private$commits.network.commit.interaction) } ## get the hashes that appear in the commit-interaction data as the vertices of the network @@ -708,7 +714,9 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ## set the commits as the 'to' and 'from' of the network and order the dataframe edges = edges[, c("base.hash", "commit.hash", "func", "interacting.author", "file", "base.author", "base.func", "base.file")] - edges[["artifact.type"]] = "CommitInteraction" + if (nrow(edges) > 0) { + edges[["artifact.type"]] = "CommitInteraction" + } colnames(edges)[1] = "to" colnames(edges)[2] = "from" commit.net.data = list(vertices = vertices, edges = edges) @@ -722,13 +730,13 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", get.data.columns.for.data.source("commit.interactions") ) - private$commit.network.commit.interaction = commit.net + private$commits.network.commit.interaction = commit.net logging::logdebug("get.commit.network.commit.interaction: finished.") return(commit.net) }, - #' Get the co-change-based commit network, + #' Get the cochange-based commit network, #' If it does not already exist build it first. #' #' @return the commit network with cochange realtion @@ -737,13 +745,13 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", logging::logdebug("get.commit.network.cochange: starting.") ## do not compute anything more than once - if (!is.null(private$commit.network.cochange)) { + if (!is.null(private$commits.network.cochange)) { logging::logdebug("get.commit.network.cochange: finished. 
(already existing)") - return(private$commit.network.cochange) + return(private$commits.network.cochange) } ## construct edge list based on commit--artifact data - commit.net.data.raw = private$proj.data$group.commits.by.data.column("commits", "artifact") + commit.net.data.raw = private$proj.data$group.commits.by.data.column("artifact") commit.net.data = construct.edge.list.from.key.value.list( commit.net.data.raw, @@ -763,7 +771,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", ) ## store network - private$commit.network.cochange = commit.net + private$commits.network.cochange = commit.net logging::logdebug("get.commit.network.cochange: finished.") return(commit.net) @@ -843,8 +851,8 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", private$artifacts.network.cochange = NULL private$artifacts.network.issue = NULL private$artifacts.network.mail = NULL - private$commit.network.commit.interaction = NULL - private$commit.network.cochange = NULL + private$commits.network.commit.interaction = NULL + private$commits.network.cochange = NULL private$proj.data = private$proj.data.original if (private$network.conf$get.value("unify.date.ranges")) { private$cut.data.to.same.timestamps() @@ -1192,7 +1200,7 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", "bipartite.net" = bipartite.net, "authors.net" = authors.net, "artifacts.net" = artifacts.net, - "commit.net" = commit.net + "commits.net" = commit.net )) }, @@ -1322,7 +1330,8 @@ NetworkBuilder = R6::R6Class("NetworkBuilder", #' i.e., whether to only add edges from the later event to the previous one. #' If \code{NA} is passed, the default value is taken. 
#' [default: directed] -#' @param network.type the type of network for which the key value data is to be used as edges [default: "author"] +#' @param network.type the type of network for which the key value data is to be used as edges +#' (one out of "author", "artifact", or "commit")[default: "author"] #' #' @return a list of two data.frames named 'vertices' and 'edges' (compatible with return value #' of \code{igraph::as.data.frame}) @@ -1361,11 +1370,11 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed } } - ## if edges in a commit network contain 'date', 'hash' or 'file' attributes, remove them + ## if edges in a commit network contain 'hash' or 'file' attributes, remove them ## as they belong to commits, which are the vertices in commit networks if (network.type == "commit") { cols.which = which(edge.attributes %in% c("hash", "file")) - edge.attributes <- edge.attributes[-cols.which] + edge.attributes = edge.attributes[-cols.which] } if (respect.temporal.order) { @@ -1375,7 +1384,9 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed edge.attributes, keys, keys.number, network.type) edge.list = plyr::rbind.fill(edge.list.data) - vertices.processed = unlist( parallel::mclapply(edge.list.data, function(data) attr(data, "vertices.processed")) ) + vertices.processed = unlist(parallel::mclapply(edge.list.data, function(data) { + return(attr(data, "vertices.processed")) + })) } else { @@ -1384,28 +1395,31 @@ construct.edge.list.from.key.value.list = function(list, network.conf, directed edge.attributes, keys, keys.number) edge.list = plyr::rbind.fill(edge.list.data) - vertices.processed = unlist( parallel::mclapply(edge.list.data, function(data) attr(data, "vertices.processed")) ) + vertices.processed = unlist(parallel::mclapply(edge.list.data, function(data) { + return(attr(data, "vertices.processed")) + })) } logging::logdebug("construct.edge.list.from.key.value.list: finished.") if (network.type == 
"commit") { - vertices.dates.processed = unlist( parallel::mclapply(edge.list.data, - function(data) attr(data, "vertices.dates.processed")) ) + vertices.dates.processed = unlist(parallel::mclapply(edge.list.data, function(data) { + return (attr(data, "vertices.dates.processed")) + })) return(list( - vertices = data.frame( - name = unique(vertices.processed), - date = get.date.from.string(unique(vertices.dates.processed)) - ), - edges = edge.list + vertices = data.frame( + name = unique(vertices.processed), + date = get.date.from.string(unique(vertices.dates.processed)) + ), + edges = edge.list )) } else { return(list( - vertices = data.frame( - name = unique(vertices.processed) - ), - edges = edge.list + vertices = data.frame( + name = unique(vertices.processed) + ), + edges = edge.list )) } } @@ -1504,13 +1518,13 @@ construct.edges.temporal.order = function(set, network.conf, edge.attributes, ke construct.edges.no.temporal.order = function(set, network.conf, edge.attributes, keys, keys.number) { number.edges = sum(table(set[["data.vertices"]]) * (dim(table(set[["data.vertices"]])) - 1)) logging::logdebug("[%s/%s] Constructing edges for %s '%s': starting (%s edges to construct).", - match(attr(set, "group.name"), keys), keys.number, - attr(set, "group.type"), attr(set, "group.name"), number.edges) + match(attr(set, "group.name"), keys), keys.number, + attr(set, "group.type"), attr(set, "group.name"), number.edges) ## Skip artifacts with many, many edges if (number.edges > network.conf$get.value("skip.threshold")) { logging::logwarn("Skipping edges for %s '%s' due to amount (> %s).", - attr(set, "group.type"), attr(set, "group.name"), network.conf$get.value("skip.threshold")) + attr(set, "group.type"), attr(set, "group.name"), network.conf$get.value("skip.threshold")) return(NULL) }