diff --git a/NEWS.md b/NEWS.md index d4e457bc..c193e10a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,7 @@ - Add commit network as a new type of network. It uses commits as vertices and connects them either via cochange or commit interactions. This includes adding new config parameters and the function `add.vertex.attribute.commit.network` for adding vertex attributes to a commit network (PR #263, ab73271781e8e9a0715f784936df4b371d64c338, ab73271781e8e9a0715f784936df4b371d64c338, cd9a930fcb54ff465c2a5a7c43cfe82ac15c134d) - Add `remove.duplicate.edges` function that takes a network as input and conflates identical edges (PR #268, d9a4be417b340812b744f59398ba6460ba527e1c, 0c2f47c4fea6f5f2f582c0259f8cf23af985058a, c6e90dd9cb462232563f753f414da14a24b392a3) - Add `cumulative` as an argument to `construct.ranges` which enables the creation of cumulative ranges from given revisions (PR #268, a135f6bb6f83ccb03ae27c735c2700fccc1ee0c8, 8ec207f1e306ef6a641fb0205a9982fa89c7e0d9) +- Add four new metrics which can be used for the classification of authors into core and peripheral: Betweenness, Closeness, Pagerank and Eccentricity (PR #276, 65d5c9cc86708777ef458b0c2e744ab4b846bdd1, b392d1a125d0f306b4bce8d95032162a328a3ce2, c5d37d40024e32ad5778fa5971a45bc08f7631e0) ### Changed/Improved diff --git a/tests/test-core-peripheral.R b/tests/test-core-peripheral.R index e719d651..4d8359bd 100644 --- a/tests/test-core-peripheral.R +++ b/tests/test-core-peripheral.R @@ -18,6 +18,7 @@ ## Copyright 2019 by Christian Hechtl ## Copyright 2021 by Christian Hechtl ## Copyright 2023-2024 by Maximilian Löffler +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. 
@@ -105,6 +106,74 @@ test_that("Eigenvector classification", { expect_equal(expected, result, tolerance = 0.0001) }) +test_that("Betweenness classification", { + + ## Act + result = get.author.class.network.betweenness(network) + + ## Assert + expected.core = data.frame(author.name = c("Olaf"), + betweenness.centrality = c(1)) + expected.peripheral = data.frame(author.name = c("Björn", "udo", "Thomas", "Fritz fritz@example.org", + "georg", "Hans"), + betweenness.centrality = c(0, 0, 0, 0, 0, 0)) + expected = list(core = expected.core, peripheral = expected.peripheral) + row.names(result[["core"]]) = NULL + row.names(result[["peripheral"]]) = NULL + expect_equal(expected, result) +}) + +test_that("Closeness classification", { + + ## Act + result = get.author.class.network.closeness(network) + + ## Assert + expected.core = data.frame(author.name = c("Olaf"), + closeness.centrality = c(0.5)) + expected.peripheral = data.frame(author.name = c("Björn", "Thomas", "udo", "Fritz fritz@example.org", + "georg", "Hans"), + closeness.centrality = c(0.33333, 0.33333, 0.0, 0.0, 0.0, 0.0)) + expected = list(core = expected.core, peripheral = expected.peripheral) + row.names(result[["core"]]) = NULL + row.names(result[["peripheral"]]) = NULL + expect_equal(expected, result, tolerance = 0.0001) +}) + +test_that("Pagerank classification", { + + ## Act + result = get.author.class.network.pagerank(network) + + ## Assert + expected.core = data.frame(author.name = c("Olaf"), + pagerank.centrality = c(0.40541)) + expected.peripheral = data.frame(author.name = c("Björn", "Thomas", "udo", "Fritz fritz@example.org", + "georg", "Hans"), + pagerank.centrality = c(0.21396, 0.21396, 0.041667, 0.041667, 0.041667, 0.041667)) + expected = list(core = expected.core, peripheral = expected.peripheral) + row.names(result[["core"]]) = NULL + row.names(result[["peripheral"]]) = NULL + expect_equal(expected, result, tolerance = 0.0001) +}) + +test_that("Eccentricity classification", { + + ## Act + result 
= get.author.class.network.eccentricity(network) + + ## Assert + expected.core = data.frame(author.name = c("Olaf"), + eccentricity = c(1)) + expected.peripheral = data.frame(author.name = c("Björn", "udo", "Thomas", "Fritz fritz@example.org", + "georg", "Hans"), + eccentricity = c(0, 0, 0, 0, 0, 0)) + expected = list(core = expected.core, peripheral = expected.peripheral) + row.names(result[["core"]]) = NULL + row.names(result[["peripheral"]]) = NULL + expect_equal(expected, result) +}) + # TODO: Add a test for hierarchy classification test_that("Commit-count classification using 'result.limit'" , { diff --git a/util-core-peripheral.R b/util-core-peripheral.R index ef5a7997..a6eda367 100644 --- a/util-core-peripheral.R +++ b/util-core-peripheral.R @@ -22,6 +22,7 @@ ## Copyright 2019 by Thomas Bock ## Copyright 2019 by Jakob Kronawitter ## Copyright 2021 by Johannes Hostert +## Copyright 2024 by Leo Sendelbach ## All Rights Reserved. ## ## This file is derived from following Codeface script: @@ -59,6 +60,10 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list( "network.degree" = "network", "network.eigen" = "network", "network.hierarchy" = "network", + "network.betweenness" = "network", + "network.closeness" = "network", + "network.pagerank" = "network", + "network.eccentricity" = "network", "commit.count" = "count", "loc.count" = "count", "mail.count" = "count", @@ -96,7 +101,11 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list( #' Network-based options/metrics (parameter \code{network} has to be specified): #' - "network.degree" #' - "network.eigen" -#' - "network.hierarchy" +#' - "network.hierarchy" +#' - "network.betweenness" +#' - "network.closeness" +#' - "network.pagerank" +#' - "network.eccentricity" #' Count-based options/metrics (parameter \code{proj.data} has to be specified): #' - "commit.count" #' - "loc.count" @@ -126,7 +131,8 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list( #' first column and their centrality values in the second column. 
get.author.class.by.type = function(network = NULL, proj.data = NULL, - type = c("network.degree", "network.eigen", "network.hierarchy", + type = c("network.degree", "network.eigen", "network.hierarchy", "network.betweenness", + "network.closeness", "network.pagerank", "network.eccentricity", "commit.count", "loc.count", "mail.count", "mail.thread.count", "issue.count", "issue.comment.count", "issue.commented.in.count", "issue.created.count"), @@ -144,6 +150,10 @@ get.author.class.by.type = function(network = NULL, "network.degree" = "vertex.degree", "network.eigen" = "eigen.centrality", "network.hierarchy" = "hierarchy", + "network.betweenness" = "betweenness.centrality", + "network.closeness" = "closeness.centrality", + "network.pagerank" = "pagerank.centrality", + "network.eccentricity" = "eccentricity", "commit.count" = "commit.count", "loc.count" = "loc.count", "mail.count" = "mail.count", @@ -231,6 +241,30 @@ get.author.class.by.type = function(network = NULL, ## Construct centrality dataframe centrality.dataframe = data.frame(author.name = row.names(hierarchy.base.df), centrality = hierarchy.calculated) + } else if (type == "network.betweenness") { + betweenness.centrality.vec = igraph::betweenness(network, directed = TRUE) + ## Construct centrality dataframe + centrality.dataframe = data.frame(author.name = names(betweenness.centrality.vec), + centrality = as.vector(betweenness.centrality.vec)) + } else if (type == "network.closeness") { + closeness.centrality.vec = igraph::closeness(network) + ## Construct centrality dataframe + centrality.dataframe = data.frame(author.name = names(closeness.centrality.vec), + centrality = as.vector(closeness.centrality.vec)) + } else if (type == "network.pagerank") { + pagerank.centrality.vec = igraph::page_rank(network, directed = TRUE)[["vector"]] + ## Construct centrality dataframe + centrality.dataframe = data.frame(author.name = names(pagerank.centrality.vec), + centrality = as.vector(pagerank.centrality.vec)) + } 
else if (type == "network.eccentricity") { + eccentricity.vec = igraph::eccentricity(network) + ## since core developers are expected to have a lower eccentricity, + ## we need to invert all non-zero values + indices = which(eccentricity.vec > 0) + eccentricity.vec[indices] = max(eccentricity.vec) - eccentricity.vec[indices] + ## Construct centrality dataframe + centrality.dataframe = data.frame(author.name = names(eccentricity.vec), + centrality = as.vector(eccentricity.vec)) } else if (type == "commit.count") { ## Construct centrality dataframe centrality.dataframe = get.author.commit.count(proj.data) @@ -669,6 +703,146 @@ get.author.class.network.hierarchy = function(network, result.limit = NULL, return(result) } +## * Betweenness-based classification -------------------------------------- + +#' Classify authors into "core" and "peripheral" based on the betweenness-centrality of author vertices in the network +#' and return the classification result. +#' +#' The details of the classification algorithm is explained in the documentation of \code{get.author.class.by.type}. +#' +#' @param network the network containing the authors to classify +#' @param result.limit the maximum number of authors contained in the classification result. Only the top +#' \code{result.limit} authors of the classification stack will be contained within the returned +#' classification result. \code{NULL} means that all authors will be returned. [default: NULL] +#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this +#' vector are to be classified. Authors that appear in the vector but are not +#' part of the classification result (i.e., they are not present in the +#' underlying data) will be added to it afterwards (with a centrality value +#' of \code{NA}). \code{NULL} means that no restriction is made. 
+#' [default: NULL] +#' +#' @return the classification result, that is, a list containing two named list members \code{core} and +#' \code{peripheral}, each of which holding the authors classified as core or peripheral, respectively. Both +#' entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the +#' first column and their centrality values in the second column. +#' +#' @seealso get.author.class.by.type +get.author.class.network.betweenness = function(network, result.limit = NULL, + restrict.classification.to.authors = NULL) { + logging::logdebug("get.author.class.network.betweenness: starting.") + + result = get.author.class.by.type(network = network, type = "network.betweenness", result.limit = result.limit, + restrict.classification.to.authors = restrict.classification.to.authors) + + logging::logdebug("get.author.class.network.betweenness: finished.") + return(result) +} + +## * Closeness-based classification -------------------------------------- + +#' Classify authors into "core" and "peripheral" based on the closeness-centrality of author vertices in the network +#' and return the classification result. +#' +#' The details of the classification algorithm is explained in the documentation of \code{get.author.class.by.type}. +#' +#' @param network the network containing the authors to classify +#' @param result.limit the maximum number of authors contained in the classification result. Only the top +#' \code{result.limit} authors of the classification stack will be contained within the returned +#' classification result. \code{NULL} means that all authors will be returned. [default: NULL] +#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this +#' vector are to be classified. 
Authors that appear in the vector but are not +#' part of the classification result (i.e., they are not present in the +#' underlying data) will be added to it afterwards (with a centrality value +#' of \code{NA}). \code{NULL} means that no restriction is made. +#' [default: NULL] +#' +#' @return the classification result, that is, a list containing two named list members \code{core} and +#' \code{peripheral}, each of which holding the authors classified as core or peripheral, respectively. Both +#' entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the +#' first column and their centrality values in the second column. +#' +#' @seealso get.author.class.by.type +get.author.class.network.closeness = function(network, result.limit = NULL, + restrict.classification.to.authors = NULL) { + logging::logdebug("get.author.class.network.closeness: starting.") + + result = get.author.class.by.type(network = network, type = "network.closeness", result.limit = result.limit, + restrict.classification.to.authors = restrict.classification.to.authors) + + logging::logdebug("get.author.class.network.closeness: finished.") + return(result) +} + +## * Pagerank-based classification -------------------------------------- + +#' Classify authors into "core" and "peripheral" based on the pagerank-centrality of author vertices in the network +#' and return the classification result. +#' +#' The details of the classification algorithm is explained in the documentation of \code{get.author.class.by.type}. +#' +#' @param network the network containing the authors to classify +#' @param result.limit the maximum number of authors contained in the classification result. Only the top +#' \code{result.limit} authors of the classification stack will be contained within the returned +#' classification result. \code{NULL} means that all authors will be returned. [default: NULL] +#' @param restrict.classification.to.authors a vector of author names. 
Only authors that are contained within this +#' vector are to be classified. Authors that appear in the vector but are not +#' part of the classification result (i.e., they are not present in the +#' underlying data) will be added to it afterwards (with a centrality value +#' of \code{NA}). \code{NULL} means that no restriction is made. +#' [default: NULL] +#' +#' @return the classification result, that is, a list containing two named list members \code{core} and +#' \code{peripheral}, each of which holding the authors classified as core or peripheral, respectively. Both +#' entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the +#' first column and their centrality values in the second column. +#' +#' @seealso get.author.class.by.type +get.author.class.network.pagerank = function(network, result.limit = NULL, + restrict.classification.to.authors = NULL) { + logging::logdebug("get.author.class.network.pagerank: starting.") + + result = get.author.class.by.type(network = network, type = "network.pagerank", result.limit = result.limit, + restrict.classification.to.authors = restrict.classification.to.authors) + + logging::logdebug("get.author.class.network.pagerank: finished.") + return(result) +} + +## * Eccentricity-based classification -------------------------------------- + +#' Classify authors into "core" and "peripheral" based on the eccentricity of author vertices in the network +#' and return the classification result. +#' +#' The details of the classification algorithm is explained in the documentation of \code{get.author.class.by.type}. +#' +#' @param network the network containing the authors to classify +#' @param result.limit the maximum number of authors contained in the classification result. Only the top +#' \code{result.limit} authors of the classification stack will be contained within the returned +#' classification result. \code{NULL} means that all authors will be returned. 
[default: NULL] +#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this +#' vector are to be classified. Authors that appear in the vector but are not +#' part of the classification result (i.e., they are not present in the +#' underlying data) will be added to it afterwards (with a centrality value +#' of \code{NA}). \code{NULL} means that no restriction is made. +#' [default: NULL] +#' +#' @return the classification result, that is, a list containing two named list members \code{core} and +#' \code{peripheral}, each of which holding the authors classified as core or peripheral, respectively. Both +#' entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the +#' first column and their eccentricity values in the second column. +#' +#' @seealso get.author.class.by.type +get.author.class.network.eccentricity = function(network, result.limit = NULL, + restrict.classification.to.authors = NULL) { + logging::logdebug("get.author.class.network.eccentricity: starting.") + + result = get.author.class.by.type(network = network, type = "network.eccentricity", result.limit = result.limit, + restrict.classification.to.authors = restrict.classification.to.authors) + + logging::logdebug("get.author.class.network.eccentricity: finished.") + return(result) +} + ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / ## Count-based classification ----------------------------------------------