se-sic · Leo-Send · Nov 27, 2024 · Dec 11, 2024 · Dec 11, 2024 · Dec 11, 2024
diff --git a/NEWS.md b/NEWS.md
@@ -15,6 +15,7 @@
 - Add commit network as a new type of network. It uses commits as vertices and connects them either via cochange or commit interactions. This includes adding new config parameters and the function `add.vertex.attribute.commit.network` for adding vertex attributes to a commit network (PR #263, ab73271781e8e9a0715f784936df4b371d64c338, ab73271781e8e9a0715f784936df4b371d64c338, cd9a930fcb54ff465c2a5a7c43cfe82ac15c134d)
 - Add `remove.duplicate.edges` function that takes a network as input and conflates identical edges (PR #268, d9a4be417b340812b744f59398ba6460ba527e1c, 0c2f47c4fea6f5f2f582c0259f8cf23af985058a, c6e90dd9cb462232563f753f414da14a24b392a3)
 - Add `cumulative` as an argument to `construct.ranges` which enables the creation of cumulative ranges from given revisions (PR #268, a135f6bb6f83ccb03ae27c735c2700fccc1ee0c8, 8ec207f1e306ef6a641fb0205a9982fa89c7e0d9)
+- Add four new metrics which can be used for the classification of authors into core and peripheral: Betweenness, Closeness, Pagerank and Eccentricity (PR #276, 65d5c9cc86708777ef458b0c2e744ab4b846bdd1, b392d1a125d0f306b4bce8d95032162a328a3ce2, c5d37d40024e32ad5778fa5971a45bc08f7631e0)
 
 ### Changed/Improved
 

diff --git a/tests/test-core-peripheral.R b/tests/test-core-peripheral.R
@@ -18,6 +18,7 @@
 ## Copyright 2019 by Christian Hechtl <[email protected]>
 ## Copyright 2021 by Christian Hechtl <[email protected]>
 ## Copyright 2023-2024 by Maximilian Löffler <[email protected]>
+## Copyright 2024 by Leo Sendelbach <[email protected]>
 ## All Rights Reserved.
 
 
@@ -105,6 +106,74 @@ test_that("Eigenvector classification", {
     expect_equal(expected, result, tolerance = 0.0001)
 })
 
+test_that("Betweenness classification", {
+
+    ## Act: classify the authors of 'network' (defined outside of this test) by their betweenness centrality
+    result = get.author.class.network.betweenness(network)
+
+    ## Assert: "Olaf" is expected to be the only core author, all other authors are peripheral with a value of 0
+    expected.core = data.frame(author.name = c("Olaf"),
+                               betweenness.centrality = c(1))
+    expected.peripheral = data.frame(author.name = c("Björn", "udo", "Thomas", "Fritz [email protected]",
+                                                     "georg", "Hans"),
+                                     betweenness.centrality = c(0, 0, 0, 0, 0, 0))
+    expected = list(core = expected.core, peripheral = expected.peripheral)
+    row.names(result[["core"]]) = NULL ## reset the row names before the comparison
+    row.names(result[["peripheral"]]) = NULL ## reset the row names before the comparison
+    expect_equal(expected, result)
+})
+
+test_that("Closeness classification", {
+
+    ## Act: classify the authors of 'network' (defined outside of this test) by their closeness centrality
+    result = get.author.class.network.closeness(network)
+
+    ## Assert: "Olaf" is expected to be the only core author, all other authors are expected to be peripheral
+    expected.core = data.frame(author.name = c("Olaf"),
+                               closeness.centrality = c(0.5))
+    expected.peripheral = data.frame(author.name = c("Björn", "Thomas", "udo", "Fritz [email protected]",
+                                                     "georg", "Hans"),
+                                     closeness.centrality = c(0.33333, 0.33333, 0.0, 0.0, 0.0, 0.0))
+    expected = list(core = expected.core, peripheral = expected.peripheral)
+    row.names(result[["core"]]) = NULL ## reset the row names before the comparison
+    row.names(result[["peripheral"]]) = NULL ## reset the row names before the comparison
+    expect_equal(expected, result, tolerance = 0.0001)
+})
+
+test_that("Pagerank classification", {
+
+    ## Act: classify the authors of 'network' (defined outside of this test) by their pagerank centrality
+    result = get.author.class.network.pagerank(network)
+
+    ## Assert: "Olaf" is expected to be the only core author, all other authors are expected to be peripheral
+    expected.core = data.frame(author.name = c("Olaf"),
+                               pagerank.centrality = c(0.40541))
+    expected.peripheral = data.frame(author.name = c("Björn", "Thomas", "udo", "Fritz [email protected]",
+                                                     "georg", "Hans"),
+                                     pagerank.centrality = c(0.21396, 0.21396, 0.041667, 0.041667, 0.041667, 0.041667))
+    expected = list(core = expected.core, peripheral = expected.peripheral)
+    row.names(result[["core"]]) = NULL ## reset the row names before the comparison
+    row.names(result[["peripheral"]]) = NULL ## reset the row names before the comparison
+    expect_equal(expected, result, tolerance = 0.0001)
+})
+
+test_that("Eccentricity classification", {
+
+    ## Act: classify the authors of 'network' (defined outside of this test) by their eccentricity
+    result = get.author.class.network.eccentricity(network)
+
+    ## Assert: "Olaf" is expected to be the only core author; the values are the inverted eccentricities
+    expected.core = data.frame(author.name = c("Olaf"),
+                               eccentricity = c(1))
+    expected.peripheral = data.frame(author.name = c("Björn", "udo", "Thomas", "Fritz [email protected]",
+                                                     "georg", "Hans"),
+                                     eccentricity = c(0, 0, 0, 0, 0, 0))
+    expected = list(core = expected.core, peripheral = expected.peripheral)
+    row.names(result[["core"]]) = NULL ## reset the row names before the comparison
+    row.names(result[["peripheral"]]) = NULL ## reset the row names before the comparison
+    expect_equal(expected, result)
+})
+
 # TODO: Add a test for hierarchy classification
 
 test_that("Commit-count classification using 'result.limit'" , {

diff --git a/util-core-peripheral.R b/util-core-peripheral.R
@@ -22,6 +22,7 @@
 ## Copyright 2019 by Thomas Bock <[email protected]>
 ## Copyright 2019 by Jakob Kronawitter <[email protected]>
 ## Copyright 2021 by Johannes Hostert <[email protected]>
+## Copyright 2024 by Leo Sendelbach <[email protected]>
 ## All Rights Reserved.
 ##
 ## This file is derived from following Codeface script:
@@ -59,6 +60,10 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list(
     "network.degree"            = "network",
     "network.eigen"             = "network",
     "network.hierarchy"         = "network",
+    "network.betweenness"       = "network",
+    "network.closeness"         = "network",
+    "network.pagerank"          = "network",
+    "network.eccentricity"      = "network",
     "commit.count"              = "count",
     "loc.count"                 = "count",
     "mail.count"                = "count",
@@ -96,7 +101,7 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list(
 #'             Network-based options/metrics (parameter \code{network} has to be specified):
 #'              - "network.degree"
 #'              - "network.eigen"
-#'              - "network.hierarchy"
+#'              - "network.hierarchy"
+#'              - "network.betweenness"
+#'              - "network.closeness"
+#'              - "network.pagerank"
+#'              - "network.eccentricity"
 #'             Count-based options/metrics (parameter \code{proj.data} has to be specified):
 #'              - "commit.count"
 #'              - "loc.count"
@@ -126,7 +131,8 @@ CLASSIFICATION.TYPE.TO.CATEGORY = list(
 #'         first column and their centrality values in the second column.
 get.author.class.by.type = function(network = NULL,
                                     proj.data = NULL,
-                                    type = c("network.degree", "network.eigen", "network.hierarchy",
+                                    type = c("network.degree", "network.eigen", "network.hierarchy", "network.betweenness",
+                                             "network.closeness", "network.pagerank", "network.eccentricity",
                                              "commit.count", "loc.count", "mail.count", "mail.thread.count",
                                              "issue.count", "issue.comment.count", "issue.commented.in.count",
                                              "issue.created.count"),
@@ -144,6 +150,10 @@ get.author.class.by.type = function(network = NULL,
                          "network.degree" = "vertex.degree",
                          "network.eigen" = "eigen.centrality",
                          "network.hierarchy" = "hierarchy",
+                         "network.betweenness" = "betweenness.centrality",
+                         "network.closeness" = "closeness.centrality",
+                         "network.pagerank" = "pagerank.centrality",
+                         "network.eccentricity" = "eccentricity",
                          "commit.count" = "commit.count",
                          "loc.count" = "loc.count",
                          "mail.count" = "mail.count",
@@ -231,6 +241,30 @@ get.author.class.by.type = function(network = NULL,
         ## Construct centrality dataframe
         centrality.dataframe = data.frame(author.name = row.names(hierarchy.base.df),
                                           centrality = hierarchy.calculated)
+    } else if (type == "network.betweenness") {
+        betweenness.centrality.vec = igraph::betweenness(network, directed = TRUE)
+        ## Construct centrality dataframe
+        centrality.dataframe = data.frame(author.name = names(betweenness.centrality.vec),
+                                          centrality = as.vector(betweenness.centrality.vec))
+    } else if (type == "network.closeness") {
+        closeness.centrality.vec = igraph::closeness(network)
+        ## Construct centrality dataframe
+        centrality.dataframe = data.frame(author.name = names(closeness.centrality.vec),
+                                          centrality = as.vector(closeness.centrality.vec))
+    } else if (type == "network.pagerank") {
+        pagerank.centrality.vec = igraph::page_rank(network, directed = TRUE)[["vector"]]
+        ## Construct centrality dataframe
+        centrality.dataframe = data.frame(author.name = names(pagerank.centrality.vec),
+                                          centrality = as.vector(pagerank.centrality.vec))
+    } else if (type == "network.eccentricity") {
+        eccentricity.vec = igraph::eccentricity(network)
+        ## since core developers are expected to have a lower eccentricity,
+        ## we need to invert all non-zero values
+        indices = which(eccentricity.vec > 0)
+        eccentricity.vec[indices] = max(eccentricity.vec) - eccentricity.vec[indices]
+        ## Construct centrality dataframe
+        centrality.dataframe = data.frame(author.name = names(eccentricity.vec),
+                                          centrality = as.vector(eccentricity.vec))
     } else if (type == "commit.count") {
         ## Construct centrality dataframe
         centrality.dataframe = get.author.commit.count(proj.data)
@@ -669,6 +703,146 @@ get.author.class.network.hierarchy = function(network, result.limit = NULL,
     return(result)
 }
 
+## * Betweenness-based classification --------------------------------------
+
+#' Classify authors into "core" and "peripheral" based on the betweenness centrality of author vertices in the network
+#' and return the classification result.
+#'
+#' The details of the classification algorithm are explained in the documentation of \code{get.author.class.by.type}.
+#'
+#' @param network the network containing the authors to classify
+#' @param result.limit the maximum number of authors contained in the classification result. Only the top
+#'                     \code{result.limit} authors of the classification stack will be contained within the returned
+#'                     classification result. \code{NULL} means that all authors will be returned. [default: NULL]
+#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this
+#'                                           vector are to be classified. Authors that appear in the vector but are not
+#'                                           part of the classification result (i.e., they are not present in the
+#'                                           underlying data) will be added to it afterwards (with a centrality value
+#'                                           of \code{NA}). \code{NULL} means that no restriction is made.
+#'                                           [default: NULL]
+#'
+#' @return the classification result, that is, a list containing two named list members \code{core} and
+#'         \code{peripheral}, each of which holds the authors classified as core or peripheral, respectively. Both
+#'         entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the
+#'         first column and their centrality values in the second column.
+#'
+#' @seealso get.author.class.by.type
+get.author.class.network.betweenness = function(network, result.limit = NULL,
+                                              restrict.classification.to.authors = NULL) {
+    logging::logdebug("get.author.class.network.betweenness: starting.")
+
+    result = get.author.class.by.type(network = network, type = "network.betweenness", result.limit = result.limit,
+                                      restrict.classification.to.authors = restrict.classification.to.authors)
+
+    logging::logdebug("get.author.class.network.betweenness: finished.")
+    return(result)
+}
+
+## * Closeness-based classification --------------------------------------
+
+#' Classify authors into "core" and "peripheral" based on the closeness centrality of author vertices in the network
+#' and return the classification result.
+#'
+#' The details of the classification algorithm are explained in the documentation of \code{get.author.class.by.type}.
+#'
+#' @param network the network containing the authors to classify
+#' @param result.limit the maximum number of authors contained in the classification result. Only the top
+#'                     \code{result.limit} authors of the classification stack will be contained within the returned
+#'                     classification result. \code{NULL} means that all authors will be returned. [default: NULL]
+#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this
+#'                                           vector are to be classified. Authors that appear in the vector but are not
+#'                                           part of the classification result (i.e., they are not present in the
+#'                                           underlying data) will be added to it afterwards (with a centrality value
+#'                                           of \code{NA}). \code{NULL} means that no restriction is made.
+#'                                           [default: NULL]
+#'
+#' @return the classification result, that is, a list containing two named list members \code{core} and
+#'         \code{peripheral}, each of which holds the authors classified as core or peripheral, respectively. Both
+#'         entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the
+#'         first column and their centrality values in the second column.
+#'
+#' @seealso get.author.class.by.type
+get.author.class.network.closeness = function(network, result.limit = NULL,
+                                              restrict.classification.to.authors = NULL) {
+    logging::logdebug("get.author.class.network.closeness: starting.")
+
+    result = get.author.class.by.type(network = network, type = "network.closeness", result.limit = result.limit,
+                                      restrict.classification.to.authors = restrict.classification.to.authors)
+
+    logging::logdebug("get.author.class.network.closeness: finished.")
+    return(result)
+}
+
+## * Pagerank-based classification --------------------------------------
+
+#' Classify authors into "core" and "peripheral" based on the pagerank centrality of author vertices in the network
+#' and return the classification result.
+#'
+#' The details of the classification algorithm are explained in the documentation of \code{get.author.class.by.type}.
+#'
+#' @param network the network containing the authors to classify
+#' @param result.limit the maximum number of authors contained in the classification result. Only the top
+#'                     \code{result.limit} authors of the classification stack will be contained within the returned
+#'                     classification result. \code{NULL} means that all authors will be returned. [default: NULL]
+#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this
+#'                                           vector are to be classified. Authors that appear in the vector but are not
+#'                                           part of the classification result (i.e., they are not present in the
+#'                                           underlying data) will be added to it afterwards (with a centrality value
+#'                                           of \code{NA}). \code{NULL} means that no restriction is made.
+#'                                           [default: NULL]
+#'
+#' @return the classification result, that is, a list containing two named list members \code{core} and
+#'         \code{peripheral}, each of which holds the authors classified as core or peripheral, respectively. Both
+#'         entries in this list (\code{core} and \code{peripheral}) are dataframes containing the authors' names in the
+#'         first column and their centrality values in the second column.
+#'
+#' @seealso get.author.class.by.type
+get.author.class.network.pagerank = function(network, result.limit = NULL,
+                                              restrict.classification.to.authors = NULL) {
+    logging::logdebug("get.author.class.network.pagerank: starting.")
+
+    result = get.author.class.by.type(network = network, type = "network.pagerank", result.limit = result.limit,
+                                      restrict.classification.to.authors = restrict.classification.to.authors)
+
+    logging::logdebug("get.author.class.network.pagerank: finished.")
+    return(result)
+}
+
+## * Eccentricity-based classification --------------------------------------
+
+#' Classify authors into "core" and "peripheral" based on the eccentricity of author vertices in the network
+#' and return the classification result.
+#'
+#' The details of the classification algorithm are explained in the documentation of \code{get.author.class.by.type}.
+#'
+#' @param network the network containing the authors to classify
+#' @param result.limit the maximum number of authors contained in the classification result. Only the top
+#'                     \code{result.limit} authors of the classification stack will be contained within the returned
+#'                     classification result. \code{NULL} means that all authors will be returned. [default: NULL]
+#' @param restrict.classification.to.authors a vector of author names. Only authors that are contained within this
+#'                                           vector are to be classified. Authors that appear in the vector but are not
+#'                                           part of the classification result (i.e., they are not present in the
+#'                                           underlying data) will be added to it afterwards (with a centrality value
+#'                                           of \code{NA}). \code{NULL} means that no restriction is made.
+#'                                           [default: NULL]
+#'
+#' @return the classification result, that is, a list containing two named list members \code{core} and
+#'         \code{peripheral}, each holding the authors classified as core or peripheral, respectively. Both entries
+#'         (\code{core} and \code{peripheral}) are dataframes with the authors' names in the first column and their
+#'         inverted eccentricity (maximum eccentricity minus own eccentricity; 0 stays 0) in the second column.
+#'
+#' @seealso get.author.class.by.type
+get.author.class.network.eccentricity = function(network, result.limit = NULL,
+                                              restrict.classification.to.authors = NULL) {
+    logging::logdebug("get.author.class.network.eccentricity: starting.")
+
+    result = get.author.class.by.type(network = network, type = "network.eccentricity", result.limit = result.limit,
+                                      restrict.classification.to.authors = restrict.classification.to.authors)
+
+    logging::logdebug("get.author.class.network.eccentricity: finished.")
+    return(result)
+}
+
 ## / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / / /
 ## Count-based classification ---------------------------------------------