From 291628450f2ad31b9cc9b99b0618396bb3ef360f Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 7 Oct 2024 21:05:44 +0200 Subject: [PATCH 01/18] init PipeOpDecode --- R/PipeOpDecode.R | 68 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 R/PipeOpDecode.R diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R new file mode 100644 index 000000000..ae9092d4d --- /dev/null +++ b/R/PipeOpDecode.R @@ -0,0 +1,68 @@ +#' @title Factor Decoding +#' +#' @usage NULL +#' @name mlr_pipeops_decode +#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' +#' @section Construction: +#' ``` +#' PipeOpEncode$new(id = "decode", param_vals = list()) +#' ``` +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"encode"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +#' +#' The output is the input [`Task`][mlr3::Task] with +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: +#' * ... +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: +#' * ... +#' +#' @section Internals: +#' Uses the [`stats::contrasts`] functions. This is relatively inefficient for features with a large number of levels. +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
+#' +#' @family PipeOps +#' @template seealso_pipeopslist +#' @include PipeOpTaskPreproc.R +#' @export +#' @examples +#' library("mlr3") +#' +PipeOpDecode = R6Class("PipeOpDecode", + inherit = PipeOpTaskPreprocSimple, + public = list( + initialize = function(id = "decode", param_vals = list()) { + ps = ps( + treatment_encoding = p_lgl(tags = c("train", "predict")), + group_pattern = p_uty(custom_check = check_string, tags = c("train", "predict")) + ) + ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]*)\\.") + super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", tags = "encode", feature_types = c("factor", "ordered")) + } + ), + private = list( + + .get_state_dt = function(dt, levels, target) { + + }, + + .transform_dt = function(dt, levels) { + + } + ) +) + +mlr_pipeops$add("decode", PipeOpDecode) From 178863148ce01b82a4e631917bb2ec0b424ba47e Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 25 Nov 2024 15:09:20 +0100 Subject: [PATCH 02/18] add base logic WIP --- R/PipeOpDecode.R | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index ae9092d4d..489e5e3eb 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -56,13 +56,54 @@ PipeOpDecode = R6Class("PipeOpDecode", private = list( .get_state_dt = function(dt, levels, target) { + pv = self$param_set$values + cols = colnames(dt) + # If pattern == "", all columns are collapsed into one column + if (pv$group_pattern == "") { + return(list(colmaps = list(result = set_names(cols, cols)))) + } + + # Extract group names + matches = regmatches(cols, regexec(pv$group_pattern, cols)) + grps = unlist(map(matches, function(x) if (length(x)) x[[2]] else "")) + # Extract level names + lvls = set_names(gsub(pv$group_pattern, "", cols), cols) + + # Drop entries for which no match to group_pattern was found + keep = fcts != "" + fcts = fcts[keep] + lvls = lvls[keep] + + # add "" = "ref" if 
pv$treatment_encoding == TRUE + # test that split is consistent for this use case + list(colmaps = split(lvls, fcts)) }, + # take maximum value, bc could be scaled + # treatment dass alles 0 ist, hard coden, referenzname als reference nennen (und ref.1 falls es die spalte schon gibt) + + # decide when to assign "ref" (e.g. no unique maximum) .transform_dt = function(dt, levels) { + colmaps = self$state$colmaps + + for (fct in names(colmaps)) { + old_cols = names(colmaps[[fct]]) + lvls = unname(colmaps[[fct]]) + # Find the column with the maximal value for each row + dt[, (fct) := old_cols[apply(.SD, 1, which.max)], .SDcols = old_cols] + # Assign the corresponding value from the named vector to the new column + dt[, (fct) := lvls[get(fct)]] + # Remove the old columns (can move this to outside the loop) + dt[, (old_cols) := NULL] + } } ) ) mlr_pipeops$add("decode", PipeOpDecode) + +# We don't add columns that have no match with group_pattern to state +# We only remove old_cols in .train_dt -> These columns are just ignored. Good. 
+# From e0f6e82e1a9c62a155026e29cc1737a5eb9b80c3 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 25 Nov 2024 15:09:34 +0100 Subject: [PATCH 03/18] init tests PipeOpDecode --- tests/testthat/test_pipeop_decode.R | 48 +++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 tests/testthat/test_pipeop_decode.R diff --git a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R new file mode 100644 index 000000000..9e76f727e --- /dev/null +++ b/tests/testthat/test_pipeop_decode.R @@ -0,0 +1,48 @@ +context("PipeOpDecode") + +test_that("PipeOpDecode - basic properties", { + # check what expect_datapreproc_pipeop_class does, whether we need custom task here + df = data.frame( + target = runif(120), + x.1 = rep(c(1, 0, 0), 40), + x.2 = rep(c(0, 1, 0), 40), + x.3 = rep(c(0, 0, 1), 40), + y.1 = rep(c(1, 0, 0), 40), + y.2 = rep(c(0, 1, 0), 40), + y.3 = rep(c(0, 0, 1), 40), + a = runif(120) + ) + task = TaskRegr$new(id = "decode", backend = df, target = "target") + + expect_datapreproc_pipeop_class(PipeOpDecode, task = task) +}) + +test_that("PipeOpDecode - assertions", { + # test that edge cases are caught +}) + +test_that("PipeOpDecode - one-hot-encoding", { + po = PipeOpDecode$new() + + df = data.frame( + target = runif(120), + x.1 = rep(c(1, 0, 0), 40), + x.2 = rep(c(0, 1, 0), 40), + x.3 = rep(c(0, 0, 1), 40), + y.1 = rep(c(1, 0, 0), 40), + y.2 = rep(c(0, 1, 0), 40), + y.3 = rep(c(0, 0, 1), 40), + a = runif(120) + ) + task = TaskRegr$new(id = "decode", backend = df, target = "target") + + +}) + +test_that("PipeOpDecode - treatment encoding", { + +}) + +test_that("PipOpDecode - different regex patterns", { + +}) From 33dbac70c197872133a9452265fd9dbb17dee38b Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 25 Nov 2024 21:59:52 +0100 Subject: [PATCH 04/18] add test skeletons --- tests/testthat/test_pipeop_decode.R | 70 +++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 13 deletions(-) diff --git 
a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R index 9e76f727e..090debc5a 100644 --- a/tests/testthat/test_pipeop_decode.R +++ b/tests/testthat/test_pipeop_decode.R @@ -17,32 +17,76 @@ test_that("PipeOpDecode - basic properties", { expect_datapreproc_pipeop_class(PipeOpDecode, task = task) }) -test_that("PipeOpDecode - assertions", { - # test that edge cases are caught -}) - test_that("PipeOpDecode - one-hot-encoding", { - po = PipeOpDecode$new() + op = PipeOpDecode$new() df = data.frame( - target = runif(120), - x.1 = rep(c(1, 0, 0), 40), - x.2 = rep(c(0, 1, 0), 40), - x.3 = rep(c(0, 0, 1), 40), - y.1 = rep(c(1, 0, 0), 40), - y.2 = rep(c(0, 1, 0), 40), - y.3 = rep(c(0, 0, 1), 40), - a = runif(120) + target = runif(10), + x.1 = rep(c(1, 0), 5), + x.2 = rep(c(0, 1), 5), + a = runif(10) ) task = TaskRegr$new(id = "decode", backend = df, target = "target") + train_out = op$train(list(task))[[1]]$data() + dt = data.table( + x = rep(c(1, 2), each = 5), + a = df$a + ) + expect_equal(train_out, dt) }) test_that("PipeOpDecode - treatment encoding", { + op = PipeOpDecode$new() + op$param_set$values$treatment_encoding = TRUE + + df = data.frame( + target = runif(15), + x.1 = rep(c(1, 0, 0), 5), + x.2 = rep(c(0, 0, 1), 5), + a = runif(15) + ) + task = TaskRegr$new(id = "decode", backend = df, target = "target") + train_out = op$train(list(task))[[1]]$data() + dt = data.table( + x = rep(c("1", "ref", "2"), times = 5), + a = df$a + ) + expect_equal(train_out, dt) }) test_that("PipOpDecode - different regex patterns", { + op = PipeOpDecode$new() + op$param_set$values$regex_pattern = "" + + df = data.frame( + target = runif(15), + x.1 = rep(c(1, 0, 0), 5), + x.2 = rep(c(0, 0, 1), 5), + a = rep(c(0, 1, 0), 5) + ) + task = TaskRegr$new(id = "decode", backend = df, target = "target") + + train_out = op$train(list(task))[[1]]$data() + dt = data.table( + x = rep(c("x.1", "a", "x.2"), times = 5) + ) + expect_equal(train_out, dt) +}) + 
+test_that("PipeOpDecode - errors", { + op = PipeOpDecode$new() + + df = data.frame( + target = runif(15), + x.1 = rep(c(1, 0, 0), 5), + x.2 = rep(c(0, 1, 1), 5), + a = runif(15) + ) + task = TaskRegr$new(id = "decode", backend = df, target = "target") + + expect_error(op$train(list(task))) # due to non-unique which.max }) From f25bcc766cbcbffcd872f664f5f7ccd70cbab250 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 25 Nov 2024 22:00:05 +0100 Subject: [PATCH 05/18] further logic --- R/PipeOpDecode.R | 53 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index 489e5e3eb..edf6f07c6 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -5,6 +5,7 @@ #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description +#' Description #' #' @section Construction: #' ``` @@ -50,7 +51,7 @@ PipeOpDecode = R6Class("PipeOpDecode", group_pattern = p_uty(custom_check = check_string, tags = c("train", "predict")) ) ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]*)\\.") - super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", tags = "encode", feature_types = c("factor", "ordered")) + super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("integer", "numeric")) } ), private = list( @@ -62,42 +63,62 @@ PipeOpDecode = R6Class("PipeOpDecode", # If pattern == "", all columns are collapsed into one column if (pv$group_pattern == "") { return(list(colmaps = list(result = set_names(cols, cols)))) + # should make sure that result is available (or name we chose instead) } - # Extract group names + # Drop columns that do contain group_pattern + # What about cols starting with .? 
-> probably let user exclude this by changing group_pattern are using affect_columns + cols = cols[grepl(pv$group_pattern, cols)] + + # Extract factor names matches = regmatches(cols, regexec(pv$group_pattern, cols)) - grps = unlist(map(matches, function(x) if (length(x)) x[[2]] else "")) + fcts = vapply(matches, function(x) x[[2]], character(1)) # Extract level names lvls = set_names(gsub(pv$group_pattern, "", cols), cols) - # Drop entries for which no match to group_pattern was found - keep = fcts != "" - fcts = fcts[keep] - lvls = lvls[keep] - - # add "" = "ref" if pv$treatment_encoding == TRUE # test that split is consistent for this use case - list(colmaps = split(lvls, fcts)) - }, + s = list(colmaps = split(lvls, fcts)) + + if (pv$treatment_encoding) { + # Set default name for reference level + ref_name = "ref" + counter = 1 + while (ref_name %in% cols) { + ref_name = paste0("ref.", counter) + counter = counter + 1 + } + # Append ref_name with empty name to all list entries + for (i in seq_along(s[["colmaps"]])) { + s[["colmaps"]][[i]][[length(s[["colmaps"]][[i]]) + 1]] = ref_name + } + } - # take maximum value, bc could be scaled - # treatment dass alles 0 ist, hard coden, referenzname als reference nennen (und ref.1 falls es die spalte schon gibt) + s + }, - # decide when to assign "ref" (e.g. no unique maximum) .transform_dt = function(dt, levels) { colmaps = self$state$colmaps for (fct in names(colmaps)) { old_cols = names(colmaps[[fct]]) - lvls = unname(colmaps[[fct]]) + lvls = colmaps[[fct]] + + # Do we check that which.max is unique? + # Generally what checks should we perform? e.g. that group_pattern contains a capturing group # Find the column with the maximal value for each row dt[, (fct) := old_cols[apply(.SD, 1, which.max)], .SDcols = old_cols] # Assign the corresponding value from the named vector to the new column - dt[, (fct) := lvls[get(fct)]] + # dt[, (fct) := lvls[get(fct)]] + dt[, (fct) := lvls[dt[[fct]]]] + + # type conversion to factor? 
+ # Remove the old columns (can move this to outside the loop) dt[, (old_cols) := NULL] } + + dt } ) ) From c4a57812fe31941a21dc25ca03b08a94fbc75425 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Mon, 25 Nov 2024 22:11:52 +0100 Subject: [PATCH 06/18] small changes --- R/PipeOpDecode.R | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index edf6f07c6..15d405c2f 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -71,6 +71,7 @@ PipeOpDecode = R6Class("PipeOpDecode", cols = cols[grepl(pv$group_pattern, cols)] # Extract factor names + # If group_pattern contains does not contain a capturing group, fcts cannot be generated matches = regmatches(cols, regexec(pv$group_pattern, cols)) fcts = vapply(matches, function(x) x[[2]], character(1)) # Extract level names @@ -104,27 +105,23 @@ PipeOpDecode = R6Class("PipeOpDecode", lvls = colmaps[[fct]] # Do we check that which.max is unique? - # Generally what checks should we perform? e.g. that group_pattern contains a capturing group # Find the column with the maximal value for each row dt[, (fct) := old_cols[apply(.SD, 1, which.max)], .SDcols = old_cols] - # Assign the corresponding value from the named vector to the new column - # dt[, (fct) := lvls[get(fct)]] - dt[, (fct) := lvls[dt[[fct]]]] - - # type conversion to factor? + # add handling for reference class if treatment_encoding == TRUE - # Remove the old columns (can move this to outside the loop) - dt[, (old_cols) := NULL] + # Assign the corresponding value from the named vector to the new column + # dt[, (fct) := as.factor(lvls[get(fct)])] + dt[, (fct) := as.factor(lvls[dt[[fct]]])] } + # Drop old columns + drop = unlist(lapply(colmaps, names)) + dt[, (drop) := NULL] + dt } ) ) mlr_pipeops$add("decode", PipeOpDecode) - -# We don't add columns that have no match with group_pattern to state -# We only remove old_cols in .train_dt -> These columns are just ignored. Good. 
-# From f45e25d8acc4d86de1c9a5d7f17d92add4a48c80 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 26 Nov 2024 14:07:41 +0100 Subject: [PATCH 07/18] added params and changed logic --- R/PipeOpDecode.R | 57 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index 15d405c2f..6c47c01a3 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -47,10 +47,12 @@ PipeOpDecode = R6Class("PipeOpDecode", public = list( initialize = function(id = "decode", param_vals = list()) { ps = ps( - treatment_encoding = p_lgl(tags = c("train", "predict")), - group_pattern = p_uty(custom_check = check_string, tags = c("train", "predict")) + treatment_encoding = p_lgl(tags = c("train", "required")), + treatment_cutoff = p_dbl(default = 0, tags = "train", requires = quote(treatment_encoding == TRUE)), + group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")), + ties_method = p_fct(c("first", "last", "random"), tags = c("train", "required")) ) - ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]*)\\.") + ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]+)\\.", ties_method = "random") super$initialize(id, param_set = ps, param_vals = param_vals, tags = "encode", feature_types = c("integer", "numeric")) } ), @@ -60,22 +62,42 @@ PipeOpDecode = R6Class("PipeOpDecode", pv = self$param_set$values cols = colnames(dt) + if (treatment_encoding) { + # Determine name for reference level + ref_name = "ref" + counter = 1 + while (ref_name %in% cols) { + ref_name = paste0("ref.", counter) + counter = counter + 1 + } + } + # If pattern == "", all columns are collapsed into one column + # If "pipeop.decoded" is already taken, we overwrite it! 
if (pv$group_pattern == "") { - return(list(colmaps = list(result = set_names(cols, cols)))) - # should make sure that result is available (or name we chose instead) + if (treatment_encoding) { + # First add entry ref_name with empty name + cmap = list(pipeop.decoded = c(set_names(cols, cols))) + cmap[[pipeop.decoded]][[length(cols) + 1]] = ref_name + return(list(colmaps = cmap)) + } + return(list(colmaps = list(pipeop.decoded = set_names(cols, cols)))) } # Drop columns that do contain group_pattern # What about cols starting with .? -> probably let user exclude this by changing group_pattern are using affect_columns - cols = cols[grepl(pv$group_pattern, cols)] + cols = cols[grepl(pv$group_pattern, cols, perl = TRUE)] # Extract factor names # If group_pattern contains does not contain a capturing group, fcts cannot be generated - matches = regmatches(cols, regexec(pv$group_pattern, cols)) - fcts = vapply(matches, function(x) x[[2]], character(1)) + matches = regmatches(cols, regexec(pv$group_pattern, cols, perl = TRUE)) + fcts = map_chr(matches, 2) + + if (any(nchar(fcts) == 0)) { + stopf("Pattern %s with column %s would produce empty decoded column name", group_pattern, cols[nchar(fcts) == 0]) + } # Extract level names - lvls = set_names(gsub(pv$group_pattern, "", cols), cols) + lvls = set_names(gsub(pv$group_pattern, "", cols, perl = TRUE), cols) # test that split is consistent for this use case s = list(colmaps = split(lvls, fcts)) @@ -94,25 +116,32 @@ PipeOpDecode = R6Class("PipeOpDecode", } } + s$cutoff = pv$treatment_cutoff %??% 0 + s$ties_method = pv$ties_method s }, .transform_dt = function(dt, levels) { colmaps = self$state$colmaps + cutoff = self$state$cutoff + ties_method = self$state$ties_method for (fct in names(colmaps)) { old_cols = names(colmaps[[fct]]) lvls = colmaps[[fct]] - # Do we check that which.max is unique? 
+ old_cols_matrix = as.matrix(dt[, old_cols, with = FALSE]) # Find the column with the maximal value for each row - dt[, (fct) := old_cols[apply(.SD, 1, which.max)], .SDcols = old_cols] - # add handling for reference class if treatment_encoding == TRUE + set(dt, , fct, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) + + # If any value in old_cols_matrix are smaller than the cutoff, replace with empty string + if (treatment_encoding) { + set(dt, rowSums(old_cols_matrix <= cutoff) > 0, "") + } # Assign the corresponding value from the named vector to the new column - # dt[, (fct) := as.factor(lvls[get(fct)])] - dt[, (fct) := as.factor(lvls[dt[[fct]]])] + set(dt, , fct, factor(lvls[match(dt[[fct]], names(lvls))], levels = lvls)) } # Drop old columns From 0a8e774973cc8ca3a39a2d9e0a38eae2eb0b7cc2 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 26 Nov 2024 23:10:56 +0100 Subject: [PATCH 08/18] clean up --- R/PipeOpDecode.R | 98 +++++++++++++++++++++++++++--------------------- 1 file changed, 56 insertions(+), 42 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index 6c47c01a3..ed807598c 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -48,7 +48,7 @@ PipeOpDecode = R6Class("PipeOpDecode", initialize = function(id = "decode", param_vals = list()) { ps = ps( treatment_encoding = p_lgl(tags = c("train", "required")), - treatment_cutoff = p_dbl(default = 0, tags = "train", requires = quote(treatment_encoding == TRUE)), + treatment_cutoff = p_dbl(default = 0, tags = "train", depends = quote(treatment_encoding == TRUE)), group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")), ties_method = p_fct(c("first", "last", "random"), tags = c("train", "required")) ) @@ -62,7 +62,7 @@ PipeOpDecode = R6Class("PipeOpDecode", pv = self$param_set$values cols = colnames(dt) - if (treatment_encoding) { + if (pv$treatment_encoding) { # Determine name for reference level ref_name = "ref" counter = 1 @@ -72,76 
+72,90 @@ PipeOpDecode = R6Class("PipeOpDecode", } } - # If pattern == "", all columns are collapsed into one column - # If "pipeop.decoded" is already taken, we overwrite it! + # If pattern == "", all columns are collapsed into one column. + # Note, that column "pipeop.decoded" gets overwritten if it already exists. if (pv$group_pattern == "") { - if (treatment_encoding) { - # First add entry ref_name with empty name - cmap = list(pipeop.decoded = c(set_names(cols, cols))) + cmap = list(pipeop.decoded = c(set_names(cols, cols))) + if (pv$treatment_encoding) { + # Append ref_name with empty name (i.e. "") cmap[[pipeop.decoded]][[length(cols) + 1]] = ref_name - return(list(colmaps = cmap)) } - return(list(colmaps = list(pipeop.decoded = set_names(cols, cols)))) + + s = list( + colmaps = cmap, + treatment_encoding = pv$treatment_encoding, + cutoff = pv$treatment_cutoff %??% 0, + ties_method = pv$ties_method + ) + + return(s) } - # Drop columns that do contain group_pattern - # What about cols starting with .? -> probably let user exclude this by changing group_pattern are using affect_columns + # Drop columns that do not match group_pattern cols = cols[grepl(pv$group_pattern, cols, perl = TRUE)] - # Extract factor names - # If group_pattern contains does not contain a capturing group, fcts cannot be generated + # Extract names for new levels + lvls = set_names(gsub(pv$group_pattern, "", cols, perl = TRUE), cols) + + # Extract names for new factor columns to be populated with lvls matches = regmatches(cols, regexec(pv$group_pattern, cols, perl = TRUE)) + # Error, if nothing was captured. + if (any(lengths(matches) < 2)) { + stopf("Pattern %s matches column name %s, but nothing was captured. Make sure group_pattern contains a capturing group.", + str_collapse(pv$group_pattern, quote = '"'), + str_collapse(cols[lengths(matches) < 2], quote = '"')) + } fcts = map_chr(matches, 2) + # Error if no group could be extracted for an entry in col. 
Thus, we could not create a column name from it. if (any(nchar(fcts) == 0)) { - stopf("Pattern %s with column %s would produce empty decoded column name", group_pattern, cols[nchar(fcts) == 0]) + stopf("Pattern %s with column(s) %s would produce empty string as decoded column name(s). Try using a different pattern.", + str_collapse(pv$group_pattern, quote = '"'), + str_collapse(cols[nchar(fcts) == 0], quote = '"')) } - # Extract level names - lvls = set_names(gsub(pv$group_pattern, "", cols, perl = TRUE), cols) - - # test that split is consistent for this use case - s = list(colmaps = split(lvls, fcts)) + # Create mapping of old column names and derived levels to new column names + cmap = split(lvls, fcts) if (pv$treatment_encoding) { - # Set default name for reference level - ref_name = "ref" - counter = 1 - while (ref_name %in% cols) { - ref_name = paste0("ref.", counter) - counter = counter + 1 - } - # Append ref_name with empty name to all list entries - for (i in seq_along(s[["colmaps"]])) { - s[["colmaps"]][[i]][[length(s[["colmaps"]][[i]]) + 1]] = ref_name + # Append ref_name with empty name (i.e. 
"") to all list entries + for (map in cmap) { + map[[length(map) + 1]] = ref_name } } - s$cutoff = pv$treatment_cutoff %??% 0 - s$ties_method = pv$ties_method - s + list( + colmaps = cmap, + treatment_encoding = pv$treatment_encoding, + cutoff = pv$treatment_cutoff %??% 0, + ties_method = pv$ties_method + ) }, .transform_dt = function(dt, levels) { colmaps = self$state$colmaps + # Early exit if no mapping is required + if (!length(colmaps)) { + return(dt) + } cutoff = self$state$cutoff ties_method = self$state$ties_method + treatment_encoding = self$state$treatment_encoding - for (fct in names(colmaps)) { - old_cols = names(colmaps[[fct]]) - lvls = colmaps[[fct]] + for (new_col in names(colmaps)) { + lvls = colmaps[[new_col]] + old_cols = names(lvls) + # Create matrix from subset of dt with columns old_cols old_cols_matrix = as.matrix(dt[, old_cols, with = FALSE]) - - # Find the column with the maximal value for each row - set(dt, , fct, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) - + # Populate new column with name of column with maximal value per row + set(dt, , new_col, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) # If any value in old_cols_matrix are smaller than the cutoff, replace with empty string + # This implies replacement with reference level in next step. 
if (treatment_encoding) { set(dt, rowSums(old_cols_matrix <= cutoff) > 0, "") } - - # Assign the corresponding value from the named vector to the new column - set(dt, , fct, factor(lvls[match(dt[[fct]], names(lvls))], levels = lvls)) + # Replace occurrences of old column names with corresponding new level names + set(dt, , new_col, factor(lvls[match(dt[[new_col]], names(lvls))], levels = lvls)) } # Drop old columns From 7a11e252db04fa23acb70850350f3666a38b3c84 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 26 Nov 2024 23:11:16 +0100 Subject: [PATCH 09/18] added tests --- tests/testthat/test_pipeop_decode.R | 79 ++++++++++++++++++----------- 1 file changed, 50 insertions(+), 29 deletions(-) diff --git a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R index 090debc5a..92bae0d1d 100644 --- a/tests/testthat/test_pipeop_decode.R +++ b/tests/testthat/test_pipeop_decode.R @@ -1,8 +1,7 @@ context("PipeOpDecode") test_that("PipeOpDecode - basic properties", { - # check what expect_datapreproc_pipeop_class does, whether we need custom task here - df = data.frame( + dt = data.table( target = runif(120), x.1 = rep(c(1, 0, 0), 40), x.2 = rep(c(0, 1, 0), 40), @@ -12,81 +11,103 @@ test_that("PipeOpDecode - basic properties", { y.3 = rep(c(0, 0, 1), 40), a = runif(120) ) - task = TaskRegr$new(id = "decode", backend = df, target = "target") + task = TaskRegr$new(id = "decode", backend = dt, target = "target") - expect_datapreproc_pipeop_class(PipeOpDecode, task = task) + expect_datapreproc_pipeop_class(PipeOpDecode, task = task, deterministic_train = FALSE, deterministic_predict = FALSE) }) test_that("PipeOpDecode - one-hot-encoding", { op = PipeOpDecode$new() - df = data.frame( + dt = data.frame( target = runif(10), x.1 = rep(c(1, 0), 5), x.2 = rep(c(0, 1), 5), + .a = runif(10), a = runif(10) ) - task = TaskRegr$new(id = "decode", backend = df, target = "target") + task = TaskRegr$new(id = "decode", backend = dt, target = "target") train_out = 
op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + .a = dt$.a, + a = dt$a, + x = as.factor(rep(c(1, 2), times = 5)) + ) + expect_equal(train_out, dt_compare) + + # can handle task with no matches to group_pattern dt = data.table( - x = rep(c(1, 2), each = 5), - a = df$a + target = runif(10), + a = runif(10) ) + task = TaskRegr$new(id = "decode", backend = dt, target = "target") + train_out = op$train(list(task))[[1]]$data() expect_equal(train_out, dt) + # test tiebreak }) test_that("PipeOpDecode - treatment encoding", { op = PipeOpDecode$new() op$param_set$values$treatment_encoding = TRUE - df = data.frame( + dt = data.table( target = runif(15), x.1 = rep(c(1, 0, 0), 5), x.2 = rep(c(0, 0, 1), 5), a = runif(15) ) - task = TaskRegr$new(id = "decode", backend = df, target = "target") + task = TaskRegr$new(id = "decode", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() - dt = data.table( - x = rep(c("1", "ref", "2"), times = 5), - a = df$a + dt_compare = data.table( + target = df$target, + a = df$a, + x = rep(c("1", "ref", "2"), times = 5) ) - expect_equal(train_out, dt) + expect_equal(train_out, dt_compare) + + # test cutoff + # test tiebreak }) -test_that("PipOpDecode - different regex patterns", { +test_that("PipOpDecode - collapse all into one", { op = PipeOpDecode$new() - op$param_set$values$regex_pattern = "" + op$param_set$values$group_pattern = "" - df = data.frame( + dt = data.frame( target = runif(15), x.1 = rep(c(1, 0, 0), 5), x.2 = rep(c(0, 0, 1), 5), a = rep(c(0, 1, 0), 5) ) - task = TaskRegr$new(id = "decode", backend = df, target = "target") + task = TaskRegr$new(id = "decode", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() - dt = data.table( - x = rep(c("x.1", "a", "x.2"), times = 5) + dt_compare = data.table( + target = dt$target, + pipeop.decoded = as.factor(rep(c("x.1", "a", "x.2"), times = 5)) ) - expect_equal(train_out, dt) + expect_equal(train_out, 
dt_compare) }) test_that("PipeOpDecode - errors", { op = PipeOpDecode$new() - - df = data.frame( - target = runif(15), - x.1 = rep(c(1, 0, 0), 5), - x.2 = rep(c(0, 1, 1), 5), - a = runif(15) + dt = data.frame( + target = runif(20), + x.1 = rep(c(1, 0), 10), + x.2 = rep(c(0, 1), 10), + .a = rep(1, 20) ) - task = TaskRegr$new(id = "decode", backend = df, target = "target") + task = TaskRegr$new(id = "decode", backend = dt, target = "target") - expect_error(op$train(list(task))) # due to non-unique which.max + # pattern without capturing group + op$param_set$values$group_pattern = "^[^.]+\\." + expect_error(op$train(list(task)), "nothing was captured") + # pattern that would result in empty column names + op$param_set$values$group_pattern = "^([^.]*)\\." + expect_error(op$train(list(task)), "produce empty string as decoded column name") }) From e952ba2275866427a8736087e0831dca2991189b Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Thu, 28 Nov 2024 18:53:36 +0100 Subject: [PATCH 10/18] finalized tests --- tests/testthat/test_pipeop_decode.R | 59 ++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R index 92bae0d1d..670e07d9e 100644 --- a/tests/testthat/test_pipeop_decode.R +++ b/tests/testthat/test_pipeop_decode.R @@ -1,14 +1,14 @@ context("PipeOpDecode") +#Notiz: tests mit mehreren gruppen und skallierten werten + test_that("PipeOpDecode - basic properties", { dt = data.table( target = runif(120), x.1 = rep(c(1, 0, 0), 40), x.2 = rep(c(0, 1, 0), 40), - x.3 = rep(c(0, 0, 1), 40), y.1 = rep(c(1, 0, 0), 40), y.2 = rep(c(0, 1, 0), 40), - y.3 = rep(c(0, 0, 1), 40), a = runif(120) ) task = TaskRegr$new(id = "decode", backend = dt, target = "target") @@ -23,6 +23,8 @@ test_that("PipeOpDecode - one-hot-encoding", { target = runif(10), x.1 = rep(c(1, 0), 5), x.2 = rep(c(0, 1), 5), + y.1 = rep(c(2, 1), 5), + y.2 = rep(c(1, 2), 5), .a = runif(10), a = runif(10) ) 
@@ -33,7 +35,8 @@ test_that("PipeOpDecode - one-hot-encoding", { target = dt$target, .a = dt$.a, a = dt$a, - x = as.factor(rep(c(1, 2), times = 5)) + x = as.factor(rep(c(1, 2), times = 5)), + y = as.factor(rep(c(1, 2), times = 5)) ) expect_equal(train_out, dt_compare) @@ -46,7 +49,30 @@ test_that("PipeOpDecode - one-hot-encoding", { train_out = op$train(list(task))[[1]]$data() expect_equal(train_out, dt) - # test tiebreak + # tiebreak + dt = data.frame( + target = runif(10), + x.1 = c(1, 0, 1, 0, 0), + x.2 = c(0, 1, 0, 1, 1), + x.3 = c(0, 0, 1, 1, 1) + ) + task = TaskRegr$new(id = "decode", backend = dt, target = "target") + + op$param_set$values$ties_method = "first" + train_out = op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + x = as.factor(c(1, 2, 1, 2, 2)) + ) + expect_equal(train_out, dt_compare) + + op$param_set$values$ties_method = "last" + train_out = op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + x = as.factor(c(1, 2, 3, 3, 3)) + ) + expect_equal(train_out, dt_compare) }) test_that("PipeOpDecode - treatment encoding", { @@ -56,21 +82,28 @@ test_that("PipeOpDecode - treatment encoding", { dt = data.table( target = runif(15), x.1 = rep(c(1, 0, 0), 5), - x.2 = rep(c(0, 0, 1), 5), + x.2 = rep(c(0, 0, 0.5), 5), a = runif(15) ) task = TaskRegr$new(id = "decode", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() dt_compare = data.table( - target = df$target, - a = df$a, - x = rep(c("1", "ref", "2"), times = 5) + target = dt$target, + a = dt$a, + x = as.factor(rep(c("1", "ref", "2"), times = 5)) ) expect_equal(train_out, dt_compare) # test cutoff - # test tiebreak + op$param_set$values$treatment_cutoff = 0.5 + train_out = op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + a = dt$a, + x = as.factor(rep(c("1", "ref", "ref"), times = 5)) + ) + expect_equal(train_out, dt_compare) }) test_that("PipOpDecode - collapse all into one", { @@ 
-79,16 +112,16 @@ test_that("PipOpDecode - collapse all into one", { dt = data.frame( target = runif(15), - x.1 = rep(c(1, 0, 0), 5), - x.2 = rep(c(0, 0, 1), 5), - a = rep(c(0, 1, 0), 5) + x = rep(c(1, 0, 0), 5), + y = rep(c(0, 1, 0), 5), + z = rep(c(0, 0, 1), 5) ) task = TaskRegr$new(id = "decode", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() dt_compare = data.table( target = dt$target, - pipeop.decoded = as.factor(rep(c("x.1", "a", "x.2"), times = 5)) + pipeop.decoded = as.factor(rep(c("x", "y", "z"), times = 5)) ) expect_equal(train_out, dt_compare) }) From 98e93d4528eb7d82e77ae3e959e5782d15f03a6b Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Thu, 28 Nov 2024 18:53:58 +0100 Subject: [PATCH 11/18] fixes + docs started --- R/PipeOpDecode.R | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index ed807598c..a6463f0c9 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -12,25 +12,39 @@ #' PipeOpEncode$new(id = "decode", param_vals = list()) #' ``` #' * `id` :: `character(1)`\cr -#' Identifier of resulting object, default `"encode"`. +#' Identifier of resulting object, default `"decode"`. #' * `param_vals` :: named `list`\cr #' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. #' #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with +#' The output is the input [`Task`][mlr3::Task] with decoded columns, i.e. reversed encoding. #' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: -#' * ... +#' * `colmaps` :: named `list`\cr +#' Named list of named character vectors. Each element is named according to the new column name extracted by `group_pattern`. 
+#' Each vector contains the level names for the new factor column that should be created, named by the corresponding old column name. +#' If `treament_encoding` is `TRUE`, then each vector also contains `"ref"` as the name of the reference class with empty string as name instead +#' of an old column name. +#' * `treatment_encoding` :: `logical(1)`\cr +#' Parameter `treatment_encoding`. +#' * `cutoff` :: `numeric(1)`\cr +#' Parameter `treatment_cutoff`. +#' * `ties_method` :: `character(1)`\cr +#' Parameter `ties_method`. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: -#' * ... +#' * `group_pattern` :: `character(1)`\cr +#' * `treatment_encoding` :: `logical(1)`\cr +#' * `treatment_cutoff` :: `numeric(1)`\cr +#' * `ties_method` :: `character(1)`\cr +#' Method for resolving ties, if multiple columns have the same value +#' to be passed to [`mlr3misc::which_max`]. In case of ties, either the `first`, the `last` or +#' a `random` column is picked. Initialized to `"random"`. #' -#' @section Internals: -#' Uses the [`stats::contrasts`] functions. This is relatively inefficient for features with a large number of levels. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
@@ -47,9 +61,9 @@ PipeOpDecode = R6Class("PipeOpDecode", public = list( initialize = function(id = "decode", param_vals = list()) { ps = ps( + group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")), treatment_encoding = p_lgl(tags = c("train", "required")), treatment_cutoff = p_dbl(default = 0, tags = "train", depends = quote(treatment_encoding == TRUE)), - group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")), ties_method = p_fct(c("first", "last", "random"), tags = c("train", "required")) ) ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]+)\\.", ties_method = "random") @@ -118,8 +132,8 @@ PipeOpDecode = R6Class("PipeOpDecode", cmap = split(lvls, fcts) if (pv$treatment_encoding) { # Append ref_name with empty name (i.e. "") to all list entries - for (map in cmap) { - map[[length(map) + 1]] = ref_name + for (i in seq_along(cmap)) { + cmap[[i]][[length(cmap[[i]]) + 1]] = ref_name } } @@ -143,16 +157,17 @@ PipeOpDecode = R6Class("PipeOpDecode", for (new_col in names(colmaps)) { lvls = colmaps[[new_col]] - old_cols = names(lvls) + # If existent, remove empty string element (for subsetting dt, later) + old_cols = discard(names(lvls), names(lvls) == "") # Create matrix from subset of dt with columns old_cols old_cols_matrix = as.matrix(dt[, old_cols, with = FALSE]) # Populate new column with name of column with maximal value per row set(dt, , new_col, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) - # If any value in old_cols_matrix are smaller than the cutoff, replace with empty string - # This implies replacement with reference level in next step. + # If all values in old_cols_matrix are smaller than or equal to the cutoff, replace with empty string + # This leads to replacement with reference level in next step. 
if (treatment_encoding) { - set(dt, rowSums(old_cols_matrix <= cutoff) > 0, "") + set(dt, which(rowSums(old_cols_matrix > cutoff) == 0), new_col, "") } # Replace occurrences of old column names with corresponding new level names set(dt, , new_col, factor(lvls[match(dt[[new_col]], names(lvls))], levels = lvls)) @@ -160,6 +175,8 @@ PipeOpDecode = R6Class("PipeOpDecode", # Drop old columns drop = unlist(lapply(colmaps, names)) + # If existent, remove empty string elements + drop = discard(drop, drop == "") dt[, (drop) := NULL] dt From 0b4c9abd2c16b993d52296f1a6c1bb787d76d2ec Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 29 Nov 2024 00:20:07 +0100 Subject: [PATCH 12/18] docs + run document --- DESCRIPTION | 1 + NAMESPACE | 1 + R/PipeOpDecode.R | 81 +++++++-- man/PipeOp.Rd | 1 + man/PipeOpEnsemble.Rd | 1 + man/PipeOpImpute.Rd | 1 + man/PipeOpTargetTrafo.Rd | 1 + man/PipeOpTaskPreproc.Rd | 1 + man/PipeOpTaskPreprocSimple.Rd | 1 + man/mlr_pipeops.Rd | 1 + man/mlr_pipeops_adas.Rd | 1 + man/mlr_pipeops_blsmote.Rd | 1 + man/mlr_pipeops_boxcox.Rd | 1 + man/mlr_pipeops_branch.Rd | 1 + man/mlr_pipeops_chunk.Rd | 1 + man/mlr_pipeops_classbalancing.Rd | 1 + man/mlr_pipeops_classifavg.Rd | 1 + man/mlr_pipeops_classweights.Rd | 1 + man/mlr_pipeops_colapply.Rd | 1 + man/mlr_pipeops_collapsefactors.Rd | 1 + man/mlr_pipeops_colroles.Rd | 1 + man/mlr_pipeops_copy.Rd | 1 + man/mlr_pipeops_datefeatures.Rd | 1 + man/mlr_pipeops_decode.Rd | 210 +++++++++++++++++++++++ man/mlr_pipeops_encode.Rd | 1 + man/mlr_pipeops_encodeimpact.Rd | 1 + man/mlr_pipeops_encodelmer.Rd | 1 + man/mlr_pipeops_featureunion.Rd | 1 + man/mlr_pipeops_filter.Rd | 1 + man/mlr_pipeops_fixfactors.Rd | 1 + man/mlr_pipeops_histbin.Rd | 1 + man/mlr_pipeops_ica.Rd | 1 + man/mlr_pipeops_imputeconstant.Rd | 1 + man/mlr_pipeops_imputehist.Rd | 1 + man/mlr_pipeops_imputelearner.Rd | 1 + man/mlr_pipeops_imputemean.Rd | 1 + man/mlr_pipeops_imputemedian.Rd | 1 + man/mlr_pipeops_imputemode.Rd | 1 + 
man/mlr_pipeops_imputeoor.Rd | 1 + man/mlr_pipeops_imputesample.Rd | 1 + man/mlr_pipeops_kernelpca.Rd | 1 + man/mlr_pipeops_learner.Rd | 1 + man/mlr_pipeops_missind.Rd | 1 + man/mlr_pipeops_modelmatrix.Rd | 1 + man/mlr_pipeops_multiplicityexply.Rd | 1 + man/mlr_pipeops_multiplicityimply.Rd | 1 + man/mlr_pipeops_mutate.Rd | 1 + man/mlr_pipeops_nearmiss.Rd | 1 + man/mlr_pipeops_nmf.Rd | 1 + man/mlr_pipeops_nop.Rd | 1 + man/mlr_pipeops_ovrsplit.Rd | 1 + man/mlr_pipeops_ovrunite.Rd | 1 + man/mlr_pipeops_pca.Rd | 1 + man/mlr_pipeops_proxy.Rd | 1 + man/mlr_pipeops_quantilebin.Rd | 1 + man/mlr_pipeops_randomprojection.Rd | 1 + man/mlr_pipeops_randomresponse.Rd | 1 + man/mlr_pipeops_regravg.Rd | 1 + man/mlr_pipeops_removeconstants.Rd | 1 + man/mlr_pipeops_renamecolumns.Rd | 1 + man/mlr_pipeops_replicate.Rd | 1 + man/mlr_pipeops_rowapply.Rd | 1 + man/mlr_pipeops_scale.Rd | 1 + man/mlr_pipeops_scalemaxabs.Rd | 1 + man/mlr_pipeops_scalerange.Rd | 1 + man/mlr_pipeops_select.Rd | 1 + man/mlr_pipeops_smote.Rd | 1 + man/mlr_pipeops_smotenc.Rd | 1 + man/mlr_pipeops_spatialsign.Rd | 1 + man/mlr_pipeops_subsample.Rd | 1 + man/mlr_pipeops_targetinvert.Rd | 1 + man/mlr_pipeops_targetmutate.Rd | 1 + man/mlr_pipeops_targettrafoscalerange.Rd | 1 + man/mlr_pipeops_textvectorizer.Rd | 1 + man/mlr_pipeops_threshold.Rd | 1 + man/mlr_pipeops_tomek.Rd | 1 + man/mlr_pipeops_tunethreshold.Rd | 1 + man/mlr_pipeops_unbranch.Rd | 1 + man/mlr_pipeops_updatetarget.Rd | 1 + man/mlr_pipeops_vtreat.Rd | 1 + man/mlr_pipeops_yeojohnson.Rd | 1 + tests/testthat/test_pipeop_decode.R | 34 ++-- 82 files changed, 372 insertions(+), 32 deletions(-) create mode 100644 man/mlr_pipeops_decode.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 53a9a06f2..63574ccae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -137,6 +137,7 @@ Collate: 'PipeOpCollapseFactors.R' 'PipeOpCopy.R' 'PipeOpDateFeatures.R' + 'PipeOpDecode.R' 'PipeOpEncode.R' 'PipeOpEncodeImpact.R' 'PipeOpEncodeLmer.R' diff --git a/NAMESPACE b/NAMESPACE index 
00539352c..6ae9bbec8 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -104,6 +104,7 @@ export(PipeOpColRoles) export(PipeOpCollapseFactors) export(PipeOpCopy) export(PipeOpDateFeatures) +export(PipeOpDecode) export(PipeOpEncode) export(PipeOpEncodeImpact) export(PipeOpEncodeLmer) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index a6463f0c9..7d787d802 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -1,11 +1,19 @@ -#' @title Factor Decoding +#' @title Reverse Encoding #' #' @usage NULL #' @name mlr_pipeops_decode #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Description +#' Reverses one-hot or treatment encoding of columns. It collapses multiple `numeric` or `integer` columns into one `factor` +#' column based on a specified grouping pattern of column names. +#' +#' May be applied to multiple groups of columns, grouped by matching a common naming pattern. The grouping pattern is +#' extracted to form the name of the newly derived `factor` column, and levels are constructed from the previous column +#' names, with parts matching the grouping pattern removed. The level per row of the new factor column is generally +#' determined as the name of the column with the maximum value in the group. +#' For example, columns `x.1` and `x.2` might be collapsed into a new factor column `x` with levels `1` and `2`, while +#' columns `y.1` and `y.2` might be interpreted as a separate group and collapsed into a new column `y`. #' #' @section Construction: #' ``` @@ -19,32 +27,42 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with decoded columns, i.e. reversed encoding. +#' The output is the input [`Task`][mlr3::Task] with encoding columns collapsed into new decoded columns. 
#' #' @section State: #' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: #' * `colmaps` :: named `list`\cr -#' Named list of named character vectors. Each element is named according to the new column name extracted by `group_pattern`. -#' Each vector contains the level names for the new factor column that should be created, named by the corresponding old column name. -#' If `treament_encoding` is `TRUE`, then each vector also contains `"ref"` as the name of the reference class with empty string as name instead -#' of an old column name. +#' Named list of named character vectors. Each element is named according to the new column name extracted by +#' `group_pattern`. Each vector contains the level names for the new factor column that should be created, named by +#' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `"ref"` as the +#' reference class with an empty string (`""`) as name. #' * `treatment_encoding` :: `logical(1)`\cr -#' Parameter `treatment_encoding`. +#' Indicates whether treatment encoding (`TRUE`) or one-hot encoding (`FALSE`) is assumed. #' * `cutoff` :: `numeric(1)`\cr -#' Parameter `treatment_cutoff`. +#' The cutoff value for identifying the reference level in case of treatment encoding. #' * `ties_method` :: `character(1)`\cr -#' Parameter `ties_method`. +#' Method for resolving ties when multiple columns have the same value. Options include `"first"`, `"last"`, or `"random"`. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: #' * `group_pattern` :: `character(1)`\cr +#' A [regular expression](`base::regex`) to be applied to column names. Should contain a capturing group for the new +#' column name, and match everything that should not be interpreted as the new factor levels (which are constructed as +#' the difference between column names and what `group_pattern` matches). 
+#' If set to `""`, all columns matching the `group_pattern` are collapsed into one factor column called +#' `pipeop.decoded`. Use [`PipeOpRenameColumns`] to rename this column. +#' Initialized to `"^([^.]+)\\."`, which would extract everything up to the first dot as the new column name and +#' construct new levels as everything after the first dot. #' * `treatment_encoding` :: `logical(1)`\cr +#' If `TRUE`, treatment encoding is assumed instead of one-hot encoding. Initialized to `FALSE`. #' * `treatment_cutoff` :: `numeric(1)`\cr +#' If `treatment_encoding` is `TRUE`, specifies a cutoff value for identifying the reference level. The reference level +#' is set to `"ref"` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all +#' columns in that group. To change the name of the reference level, use [`PipeOp???`] (Mutate? ColApply?). +#' Initialized to `0`. #' * `ties_method` :: `character(1)`\cr -#' Method for resolving ties, if multiple columns have the same value -#' to be passed to [`mlr3misc::which_max`]. In case of ties, either the `first`, the `last` or -#' a `random` column is picked. Initialized to `"random"`. -#' +#' Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns +#' with the same value is to be picked. Options are `"first"`, `"last"`, or `"random"`. Initialized to `"random"`. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
@@ -56,6 +74,41 @@ #' @examples #' library("mlr3") #' +#' # Create example task with one-hot encoding +#' df = data.frame( +#' target = runif(10), +#' x.1 = rep(c(1, 0), 5), +#' x.2 = rep(c(0, 1), 5), +#' y.1 = rep(c(1, 0), 5), +#' y.2 = rep(c(0, 1), 5), +#' a = runif(10) +#' ) +#' task = TaskRegr$new(id = "example", backend = df, target = "target") +#' +#' pop = po("decode") +#' +#' # Training +#' train_out = pop$train(list(task))[[1]] +#' # x.1 and x.2 are collapsed into x, same for y; a is ignored. +#' train_out$data() +#' +#' # Create example task with treatment encoding +#' df = data.frame( +#' target = runif(15), +#' x.1 = rep(c(1, 0, 0), 5), +#' x.2 = rep(c(0, 1, 0), 5) +#' ) +#' task = TaskRegr$new(id = "example", backend = df, target = "target") +#' +#' pop = po("decode") +#' pop$param_set$set_values(treatment_encoding = TRUE) +#' +#' # Training +#' train_out = pop$train(list(task))[[1]] +#' # x.1 and x.2 are collapsed into x; in rows where all values +#' # are smaller or equal to 0, the reference level is set +#' train_out$data() +#' PipeOpDecode = R6Class("PipeOpDecode", inherit = PipeOpTaskPreprocSimple, public = list( diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd index e04b5cd15..a7d760bfc 100644 --- a/man/PipeOp.Rd +++ b/man/PipeOp.Rd @@ -286,6 +286,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/PipeOpEnsemble.Rd b/man/PipeOpEnsemble.Rd index ea902621f..c4d10413e 100644 --- a/man/PipeOpEnsemble.Rd +++ b/man/PipeOpEnsemble.Rd @@ -118,6 +118,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff 
--git a/man/PipeOpImpute.Rd b/man/PipeOpImpute.Rd index 56d4a8690..9b06175b2 100644 --- a/man/PipeOpImpute.Rd +++ b/man/PipeOpImpute.Rd @@ -151,6 +151,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/PipeOpTargetTrafo.Rd b/man/PipeOpTargetTrafo.Rd index 31473482e..fa65a666a 100644 --- a/man/PipeOpTargetTrafo.Rd +++ b/man/PipeOpTargetTrafo.Rd @@ -159,6 +159,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/PipeOpTaskPreproc.Rd b/man/PipeOpTaskPreproc.Rd index 43c04036a..37b684be7 100644 --- a/man/PipeOpTaskPreproc.Rd +++ b/man/PipeOpTaskPreproc.Rd @@ -214,6 +214,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/PipeOpTaskPreprocSimple.Rd b/man/PipeOpTaskPreprocSimple.Rd index 69e0186d3..4e9fb7309 100644 --- a/man/PipeOpTaskPreprocSimple.Rd +++ b/man/PipeOpTaskPreprocSimple.Rd @@ -151,6 +151,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops.Rd b/man/mlr_pipeops.Rd index 0a9bddc1b..60c97eec2 100644 --- a/man/mlr_pipeops.Rd +++ b/man/mlr_pipeops.Rd @@ -88,6 +88,7 @@ Other PipeOps: 
\code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_adas.Rd b/man/mlr_pipeops_adas.Rd index 8663baff4..b023194de 100644 --- a/man/mlr_pipeops_adas.Rd +++ b/man/mlr_pipeops_adas.Rd @@ -111,6 +111,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_blsmote.Rd b/man/mlr_pipeops_blsmote.Rd index 0ad16183d..cf9ec0b78 100644 --- a/man/mlr_pipeops_blsmote.Rd +++ b/man/mlr_pipeops_blsmote.Rd @@ -116,6 +116,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_boxcox.Rd b/man/mlr_pipeops_boxcox.Rd index 2dd377c58..dda2d7777 100644 --- a/man/mlr_pipeops_boxcox.Rd +++ b/man/mlr_pipeops_boxcox.Rd @@ -102,6 +102,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_branch.Rd b/man/mlr_pipeops_branch.Rd index b2799c93f..0a8fb2090 100644 --- a/man/mlr_pipeops_branch.Rd +++ b/man/mlr_pipeops_branch.Rd @@ -120,6 +120,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, 
\code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_chunk.Rd b/man/mlr_pipeops_chunk.Rd index 2431862b6..805e26be9 100644 --- a/man/mlr_pipeops_chunk.Rd +++ b/man/mlr_pipeops_chunk.Rd @@ -99,6 +99,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_classbalancing.Rd b/man/mlr_pipeops_classbalancing.Rd index 1b4386e06..ba1a1b1c8 100644 --- a/man/mlr_pipeops_classbalancing.Rd +++ b/man/mlr_pipeops_classbalancing.Rd @@ -140,6 +140,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_classifavg.Rd b/man/mlr_pipeops_classifavg.Rd index 8a2d28851..49ff24c8c 100644 --- a/man/mlr_pipeops_classifavg.Rd +++ b/man/mlr_pipeops_classifavg.Rd @@ -116,6 +116,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_classweights.Rd b/man/mlr_pipeops_classweights.Rd index baa195e12..f14201030 100644 --- a/man/mlr_pipeops_classweights.Rd +++ b/man/mlr_pipeops_classweights.Rd @@ -119,6 +119,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git 
a/man/mlr_pipeops_colapply.Rd b/man/mlr_pipeops_colapply.Rd index 682b5e293..9c08b3e0a 100644 --- a/man/mlr_pipeops_colapply.Rd +++ b/man/mlr_pipeops_colapply.Rd @@ -129,6 +129,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_collapsefactors.Rd b/man/mlr_pipeops_collapsefactors.Rd index f0e9f0599..b9f3ff568 100644 --- a/man/mlr_pipeops_collapsefactors.Rd +++ b/man/mlr_pipeops_collapsefactors.Rd @@ -96,6 +96,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_colroles.Rd b/man/mlr_pipeops_colroles.Rd index 2fa18c5a9..28098350e 100644 --- a/man/mlr_pipeops_colroles.Rd +++ b/man/mlr_pipeops_colroles.Rd @@ -88,6 +88,7 @@ Other PipeOps: \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_copy.Rd b/man/mlr_pipeops_copy.Rd index b1e770b10..877f4c216 100644 --- a/man/mlr_pipeops_copy.Rd +++ b/man/mlr_pipeops_copy.Rd @@ -118,6 +118,7 @@ Other PipeOps: \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_datefeatures.Rd b/man/mlr_pipeops_datefeatures.Rd index 60dee4b79..00ee1e6a5 100644 --- 
a/man/mlr_pipeops_datefeatures.Rd +++ b/man/mlr_pipeops_datefeatures.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_collapsefactors}}, \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_decode.Rd b/man/mlr_pipeops_decode.Rd new file mode 100644 index 000000000..635d4db84 --- /dev/null +++ b/man/mlr_pipeops_decode.Rd @@ -0,0 +1,210 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/PipeOpDecode.R +\name{mlr_pipeops_decode} +\alias{mlr_pipeops_decode} +\alias{PipeOpDecode} +\title{Reverse Encoding} +\format{ +\code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. +} +\description{ +Reverses one-hot or treatment encoding of columns. It collapses multiple \code{numeric} or \code{integer} columns into one \code{factor} +column based on a specified grouping pattern of column names. + +May be applied to multiple groups of columns, grouped by matching a common naming pattern. The grouping pattern is +extracted to form the name of the newly derived \code{factor} column, and levels are constructed from the previous column +names, with parts matching the grouping pattern removed. The level per row of the new factor column is generally +determined as the name of the column with the maximum value in the group. +For example, columns \code{x.1} and \code{x.2} might be collapsed into a new factor column \code{x} with levels \code{1} and \code{2}, while +columns \code{y.1} and \code{y.2} might be interpreted as a separate group and collapsed into a new column \code{y}. +} +\section{Construction}{ + + +\if{html}{\out{
<div class="sourceCode">}}\preformatted{PipeOpEncode$new(id = "decode", param_vals = list()) +}\if{html}{\out{</div>
}} +\itemize{ +\item \code{id} :: \code{character(1)}\cr +Identifier of resulting object, default \code{"decode"}. +\item \code{param_vals} :: named \code{list}\cr +List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default \code{list()}. +} +} + +\section{Input and Output Channels}{ + +Input and output channels are inherited from \code{\link{PipeOpTaskPreproc}}. + +The output is the input \code{\link[mlr3:Task]{Task}} with encoding columns collapsed into new decoded columns. +} + +\section{State}{ + +The \verb{$state} is a named \code{list} with the \verb{$state} elements inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{colmaps} :: named \code{list}\cr +Named list of named character vectors. Each element is named according to the new column name extracted by +\code{group_pattern}. Each vector contains the level names for the new factor column that should be created, named by +the corresponding old column name. If \code{treatment_encoding} is \code{TRUE}, then each vector also contains \code{"ref"} as the +reference class with an empty string (\code{""}) as name. +\item \code{treatment_encoding} :: \code{logical(1)}\cr +Indicates whether treatment encoding (\code{TRUE}) or one-hot encoding (\code{FALSE}) is assumed. +\item \code{cutoff} :: \code{numeric(1)}\cr +The cutoff value for identifying the reference level in case of treatment encoding. +\item \code{ties_method} :: \code{character(1)}\cr +Method for resolving ties when multiple columns have the same value. Options include \code{"first"}, \code{"last"}, or \code{"random"}. +} +} + +\section{Parameters}{ + +The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: +\itemize{ +\item \code{group_pattern} :: \code{character(1)}\cr +A \href{`base::regex`}{regular expression} to be applied to column names. 
Should contain a capturing group for the new +column name, and match everything that should not be interpreted as the new factor levels (which are constructed as +the difference between column names and what \code{group_pattern} matches). +If set to \code{""}, all columns matching the \code{group_pattern} are collapsed into one factor column called +\code{pipeop.decoded}. Use \code{\link{PipeOpRenameColumns}} to rename this column. +Initialized to \code{"^([^.]+)\\\\."}, which would extract everything up to the first dot as the new column name and +construct new levels as everything after the first dot. +\item \code{treatment_encoding} :: \code{logical(1)}\cr +If \code{TRUE}, treatment encoding is assumed instead of one-hot encoding. Initialized to \code{FALSE}. +\item \code{treatment_cutoff} :: \code{numeric(1)}\cr +If \code{treatment_encoding} is \code{TRUE}, specifies a cutoff value for identifying the reference level. The reference level +is set to \code{"ref"} in rows where the value is less than or equal to a specified cutoff value (e.g., \code{0}) in all +columns in that group To change the name of the reference level, use \code{\link{PipeOp???}} (Mutate? ColApply?). +Initialized to \code{0}. +\item \code{ties_method} :: \code{character(1)}\cr +Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns +with the same value is to be picked. Options are \code{"first"}, \code{"last"}, or \code{"random"}. Initialized to \code{"random"}. +} +} + +\section{Methods}{ + +Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. 
+} + +\examples{ +library("mlr3") + +# Create example task with one-hot encoding +df = data.frame( + target = runif(10), + x.1 = rep(c(1, 0), 5), + x.2 = rep(c(0, 1), 5), + y.1 = rep(c(1, 0), 5), + y.2 = rep(c(0, 1), 5), + a = runif(10) +) +task = TaskRegr$new(id = "example", backend = df, target = "target") + +pop = po("decode") + +# Training +train_out = pop$train(list(task))[[1]] +# x.1 and x.2 are collapsed into x, same for y; a is ignored. +train_out$data() + +# Create example task with treatment encoding +df = data.frame( + target = runif(15), + x.1 = rep(c(1, 0, 0), 5), + x.2 = rep(c(0, 1, 0), 5) +) +task = TaskRegr$new(id = "example", backend = df, target = "target") + +pop = po("decode") +pop$param_set$set_values(treatment_encoding = TRUE) + +# Training +train_out = pop$train(list(task))[[1]] +# x.1 and x.2 are collapsed into x; in rows where all values +# are smaller than or equal to 0, the reference level is set +train_out$data() + +} +\seealso{ +https://mlr-org.com/pipeops.html + +Other PipeOps: +\code{\link{PipeOp}}, +\code{\link{PipeOpEnsemble}}, +\code{\link{PipeOpImpute}}, +\code{\link{PipeOpTargetTrafo}}, +\code{\link{PipeOpTaskPreproc}}, +\code{\link{PipeOpTaskPreprocSimple}}, +\code{\link{mlr_pipeops}}, +\code{\link{mlr_pipeops_adas}}, +\code{\link{mlr_pipeops_blsmote}}, +\code{\link{mlr_pipeops_boxcox}}, +\code{\link{mlr_pipeops_branch}}, +\code{\link{mlr_pipeops_chunk}}, +\code{\link{mlr_pipeops_classbalancing}}, +\code{\link{mlr_pipeops_classifavg}}, +\code{\link{mlr_pipeops_classweights}}, +\code{\link{mlr_pipeops_colapply}}, +\code{\link{mlr_pipeops_collapsefactors}}, +\code{\link{mlr_pipeops_colroles}}, +\code{\link{mlr_pipeops_copy}}, +\code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_encode}}, +\code{\link{mlr_pipeops_encodeimpact}}, +\code{\link{mlr_pipeops_encodelmer}}, +\code{\link{mlr_pipeops_featureunion}}, +\code{\link{mlr_pipeops_filter}}, +\code{\link{mlr_pipeops_fixfactors}}, +\code{\link{mlr_pipeops_histbin}}, 
+\code{\link{mlr_pipeops_ica}}, +\code{\link{mlr_pipeops_imputeconstant}}, +\code{\link{mlr_pipeops_imputehist}}, +\code{\link{mlr_pipeops_imputelearner}}, +\code{\link{mlr_pipeops_imputemean}}, +\code{\link{mlr_pipeops_imputemedian}}, +\code{\link{mlr_pipeops_imputemode}}, +\code{\link{mlr_pipeops_imputeoor}}, +\code{\link{mlr_pipeops_imputesample}}, +\code{\link{mlr_pipeops_kernelpca}}, +\code{\link{mlr_pipeops_learner}}, +\code{\link{mlr_pipeops_missind}}, +\code{\link{mlr_pipeops_modelmatrix}}, +\code{\link{mlr_pipeops_multiplicityexply}}, +\code{\link{mlr_pipeops_multiplicityimply}}, +\code{\link{mlr_pipeops_mutate}}, +\code{\link{mlr_pipeops_nearmiss}}, +\code{\link{mlr_pipeops_nmf}}, +\code{\link{mlr_pipeops_nop}}, +\code{\link{mlr_pipeops_ovrsplit}}, +\code{\link{mlr_pipeops_ovrunite}}, +\code{\link{mlr_pipeops_pca}}, +\code{\link{mlr_pipeops_proxy}}, +\code{\link{mlr_pipeops_quantilebin}}, +\code{\link{mlr_pipeops_randomprojection}}, +\code{\link{mlr_pipeops_randomresponse}}, +\code{\link{mlr_pipeops_regravg}}, +\code{\link{mlr_pipeops_removeconstants}}, +\code{\link{mlr_pipeops_renamecolumns}}, +\code{\link{mlr_pipeops_replicate}}, +\code{\link{mlr_pipeops_rowapply}}, +\code{\link{mlr_pipeops_scale}}, +\code{\link{mlr_pipeops_scalemaxabs}}, +\code{\link{mlr_pipeops_scalerange}}, +\code{\link{mlr_pipeops_select}}, +\code{\link{mlr_pipeops_smote}}, +\code{\link{mlr_pipeops_smotenc}}, +\code{\link{mlr_pipeops_spatialsign}}, +\code{\link{mlr_pipeops_subsample}}, +\code{\link{mlr_pipeops_targetinvert}}, +\code{\link{mlr_pipeops_targetmutate}}, +\code{\link{mlr_pipeops_targettrafoscalerange}}, +\code{\link{mlr_pipeops_textvectorizer}}, +\code{\link{mlr_pipeops_threshold}}, +\code{\link{mlr_pipeops_tomek}}, +\code{\link{mlr_pipeops_tunethreshold}}, +\code{\link{mlr_pipeops_unbranch}}, +\code{\link{mlr_pipeops_updatetarget}}, +\code{\link{mlr_pipeops_vtreat}}, +\code{\link{mlr_pipeops_yeojohnson}} +} +\concept{PipeOps} diff --git a/man/mlr_pipeops_encode.Rd 
b/man/mlr_pipeops_encode.Rd index 79e39e75e..cf345f306 100644 --- a/man/mlr_pipeops_encode.Rd +++ b/man/mlr_pipeops_encode.Rd @@ -132,6 +132,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_featureunion}}, diff --git a/man/mlr_pipeops_encodeimpact.Rd b/man/mlr_pipeops_encodeimpact.Rd index 18f532a7b..2ae9d72a6 100644 --- a/man/mlr_pipeops_encodeimpact.Rd +++ b/man/mlr_pipeops_encodeimpact.Rd @@ -114,6 +114,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodelmer}}, \code{\link{mlr_pipeops_featureunion}}, diff --git a/man/mlr_pipeops_encodelmer.Rd b/man/mlr_pipeops_encodelmer.Rd index 9d5d7fa90..39aeb9ae6 100644 --- a/man/mlr_pipeops_encodelmer.Rd +++ b/man/mlr_pipeops_encodelmer.Rd @@ -129,6 +129,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_featureunion}}, diff --git a/man/mlr_pipeops_featureunion.Rd b/man/mlr_pipeops_featureunion.Rd index 20fe861b0..142091cb0 100644 --- a/man/mlr_pipeops_featureunion.Rd +++ b/man/mlr_pipeops_featureunion.Rd @@ -134,6 +134,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_filter.Rd b/man/mlr_pipeops_filter.Rd index ebd0ed711..0e21c7693 100644 --- a/man/mlr_pipeops_filter.Rd +++ 
b/man/mlr_pipeops_filter.Rd @@ -165,6 +165,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_fixfactors.Rd b/man/mlr_pipeops_fixfactors.Rd index 9bf3536f3..9195643e2 100644 --- a/man/mlr_pipeops_fixfactors.Rd +++ b/man/mlr_pipeops_fixfactors.Rd @@ -89,6 +89,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_histbin.Rd b/man/mlr_pipeops_histbin.Rd index 5d03b0fc3..d405889dc 100644 --- a/man/mlr_pipeops_histbin.Rd +++ b/man/mlr_pipeops_histbin.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_ica.Rd b/man/mlr_pipeops_ica.Rd index 02435608b..4b8f7c3e8 100644 --- a/man/mlr_pipeops_ica.Rd +++ b/man/mlr_pipeops_ica.Rd @@ -129,6 +129,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputeconstant.Rd b/man/mlr_pipeops_imputeconstant.Rd index 2bdd6a834..be5797393 100644 --- a/man/mlr_pipeops_imputeconstant.Rd +++ b/man/mlr_pipeops_imputeconstant.Rd @@ -103,6 +103,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, 
\code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputehist.Rd b/man/mlr_pipeops_imputehist.Rd index 8d5e7e282..cdddf750c 100644 --- a/man/mlr_pipeops_imputehist.Rd +++ b/man/mlr_pipeops_imputehist.Rd @@ -95,6 +95,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputelearner.Rd b/man/mlr_pipeops_imputelearner.Rd index 993218618..c47cfa319 100644 --- a/man/mlr_pipeops_imputelearner.Rd +++ b/man/mlr_pipeops_imputelearner.Rd @@ -140,6 +140,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputemean.Rd b/man/mlr_pipeops_imputemean.Rd index 743d5e40d..8f02d01ba 100644 --- a/man/mlr_pipeops_imputemean.Rd +++ b/man/mlr_pipeops_imputemean.Rd @@ -88,6 +88,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputemedian.Rd b/man/mlr_pipeops_imputemedian.Rd index d6957908f..f56b2089a 100644 --- a/man/mlr_pipeops_imputemedian.Rd +++ b/man/mlr_pipeops_imputemedian.Rd @@ -88,6 +88,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, 
\code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputemode.Rd b/man/mlr_pipeops_imputemode.Rd index 2afef773d..b5a9088ae 100644 --- a/man/mlr_pipeops_imputemode.Rd +++ b/man/mlr_pipeops_imputemode.Rd @@ -95,6 +95,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputeoor.Rd b/man/mlr_pipeops_imputeoor.Rd index f66d6e767..955361f70 100644 --- a/man/mlr_pipeops_imputeoor.Rd +++ b/man/mlr_pipeops_imputeoor.Rd @@ -136,6 +136,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_imputesample.Rd b/man/mlr_pipeops_imputesample.Rd index a798e003b..ec4aa435d 100644 --- a/man/mlr_pipeops_imputesample.Rd +++ b/man/mlr_pipeops_imputesample.Rd @@ -90,6 +90,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_kernelpca.Rd b/man/mlr_pipeops_kernelpca.Rd index b80983868..5229de8a5 100644 --- a/man/mlr_pipeops_kernelpca.Rd +++ b/man/mlr_pipeops_kernelpca.Rd @@ -104,6 +104,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff 
--git a/man/mlr_pipeops_learner.Rd b/man/mlr_pipeops_learner.Rd index bfb923270..832bb1d75 100644 --- a/man/mlr_pipeops_learner.Rd +++ b/man/mlr_pipeops_learner.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_missind.Rd b/man/mlr_pipeops_missind.Rd index df3befe85..63baf7a61 100644 --- a/man/mlr_pipeops_missind.Rd +++ b/man/mlr_pipeops_missind.Rd @@ -118,6 +118,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_modelmatrix.Rd b/man/mlr_pipeops_modelmatrix.Rd index 88feb3fdb..a9244665d 100644 --- a/man/mlr_pipeops_modelmatrix.Rd +++ b/man/mlr_pipeops_modelmatrix.Rd @@ -94,6 +94,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_multiplicityexply.Rd b/man/mlr_pipeops_multiplicityexply.Rd index abdfe2f82..073c7265d 100644 --- a/man/mlr_pipeops_multiplicityexply.Rd +++ b/man/mlr_pipeops_multiplicityexply.Rd @@ -100,6 +100,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_multiplicityimply.Rd b/man/mlr_pipeops_multiplicityimply.Rd index 
23e0a3e1d..5ace584c4 100644 --- a/man/mlr_pipeops_multiplicityimply.Rd +++ b/man/mlr_pipeops_multiplicityimply.Rd @@ -106,6 +106,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_mutate.Rd b/man/mlr_pipeops_mutate.Rd index 4bc31cc8b..d41f93336 100644 --- a/man/mlr_pipeops_mutate.Rd +++ b/man/mlr_pipeops_mutate.Rd @@ -111,6 +111,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_nearmiss.Rd b/man/mlr_pipeops_nearmiss.Rd index ce5a1908b..03c5d04ff 100644 --- a/man/mlr_pipeops_nearmiss.Rd +++ b/man/mlr_pipeops_nearmiss.Rd @@ -112,6 +112,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_nmf.Rd b/man/mlr_pipeops_nmf.Rd index 95332aedf..ed82b8251 100644 --- a/man/mlr_pipeops_nmf.Rd +++ b/man/mlr_pipeops_nmf.Rd @@ -147,6 +147,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_nop.Rd b/man/mlr_pipeops_nop.Rd index c5ada4ead..396b3da8e 100644 --- a/man/mlr_pipeops_nop.Rd +++ b/man/mlr_pipeops_nop.Rd @@ -96,6 +96,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, 
\code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_ovrsplit.Rd b/man/mlr_pipeops_ovrsplit.Rd index 48711b6b6..12686c86e 100644 --- a/man/mlr_pipeops_ovrsplit.Rd +++ b/man/mlr_pipeops_ovrsplit.Rd @@ -113,6 +113,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_ovrunite.Rd b/man/mlr_pipeops_ovrunite.Rd index e24fe4c8b..17621d38a 100644 --- a/man/mlr_pipeops_ovrunite.Rd +++ b/man/mlr_pipeops_ovrunite.Rd @@ -108,6 +108,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_pca.Rd b/man/mlr_pipeops_pca.Rd index 57b211c53..84214665a 100644 --- a/man/mlr_pipeops_pca.Rd +++ b/man/mlr_pipeops_pca.Rd @@ -105,6 +105,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_proxy.Rd b/man/mlr_pipeops_proxy.Rd index 62983a50a..527d0d5b3 100644 --- a/man/mlr_pipeops_proxy.Rd +++ b/man/mlr_pipeops_proxy.Rd @@ -119,6 +119,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, 
\code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_quantilebin.Rd b/man/mlr_pipeops_quantilebin.Rd index 169a46189..9ffefd526 100644 --- a/man/mlr_pipeops_quantilebin.Rd +++ b/man/mlr_pipeops_quantilebin.Rd @@ -93,6 +93,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_randomprojection.Rd b/man/mlr_pipeops_randomprojection.Rd index b175d28cc..5ef11cf0e 100644 --- a/man/mlr_pipeops_randomprojection.Rd +++ b/man/mlr_pipeops_randomprojection.Rd @@ -105,6 +105,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_randomresponse.Rd b/man/mlr_pipeops_randomresponse.Rd index f6abf31eb..62f4d7198 100644 --- a/man/mlr_pipeops_randomresponse.Rd +++ b/man/mlr_pipeops_randomresponse.Rd @@ -122,6 +122,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_regravg.Rd b/man/mlr_pipeops_regravg.Rd index c2caf6b06..8c45496be 100644 --- a/man/mlr_pipeops_regravg.Rd +++ b/man/mlr_pipeops_regravg.Rd @@ -108,6 +108,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff 
--git a/man/mlr_pipeops_removeconstants.Rd b/man/mlr_pipeops_removeconstants.Rd index 8f8e58076..ca867d47c 100644 --- a/man/mlr_pipeops_removeconstants.Rd +++ b/man/mlr_pipeops_removeconstants.Rd @@ -98,6 +98,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_renamecolumns.Rd b/man/mlr_pipeops_renamecolumns.Rd index a229e4688..46d18670c 100644 --- a/man/mlr_pipeops_renamecolumns.Rd +++ b/man/mlr_pipeops_renamecolumns.Rd @@ -97,6 +97,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_replicate.Rd b/man/mlr_pipeops_replicate.Rd index 2fbd2f941..25e22555c 100644 --- a/man/mlr_pipeops_replicate.Rd +++ b/man/mlr_pipeops_replicate.Rd @@ -90,6 +90,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_rowapply.Rd b/man/mlr_pipeops_rowapply.Rd index a2d0b05cc..1dabe4803 100644 --- a/man/mlr_pipeops_rowapply.Rd +++ b/man/mlr_pipeops_rowapply.Rd @@ -96,6 +96,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_scale.Rd b/man/mlr_pipeops_scale.Rd index cd89c5112..db70aaced 100644 --- 
a/man/mlr_pipeops_scale.Rd +++ b/man/mlr_pipeops_scale.Rd @@ -112,6 +112,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_scalemaxabs.Rd b/man/mlr_pipeops_scalemaxabs.Rd index 7a116ab1e..b390746b9 100644 --- a/man/mlr_pipeops_scalemaxabs.Rd +++ b/man/mlr_pipeops_scalemaxabs.Rd @@ -87,6 +87,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_scalerange.Rd b/man/mlr_pipeops_scalerange.Rd index 4074f7935..8c069242e 100644 --- a/man/mlr_pipeops_scalerange.Rd +++ b/man/mlr_pipeops_scalerange.Rd @@ -92,6 +92,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_select.Rd b/man/mlr_pipeops_select.Rd index 09378f95e..b733a0bcd 100644 --- a/man/mlr_pipeops_select.Rd +++ b/man/mlr_pipeops_select.Rd @@ -108,6 +108,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_smote.Rd b/man/mlr_pipeops_smote.Rd index 824d3b331..d9dca2e82 100644 --- a/man/mlr_pipeops_smote.Rd +++ b/man/mlr_pipeops_smote.Rd @@ -111,6 +111,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, 
\code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_smotenc.Rd b/man/mlr_pipeops_smotenc.Rd index b7b321973..82a447c49 100644 --- a/man/mlr_pipeops_smotenc.Rd +++ b/man/mlr_pipeops_smotenc.Rd @@ -120,6 +120,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_spatialsign.Rd b/man/mlr_pipeops_spatialsign.Rd index 3271ffec9..42c159293 100644 --- a/man/mlr_pipeops_spatialsign.Rd +++ b/man/mlr_pipeops_spatialsign.Rd @@ -87,6 +87,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_subsample.Rd b/man/mlr_pipeops_subsample.Rd index f0eb2c63b..f66c0777d 100644 --- a/man/mlr_pipeops_subsample.Rd +++ b/man/mlr_pipeops_subsample.Rd @@ -102,6 +102,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_targetinvert.Rd b/man/mlr_pipeops_targetinvert.Rd index fcd24816e..164b718fd 100644 --- a/man/mlr_pipeops_targetinvert.Rd +++ b/man/mlr_pipeops_targetinvert.Rd @@ -87,6 +87,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, 
\code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_targetmutate.Rd b/man/mlr_pipeops_targetmutate.Rd index 1d84ccfa7..e6de74cd0 100644 --- a/man/mlr_pipeops_targetmutate.Rd +++ b/man/mlr_pipeops_targetmutate.Rd @@ -135,6 +135,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_targettrafoscalerange.Rd b/man/mlr_pipeops_targettrafoscalerange.Rd index bf5592cb6..cfbffcfcb 100644 --- a/man/mlr_pipeops_targettrafoscalerange.Rd +++ b/man/mlr_pipeops_targettrafoscalerange.Rd @@ -101,6 +101,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_textvectorizer.Rd b/man/mlr_pipeops_textvectorizer.Rd index bbf4948ed..6c91ce67b 100644 --- a/man/mlr_pipeops_textvectorizer.Rd +++ b/man/mlr_pipeops_textvectorizer.Rd @@ -201,6 +201,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_threshold.Rd b/man/mlr_pipeops_threshold.Rd index f6c4d32bc..d9820f592 100644 --- a/man/mlr_pipeops_threshold.Rd +++ b/man/mlr_pipeops_threshold.Rd @@ -100,6 +100,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, 
\code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_tomek.Rd b/man/mlr_pipeops_tomek.Rd index 490f8e929..77720a37f 100644 --- a/man/mlr_pipeops_tomek.Rd +++ b/man/mlr_pipeops_tomek.Rd @@ -105,6 +105,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_tunethreshold.Rd b/man/mlr_pipeops_tunethreshold.Rd index 04a47d55a..acac7896e 100644 --- a/man/mlr_pipeops_tunethreshold.Rd +++ b/man/mlr_pipeops_tunethreshold.Rd @@ -130,6 +130,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_unbranch.Rd b/man/mlr_pipeops_unbranch.Rd index 03cae3768..490b6bc26 100644 --- a/man/mlr_pipeops_unbranch.Rd +++ b/man/mlr_pipeops_unbranch.Rd @@ -99,6 +99,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_updatetarget.Rd b/man/mlr_pipeops_updatetarget.Rd index c06359e07..eb96c8fb1 100644 --- a/man/mlr_pipeops_updatetarget.Rd +++ b/man/mlr_pipeops_updatetarget.Rd @@ -114,6 +114,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_vtreat.Rd 
b/man/mlr_pipeops_vtreat.Rd index fd33f6fad..3cc0691e4 100644 --- a/man/mlr_pipeops_vtreat.Rd +++ b/man/mlr_pipeops_vtreat.Rd @@ -167,6 +167,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_yeojohnson.Rd b/man/mlr_pipeops_yeojohnson.Rd index d157c79ec..36ebed286 100644 --- a/man/mlr_pipeops_yeojohnson.Rd +++ b/man/mlr_pipeops_yeojohnson.Rd @@ -104,6 +104,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R index 670e07d9e..7a4671e07 100644 --- a/tests/testthat/test_pipeop_decode.R +++ b/tests/testthat/test_pipeop_decode.R @@ -1,15 +1,13 @@ context("PipeOpDecode") -#Notiz: tests mit mehreren gruppen und skallierten werten - test_that("PipeOpDecode - basic properties", { dt = data.table( - target = runif(120), - x.1 = rep(c(1, 0, 0), 40), - x.2 = rep(c(0, 1, 0), 40), - y.1 = rep(c(1, 0, 0), 40), - y.2 = rep(c(0, 1, 0), 40), - a = runif(120) + target = runif(12), + x.1 = rep(c(1, 0, 0), 4), + x.2 = rep(c(0, 1, 0), 4), + y.1 = rep(c(1, 0, 0), 4), + y.2 = rep(c(0, 1, 0), 4), + a = runif(12) ) task = TaskRegr$new(id = "decode", backend = dt, target = "target") @@ -19,7 +17,7 @@ test_that("PipeOpDecode - basic properties", { test_that("PipeOpDecode - one-hot-encoding", { op = PipeOpDecode$new() - dt = data.frame( + dt = data.table( target = runif(10), x.1 = rep(c(1, 0), 5), x.2 = rep(c(0, 1), 5), @@ -28,7 +26,7 @@ test_that("PipeOpDecode - one-hot-encoding", { .a = runif(10), a = runif(10) ) - task = TaskRegr$new(id = 
"decode", backend = dt, target = "target") + task = TaskRegr$new(id = "test", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() dt_compare = data.table( @@ -45,18 +43,18 @@ test_that("PipeOpDecode - one-hot-encoding", { target = runif(10), a = runif(10) ) - task = TaskRegr$new(id = "decode", backend = dt, target = "target") + task = TaskRegr$new(id = "test", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() expect_equal(train_out, dt) # tiebreak - dt = data.frame( + dt = data.table( target = runif(10), x.1 = c(1, 0, 1, 0, 0), x.2 = c(0, 1, 0, 1, 1), x.3 = c(0, 0, 1, 1, 1) ) - task = TaskRegr$new(id = "decode", backend = dt, target = "target") + task = TaskRegr$new(id = "test", backend = dt, target = "target") op$param_set$values$ties_method = "first" train_out = op$train(list(task))[[1]]$data() @@ -85,7 +83,7 @@ test_that("PipeOpDecode - treatment encoding", { x.2 = rep(c(0, 0, 0.5), 5), a = runif(15) ) - task = TaskRegr$new(id = "decode", backend = dt, target = "target") + task = TaskRegr$new(id = "test", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() dt_compare = data.table( @@ -110,13 +108,13 @@ test_that("PipOpDecode - collapse all into one", { op = PipeOpDecode$new() op$param_set$values$group_pattern = "" - dt = data.frame( + dt = data.table( target = runif(15), x = rep(c(1, 0, 0), 5), y = rep(c(0, 1, 0), 5), z = rep(c(0, 0, 1), 5) ) - task = TaskRegr$new(id = "decode", backend = dt, target = "target") + task = TaskRegr$new(id = "test", backend = dt, target = "target") train_out = op$train(list(task))[[1]]$data() dt_compare = data.table( @@ -128,13 +126,13 @@ test_that("PipOpDecode - collapse all into one", { test_that("PipeOpDecode - errors", { op = PipeOpDecode$new() - dt = data.frame( + dt = data.table( target = runif(20), x.1 = rep(c(1, 0), 10), x.2 = rep(c(0, 1), 10), .a = rep(1, 20) ) - task = TaskRegr$new(id = "decode", backend = dt, target = "target") + task = 
TaskRegr$new(id = "test", backend = dt, target = "target") # pattern without capturing group op$param_set$values$group_pattern = "^[^.]+\\." From 4e9d1ad4bc6acb42a4647df1ac1e9f9bccf40c22 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 29 Nov 2024 00:29:46 +0100 Subject: [PATCH 13/18] fix doc link --- R/PipeOpDecode.R | 4 ++-- man/mlr_pipeops_decode.Rd | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index 7d787d802..173138e2e 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -35,7 +35,7 @@ #' Named list of named character vectors. Each element is named according to the new column name extracted by #' `group_pattern`. Each vector contains the level names for the new factor column that should be created, named by #' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `"ref"` as the -#' reference class with an empty string (`""`) as name. +#' reference class with an empty string as name. #' * `treatment_encoding` :: `logical(1)`\cr #' Indicates whether treatment encoding (`TRUE`) or one-hot encoding (`FALSE`) is assumed. #' * `cutoff` :: `numeric(1)`\cr @@ -46,7 +46,7 @@ #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: #' * `group_pattern` :: `character(1)`\cr -#' A [regular expression](`base::regex`) to be applied to column names. Should contain a capturing group for the new +#' A regular expression to be applied to column names. Should contain a capturing group for the new #' column name, and match everything that should not be interpreted as the new factor levels (which are constructed as #' the difference between column names and what `group_pattern` matches). 
#' If set to `""`, all columns matching the `group_pattern` are collapsed into one factor column called diff --git a/man/mlr_pipeops_decode.Rd b/man/mlr_pipeops_decode.Rd index 635d4db84..682bda28e 100644 --- a/man/mlr_pipeops_decode.Rd +++ b/man/mlr_pipeops_decode.Rd @@ -46,7 +46,7 @@ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherit Named list of named character vectors. Each element is named according to the new column name extracted by \code{group_pattern}. Each vector contains the level names for the new factor column that should be created, named by the corresponding old column name. If \code{treatment_encoding} is \code{TRUE}, then each vector also contains \code{"ref"} as the -reference class with an empty string (\code{""}) as name. +reference class with an empty string as name. \item \code{treatment_encoding} :: \code{logical(1)}\cr Indicates whether treatment encoding (\code{TRUE}) or one-hot encoding (\code{FALSE}) is assumed. \item \code{cutoff} :: \code{numeric(1)}\cr @@ -61,7 +61,7 @@ Method for resolving ties when multiple columns have the same value. Options inc The parameters are the parameters inherited from \code{\link{PipeOpTaskPreproc}}, as well as: \itemize{ \item \code{group_pattern} :: \code{character(1)}\cr -A \href{`base::regex`}{regular expression} to be applied to column names. Should contain a capturing group for the new +A regular expression to be applied to column names. Should contain a capturing group for the new column name, and match everything that should not be interpreted as the new factor levels (which are constructed as the difference between column names and what \code{group_pattern} matches). 
If set to \code{""}, all columns matching the \code{group_pattern} are collapsed into one factor column called From 79351c3ca3e6f120a5c9b70d73f6617ce91c87bc Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Sun, 1 Dec 2024 23:58:46 +0100 Subject: [PATCH 14/18] updated NEWS.mde --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index f6d094e79..c403ce0dc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * Fix: `PipeOpCollapseFactors` now correctly collapses levels of ordered factors. * Fix: `LearnerClassifAvg` and `LearnerRegrAvg` hyperparameters get the `"required"` tag. * New parameter `use_groups` (default `TRUE`) for `PipeOpSubsampling` to respect grouping (changed default behaviour for grouped data) +* New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding. # mlr3pipelines 0.7.1 From a0fb01c88047210878953fa08867fd407bc72457 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Tue, 3 Dec 2024 09:58:44 +0100 Subject: [PATCH 15/18] small changes --- R/PipeOpDecode.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index 173138e2e..b2ead4621 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -142,7 +142,7 @@ PipeOpDecode = R6Class("PipeOpDecode", # If pattern == "", all columns are collapsed into one column. # Note, that column "pipeop.decoded" gets overwritten if it already exists. if (pv$group_pattern == "") { - cmap = list(pipeop.decoded = c(set_names(cols, cols))) + cmap = list(pipeop.decoded = set_names(cols, cols)) if (pv$treatment_encoding) { # Append ref_name with empty name (i.e. "") cmap[[pipeop.decoded]][[length(cols) + 1]] = ref_name @@ -174,7 +174,7 @@ PipeOpDecode = R6Class("PipeOpDecode", } fcts = map_chr(matches, 2) - # Error if no group could be extracted for an entry in col. Thus, we could not create a column name from it. 
+ # Error, if no group could be extracted for an entry in col so that we could not create a column name from it. if (any(nchar(fcts) == 0)) { stopf("Pattern %s with column(s) %s would produce empty string as decoded column name(s). Try using a different pattern.", str_collapse(pv$group_pattern, quote = '"'), @@ -210,7 +210,7 @@ PipeOpDecode = R6Class("PipeOpDecode", for (new_col in names(colmaps)) { lvls = colmaps[[new_col]] - # If existent, remove empty string element (for subsetting dt, later) + # If existent, remove empty string element (for subsetting dt in next step) old_cols = discard(names(lvls), names(lvls) == "") # Create matrix from subset of dt with columns old_cols From 15ca6f68a363f2aa1f98e90254be4a57eafa6e91 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 6 Dec 2024 17:10:11 +0100 Subject: [PATCH 16/18] extracted incrementing suffix as helper function + param for ref level + tests for this --- R/PipeOpDecode.R | 78 ++++++++++++++++------------- tests/testthat/test_pipeop_decode.R | 39 +++++++++++++++ 2 files changed, 82 insertions(+), 35 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index b2ead4621..f0b342b20 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -1,4 +1,4 @@ -#' @title Reverse Encoding +#' @title Reverse Factor Encoding #' #' @usage NULL #' @name mlr_pipeops_decode @@ -34,7 +34,7 @@ #' * `colmaps` :: named `list`\cr #' Named list of named character vectors. Each element is named according to the new column name extracted by #' `group_pattern`. Each vector contains the level names for the new factor column that should be created, named by -#' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `"ref"` as the +#' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `reflevel_name` as the #' reference class with an empty string as name. 
#' * `treatment_encoding` :: `logical(1)`\cr #' Indicates whether treatment encoding (`TRUE`) or one-hot encoding (`FALSE`) is assumed. @@ -57,12 +57,14 @@ #' If `TRUE`, treatment encoding is assumed instead of one-hot encoding. Initialized to `FALSE`. #' * `treatment_cutoff` :: `numeric(1)`\cr #' If `treatment_encoding` is `TRUE`, specifies a cutoff value for identifying the reference level. The reference level -#' is set to `"ref"` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all +#' is set to `reflevel_name` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all #' columns in that group To change the name of the reference level, use [`PipeOp???`] (Mutate? ColApply?). -#' Initialized to `0`. +#' Default is `0`. #' * `ties_method` :: `character(1)`\cr #' Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns #' with the same value is to be picked. Options are `"first"`, `"last"`, or `"random"`. Initialized to `"random"`. +#' * `ref_name` :: `character(1)`\cr +#' Default is `"ref"`. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. 
@@ -117,6 +119,7 @@ PipeOpDecode = R6Class("PipeOpDecode", group_pattern = p_uty(custom_check = check_string, tags = c("train", "required")), treatment_encoding = p_lgl(tags = c("train", "required")), treatment_cutoff = p_dbl(default = 0, tags = "train", depends = quote(treatment_encoding == TRUE)), + ref_name = p_uty(custom_check = crate(function(x) check_string(x, min.chars = 1)), tags = "train", depends = quote(treatment_encoding == TRUE)), ties_method = p_fct(c("first", "last", "random"), tags = c("train", "required")) ) ps$values = list(treatment_encoding = FALSE, group_pattern = "^([^.]+)\\.", ties_method = "random") @@ -127,25 +130,17 @@ PipeOpDecode = R6Class("PipeOpDecode", .get_state_dt = function(dt, levels, target) { pv = self$param_set$values + ref_name = pv$ref_name %??% "ref" cols = colnames(dt) - if (pv$treatment_encoding) { - # Determine name for reference level - ref_name = "ref" - counter = 1 - while (ref_name %in% cols) { - ref_name = paste0("ref.", counter) - counter = counter + 1 - } - } - # If pattern == "", all columns are collapsed into one column. # Note, that column "pipeop.decoded" gets overwritten if it already exists. if (pv$group_pattern == "") { cmap = list(pipeop.decoded = set_names(cols, cols)) + if (pv$treatment_encoding) { - # Append ref_name with empty name (i.e. "") - cmap[[pipeop.decoded]][[length(cols) + 1]] = ref_name + # Append reference level with empty name (i.e. "") + cmap[["pipeop.decoded"]][[length(cols) + 1]] = get_ref_name(ref_name, cmap[["pipeop.decoded"]]) } s = list( @@ -168,12 +163,12 @@ PipeOpDecode = R6Class("PipeOpDecode", matches = regmatches(cols, regexec(pv$group_pattern, cols, perl = TRUE)) # Error, if nothing was captured. if (any(lengths(matches) < 2)) { - stopf("Pattern %s matches column name %s, but nothing was captured. Make sure group_pattern contains a capturing group.", + stopf("Pattern %s matches column name %s, but nothing was captured. 
Make sure \"group_pattern\" contains a capturing group or is an empty string to collapse all colunns into one factor.", str_collapse(pv$group_pattern, quote = '"'), str_collapse(cols[lengths(matches) < 2], quote = '"')) } - fcts = map_chr(matches, 2) + fcts = map_chr(matches, 2) # Error, if no group could be extracted for an entry in col so that we could not create a column name from it. if (any(nchar(fcts) == 0)) { stopf("Pattern %s with column(s) %s would produce empty string as decoded column name(s). Try using a different pattern.", @@ -183,10 +178,11 @@ PipeOpDecode = R6Class("PipeOpDecode", # Create mapping of old column names and derived levels to new column names cmap = split(lvls, fcts) + if (pv$treatment_encoding) { - # Append ref_name with empty name (i.e. "") to all list entries + # Append reference level with empty name (i.e. "") to all list entries for (i in seq_along(cmap)) { - cmap[[i]][[length(cmap[[i]]) + 1]] = ref_name + cmap[[i]][[length(cmap[[i]]) + 1]] = get_ref_name(ref_name, cmap[[i]]) } } @@ -200,41 +196,53 @@ PipeOpDecode = R6Class("PipeOpDecode", .transform_dt = function(dt, levels) { colmaps = self$state$colmaps - # Early exit if no mapping is required if (!length(colmaps)) { - return(dt) + return(dt) # Early exit if no mapping is required } cutoff = self$state$cutoff ties_method = self$state$ties_method treatment_encoding = self$state$treatment_encoding - for (new_col in names(colmaps)) { + dt_collapsed = data.table() + lapply(names(colmaps), function(new_col) { lvls = colmaps[[new_col]] - # If existent, remove empty string element (for subsetting dt in next step) + # Get old column names and, if existent, remove empty string element (for subsetting dt in next step) old_cols = discard(names(lvls), names(lvls) == "") - - # Create matrix from subset of dt with columns old_cols + # Create matrix from subset of dt with column names given by old_cols old_cols_matrix = as.matrix(dt[, old_cols, with = FALSE]) # Populate new column 
with name of column with maximal value per row - set(dt, , new_col, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) - # If all values in old_cols_matrix are smaller than or equal to the cutoff, replace with empty string - # This leads to replacement with reference level in next step. + set(dt_collapsed, , new_col, old_cols[apply(old_cols_matrix, 1, which_max, ties_method = ties_method)]) if (treatment_encoding) { - set(dt, which(rowSums(old_cols_matrix > cutoff) == 0), new_col, "") + # If all values in old_cols_matrix are smaller than or equal to the cutoff, replace with empty string + # This leads to replacement with reference level in next step. + set(dt_collapsed, which(rowSums(old_cols_matrix > cutoff) == 0), new_col, "") } # Replace occurrences of old column names with corresponding new level names - set(dt, , new_col, factor(lvls[match(dt[[new_col]], names(lvls))], levels = lvls)) - } + set(dt_collapsed, , new_col, factor(lvls[match(dt_collapsed[[new_col]], names(lvls))], levels = lvls)) + }) - # Drop old columns + # Drop old columns (if existent, remove empty string elements, to allow subsetting) drop = unlist(lapply(colmaps, names)) - # If existent, remove empty string elements drop = discard(drop, drop == "") dt[, (drop) := NULL] - dt + # cbind new columns + do.call(cbind, list(dt, dt_collapsed)) } ) ) mlr_pipeops$add("decode", PipeOpDecode) + +# Ensures the reference level name is unique for a given factor by appending an incrementing suffix if needed. 
+# * ref_name: name of the reference level by default +# * lvl_names: all other level names for a given factor +get_ref_name = function(ref_name, lvl_names) { + new_ref_name = ref_name + counter = 1 + while (new_ref_name %in% lvl_names) { + new_ref_name = paste0(ref_name, ".", counter) + counter = counter + 1 + } + new_ref_name +} diff --git a/tests/testthat/test_pipeop_decode.R b/tests/testthat/test_pipeop_decode.R index 7a4671e07..bea1591a1 100644 --- a/tests/testthat/test_pipeop_decode.R +++ b/tests/testthat/test_pipeop_decode.R @@ -71,6 +71,27 @@ test_that("PipeOpDecode - one-hot-encoding", { x = as.factor(c(1, 2, 3, 3, 3)) ) expect_equal(train_out, dt_compare) + + # no name collision + op$param_set$values$group_pattern = "^(.+)\\." # matches everything till last dot + + dt = data.table( + target = runif(10), + x.1 = rep(c(1, 0), 5), + x.2 = rep(c(0, 1), 5), + x.1.a = rep(c(2, 1), 5), + x.1.b = rep(c(1, 2), 5) + ) + task = TaskRegr$new(id = "test", backend = dt, target = "target") + + train_out = op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + x = as.factor(rep(c(1, 2), 5)), + x.1 = as.factor(rep(c("a", "b"), 5)) + ) + expect_equal(train_out, dt_compare) + }) test_that("PipeOpDecode - treatment encoding", { @@ -102,6 +123,24 @@ test_that("PipeOpDecode - treatment encoding", { x = as.factor(rep(c("1", "ref", "ref"), times = 5)) ) expect_equal(train_out, dt_compare) + op$param_set$values$treatment_cutoff = 0 + + # test incrementing reference level name + op$param_set$values$ref_name = "x" + dt = data.table( + target = runif(15), + x.x = rep(c(1, 0, 0), 5), + x.x.1 = rep(c(0, 0, 0.5), 5) + ) + task = TaskRegr$new(id = "test", backend = dt, target = "target") + + train_out = op$train(list(task))[[1]]$data() + dt_compare = data.table( + target = dt$target, + x = as.factor(rep(c("x", "x.2", "x.1"), times = 5)) + ) + expect_equal(train_out, dt_compare) + }) test_that("PipOpDecode - collapse all into one", { From 
0fe923f0194fe9ca844ab84425478d96ffce6849 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 6 Dec 2024 17:45:56 +0100 Subject: [PATCH 17/18] updated docs for new param + more elaborate examples + ran document --- R/PipeOpDecode.R | 84 ++++++++++++++++----------- man/mlr_pipeops_decode.Rd | 86 ++++++++++++++++++---------- man/mlr_pipeops_learner_pi_cvplus.Rd | 1 + man/mlr_pipeops_learner_quantiles.Rd | 1 + 4 files changed, 109 insertions(+), 63 deletions(-) diff --git a/R/PipeOpDecode.R b/R/PipeOpDecode.R index f0b342b20..e1fa12fea 100644 --- a/R/PipeOpDecode.R +++ b/R/PipeOpDecode.R @@ -6,14 +6,12 @@ #' #' @description #' Reverses one-hot or treatment encoding of columns. It collapses multiple `numeric` or `integer` columns into one `factor` -#' column based on a specified grouping pattern of column names. +#' column based on a pre-specified grouping pattern of column names. #' #' May be applied to multiple groups of columns, grouped by matching a common naming pattern. The grouping pattern is #' extracted to form the name of the newly derived `factor` column, and levels are constructed from the previous column -#' names, with parts matching the grouping pattern removed. The level per row of the new factor column is generally +#' names, with parts matching the grouping pattern removed (see examples). The level per row of the new factor column is generally #' determined as the name of the column with the maximum value in the group. -#' For example, columns `x.1` and `x.2` might be collapsed into a new factor column `x` with levels `1` and `2`, while -#' columns `y.1` and `y.2` might be interpreted as a separate group and collapsed into a new column `y`. #' #' @section Construction: #' ``` @@ -34,14 +32,14 @@ #' * `colmaps` :: named `list`\cr #' Named list of named character vectors. Each element is named according to the new column name extracted by #' `group_pattern`. 
Each vector contains the level names for the new factor column that should be created, named by -#' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `reflevel_name` as the +#' the corresponding old column name. If `treatment_encoding` is `TRUE`, then each vector also contains `ref_name` as the #' reference class with an empty string as name. #' * `treatment_encoding` :: `logical(1)`\cr -#' Indicates whether treatment encoding (`TRUE`) or one-hot encoding (`FALSE`) is assumed. +#' Value of `treatment_encoding` hyperparameter. #' * `cutoff` :: `numeric(1)`\cr -#' The cutoff value for identifying the reference level in case of treatment encoding. +#' Value of `treatment_cutoff` hyperparameter, or `0` if that is not given. #' * `ties_method` :: `character(1)`\cr -#' Method for resolving ties when multiple columns have the same value. Options include `"first"`, `"last"`, or `"random"`. +#' Value of `ties_method` hyperparameter. #' #' @section Parameters: #' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: @@ -57,14 +55,13 @@ #' If `TRUE`, treatment encoding is assumed instead of one-hot encoding. Initialized to `FALSE`. #' * `treatment_cutoff` :: `numeric(1)`\cr #' If `treatment_encoding` is `TRUE`, specifies a cutoff value for identifying the reference level. The reference level -#' is set to `reflevel_name` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all -#' columns in that group To change the name of the reference level, use [`PipeOp???`] (Mutate? ColApply?). -#' Default is `0`. +#' is set to `ref_name` in rows where the value is less than or equal to a specified cutoff value (e.g., `0`) in all +#' columns in that group. Default is `0`. +#' * `ref_name` :: `character(1)`\cr +#' If `treatment_encoding` is `TRUE`, specifies the name for reference levels. Default is `"ref"`. 
#' * `ties_method` :: `character(1)`\cr #' Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns #' with the same value is to be picked. Options are `"first"`, `"last"`, or `"random"`. Initialized to `"random"`. -#' * `ref_name` :: `character(1)`\cr -#' Default is `"ref"`. #' #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. @@ -76,39 +73,62 @@ #' @examples #' library("mlr3") #' -#' # Create example task with one-hot encoding +#' # Reverse one-hot encoding #' df = data.frame( -#' target = runif(10), -#' x.1 = rep(c(1, 0), 5), -#' x.2 = rep(c(0, 1), 5), -#' y.1 = rep(c(1, 0), 5), -#' y.2 = rep(c(0, 1), 5), -#' a = runif(10) +#' target = runif(4), +#' x.1 = rep(c(1, 0), 2), +#' x.2 = rep(c(0, 1), 2), +#' y.1 = rep(c(1, 0), 2), +#' y.2 = rep(c(0, 1), 2), +#' a = runif(4) #' ) -#' task = TaskRegr$new(id = "example", backend = df, target = "target") +#' task_one_hot = TaskRegr$new(id = "example", backend = df, target = "target") #' #' pop = po("decode") #' -#' # Training -#' train_out = pop$train(list(task))[[1]] +#' train_out = pop$train(list(task_one_hot))[[1]] #' # x.1 and x.2 are collapsed into x, same for y; a is ignored. #' train_out$data() #' -#' # Create example task with treatment encoding +#' # Reverse treatment encoding from PipeOpEncode #' df = data.frame( -#' target = runif(15), -#' x.1 = rep(c(1, 0, 0), 5), -#' x.2 = rep(c(0, 1, 0), 5) +#' target = runif(6), +#' fct = factor(rep(c("a", "b", "c"), 2)) #' ) #' task = TaskRegr$new(id = "example", backend = df, target = "target") #' -#' pop = po("decode") -#' pop$param_set$set_values(treatment_encoding = TRUE) +#' po_enc = po("encode", method = "treatment") +#' task_encoded = po_enc$train(list(task))[[1]] +#' task_encoded$data() +#' +#' po_dec = po("decode", treatment_encoding = TRUE) +#' task_decoded = po_dec$train(list(task_encoded))[[1]] +#' # fct.b and fct.c are collapsed into fct. 
In all rows where all values +#' # are smaller or equal to 0, the level is set to the reference level. +#' task_decoded$data() +#' +#' # Different group_pattern +#' df = data.frame( +#' target = runif(4), +#' x_1 = rep(c(1, 0), 2), +#' x_2 = rep(c(0, 1), 2), +#' y_1 = rep(c(2, 0), 2), +#' y_2 = rep(c(0, 1), 2) +#' ) +#' task = TaskRegr$new(id = "example", backend = df, target = "target") +#' +#' # Grouped by first underscore +#' pop = po("decode", group_pattern = "^([^_]+)\\_") +#' train_out = pop$train(list(task))[[1]] +#' # x_1 and x_2 are collapsed into x, same for y +#' train_out$data() #' -#' # Training +#' # Empty string to collapse all matches into one factor column. +#' pop$param_set$set_values(group_pattern = "") #' train_out = pop$train(list(task))[[1]] -#' # x.1 and x.2 are collapsed into x; in rows where all values -#' # are smaller or equal to 0, the reference level is set +#' # All columns are combined into a single column. +#' # The level for each row is determined by the column with the largest value in that row. +#' # By default, ties are resolved randomly. #' train_out$data() #' PipeOpDecode = R6Class("PipeOpDecode", diff --git a/man/mlr_pipeops_decode.Rd b/man/mlr_pipeops_decode.Rd index 682bda28e..d3f6301b1 100644 --- a/man/mlr_pipeops_decode.Rd +++ b/man/mlr_pipeops_decode.Rd @@ -3,20 +3,18 @@ \name{mlr_pipeops_decode} \alias{mlr_pipeops_decode} \alias{PipeOpDecode} -\title{Reverse Encoding} +\title{Reverse Factor Encoding} \format{ \code{\link[R6:R6Class]{R6Class}} object inheriting from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{PipeOpTaskPreproc}}/\code{\link{PipeOp}}. } \description{ Reverses one-hot or treatment encoding of columns. It collapses multiple \code{numeric} or \code{integer} columns into one \code{factor} -column based on a specified grouping pattern of column names. +column based on a pre-specified grouping pattern of column names. May be applied to multiple groups of columns, grouped by matching a common naming pattern. 
The grouping pattern is extracted to form the name of the newly derived \code{factor} column, and levels are constructed from the previous column -names, with parts matching the grouping pattern removed. The level per row of the new factor column is generally +names, with parts matching the grouping pattern removed (see examples). The level per row of the new factor column is generally determined as the name of the column with the maximum value in the group. -For example, columns \code{x.1} and \code{x.2} might be collapsed into a new factor column \code{x} with levels \code{1} and \code{2}, while -columns \code{y.1} and \code{y.2} might be interpreted as a separate group and collapsed into a new column \code{y}. } \section{Construction}{ @@ -45,14 +43,14 @@ The \verb{$state} is a named \code{list} with the \verb{$state} elements inherit \item \code{colmaps} :: named \code{list}\cr Named list of named character vectors. Each element is named according to the new column name extracted by \code{group_pattern}. Each vector contains the level names for the new factor column that should be created, named by -the corresponding old column name. If \code{treatment_encoding} is \code{TRUE}, then each vector also contains \code{"ref"} as the +the corresponding old column name. If \code{treatment_encoding} is \code{TRUE}, then each vector also contains \code{ref_name} as the reference class with an empty string as name. \item \code{treatment_encoding} :: \code{logical(1)}\cr -Indicates whether treatment encoding (\code{TRUE}) or one-hot encoding (\code{FALSE}) is assumed. +Value of \code{treatment_encoding} hyperparameter. \item \code{cutoff} :: \code{numeric(1)}\cr -The cutoff value for identifying the reference level in case of treatment encoding. +Value of \code{treatment_cutoff} hyperparameter, or \code{0} if that is not given. \item \code{ties_method} :: \code{character(1)}\cr -Method for resolving ties when multiple columns have the same value. 
Options include \code{"first"}, \code{"last"}, or \code{"random"}. +Value of \code{ties_method} hyperparameter. } } @@ -72,9 +70,10 @@ construct new levels as everything after the first dot. If \code{TRUE}, treatment encoding is assumed instead of one-hot encoding. Initialized to \code{FALSE}. \item \code{treatment_cutoff} :: \code{numeric(1)}\cr If \code{treatment_encoding} is \code{TRUE}, specifies a cutoff value for identifying the reference level. The reference level -is set to \code{"ref"} in rows where the value is less than or equal to a specified cutoff value (e.g., \code{0}) in all -columns in that group To change the name of the reference level, use \code{\link{PipeOp???}} (Mutate? ColApply?). -Initialized to \code{0}. +is set to \code{ref_name} in rows where the value is less than or equal to a specified cutoff value (e.g., \code{0}) in all +columns in that group. Default is \code{0}. +\item \code{ref_name} :: \code{character(1)}\cr +If \code{treatment_encoding} is \code{TRUE}, specifies the name for reference levels. Default is \code{"ref"}. \item \code{ties_method} :: \code{character(1)}\cr Method for resolving ties if multiple columns have the same value. Specifies the value from which of the columns with the same value is to be picked. Options are \code{"first"}, \code{"last"}, or \code{"random"}. Initialized to \code{"random"}. 
@@ -89,39 +88,62 @@ Only methods inherited from \code{\link{PipeOpTaskPreprocSimple}}/\code{\link{Pi \examples{ library("mlr3") -# Create example task with one-hot encoding +# Reverse one-hot encoding df = data.frame( - target = runif(10), - x.1 = rep(c(1, 0), 5), - x.2 = rep(c(0, 1), 5), - y.1 = rep(c(1, 0), 5), - y.2 = rep(c(0, 1), 5), - a = runif(10) + target = runif(4), + x.1 = rep(c(1, 0), 2), + x.2 = rep(c(0, 1), 2), + y.1 = rep(c(1, 0), 2), + y.2 = rep(c(0, 1), 2), + a = runif(4) ) -task = TaskRegr$new(id = "example", backend = df, target = "target") +task_one_hot = TaskRegr$new(id = "example", backend = df, target = "target") pop = po("decode") -# Training -train_out = pop$train(list(task))[[1]] +train_out = pop$train(list(task_one_hot))[[1]] # x.1 and x.2 are collapsed into x, same for y; a is ignored. train_out$data() -# Create example task with treatment encoding +# Reverse treatment encoding from PipeOpEncode df = data.frame( - target = runif(15), - x.1 = rep(c(1, 0, 0), 5), - x.2 = rep(c(0, 1, 0), 5) + target = runif(6), + fct = factor(rep(c("a", "b", "c"), 2)) ) task = TaskRegr$new(id = "example", backend = df, target = "target") -pop = po("decode") -pop$param_set$set_values(treatment_encoding = TRUE) +po_enc = po("encode", method = "treatment") +task_encoded = po_enc$train(list(task))[[1]] +task_encoded$data() + +po_dec = po("decode", treatment_encoding = TRUE) +task_decoded = po_dec$train(list(task_encoded))[[1]] +# fct.b and fct.c are collapsed into fct. In all rows where all values +# are smaller or equal to 0, the level is set to the reference level. 
+task_decoded$data() + +# Different group_pattern +df = data.frame( + target = runif(4), + x_1 = rep(c(1, 0), 2), + x_2 = rep(c(0, 1), 2), + y_1 = rep(c(2, 0), 2), + y_2 = rep(c(0, 1), 2) +) +task = TaskRegr$new(id = "example", backend = df, target = "target") + +# Grouped by first underscore +pop = po("decode", group_pattern = "^([^_]+)\\\\_") +train_out = pop$train(list(task))[[1]] +# x_1 and x_2 are collapsed into x, same for y +train_out$data() -# Training +# Empty string to collapse all matches into one factor column. +pop$param_set$set_values(group_pattern = "") train_out = pop$train(list(task))[[1]] -# x.1 and x.2 are collapsed into x; in rows where all values -# are smaller or equal to 0, the reference level is set +# All columns are combined into a single column. +# The level for each row is determined by the column with the largest value in that row. +# By default, ties are resolved randomly. train_out$data() } @@ -167,6 +189,8 @@ Other PipeOps: \code{\link{mlr_pipeops_imputesample}}, \code{\link{mlr_pipeops_kernelpca}}, \code{\link{mlr_pipeops_learner}}, +\code{\link{mlr_pipeops_learner_pi_cvplus}}, +\code{\link{mlr_pipeops_learner_quantiles}}, \code{\link{mlr_pipeops_missind}}, \code{\link{mlr_pipeops_modelmatrix}}, \code{\link{mlr_pipeops_multiplicityexply}}, diff --git a/man/mlr_pipeops_learner_pi_cvplus.Rd b/man/mlr_pipeops_learner_pi_cvplus.Rd index 69355ab14..9104d8d0d 100644 --- a/man/mlr_pipeops_learner_pi_cvplus.Rd +++ b/man/mlr_pipeops_learner_pi_cvplus.Rd @@ -141,6 +141,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, diff --git a/man/mlr_pipeops_learner_quantiles.Rd b/man/mlr_pipeops_learner_quantiles.Rd index 7ffd250e5..fc87fdc19 100644 --- a/man/mlr_pipeops_learner_quantiles.Rd +++ 
b/man/mlr_pipeops_learner_quantiles.Rd @@ -126,6 +126,7 @@ Other PipeOps: \code{\link{mlr_pipeops_colroles}}, \code{\link{mlr_pipeops_copy}}, \code{\link{mlr_pipeops_datefeatures}}, +\code{\link{mlr_pipeops_decode}}, \code{\link{mlr_pipeops_encode}}, \code{\link{mlr_pipeops_encodeimpact}}, \code{\link{mlr_pipeops_encodelmer}}, From e2a3878ff9dbf7ba5a59562c0b2e62229f068c39 Mon Sep 17 00:00:00 2001 From: kenomersmannPC Date: Fri, 6 Dec 2024 17:56:01 +0100 Subject: [PATCH 18/18] updated NEWS, addendum for dictionary sugar PR --- NEWS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS.md b/NEWS.md index c403ce0dc..c612f2fcc 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,7 @@ * Fix: `PipeOpCollapseFactors` now correctly collapses levels of ordered factors. * Fix: `LearnerClassifAvg` and `LearnerRegrAvg` hyperparameters get the `"required"` tag. * New parameter `use_groups` (default `TRUE`) for `PipeOpSubsampling` to respect grouping (changed default behaviour for grouped data) +* Dictionary sugar functions `po()` / `pos()` / `ppl()` / `ppls()` now make suggestions for entries in both `mlr_pipeops` as well as `mlr_graphs` when an object by the given name could not be found in the respective dictionary. * New PipeOp `PipeOpDecode` / `po("decode")` to reverse one-hot or treatment encoding. # mlr3pipelines 0.7.1