From 46e82d83aa68104c4fd2e12692ea8641bfa9e887 Mon Sep 17 00:00:00 2001 From: kenomersmannLaptop Date: Fri, 20 Dec 2024 17:16:08 +0100 Subject: [PATCH 1/5] init PipeOpEncodePL --- R/PipeOpEncodePL.R | 130 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 R/PipeOpEncodePL.R diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R new file mode 100644 index 000000000..1eefdd0ad --- /dev/null +++ b/R/PipeOpEncodePL.R @@ -0,0 +1,130 @@ +#' @title Factor Encoding +#' +#' @usage NULL +#' @name mlr_pipeops_encode +#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @description +#' Encodes columns of type `factor` and `ordered`. +#' +#' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type. +#' +#' @section Construction: +#' ``` +#' PipeOpEncodePL$new(id = "encodepl", param_vals = list()) +#' ``` +#' * `id` :: `character(1)`\cr +#' Identifier of resulting object, default `"encode"`. +#' * `param_vals` :: named `list`\cr +#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`. +#' +#' @section Input and Output Channels: +#' Input and output channels are inherited from [`PipeOpTaskPreproc`]. +#' +#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns +#' +#' @section State: +#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as: +#' * ` ` :: named `list`\cr +#' +#' @section Parameters: +#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as: +#' * `method` :: `character(1)` \cr +#' Initialized to `""`. One of: +#' +#' @section Methods: +#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. +#' +#' @family PipeOps +#' @template seealso_pipeopslist +#' @include PipeOpTaskPreproc.R +#' @export +#' @examples +#' library("mlr3") +#' +PipeOpEncodePL = R6Class("PipeOpEncodePL", + inherit = PipeOpTaskPreprocSimple, + public = list( + initialize = function(id = "encodepl", param_vals = list()) { + private$.reg_tree = LearnerRegrRpart$new() + + private$.encodepl_param_set = ps( + method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")), + # cannot set init value for quantiles numsplits since it has depends, use %??% instead? then document it as default or not? + quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles")) + ) + private$.encodepl_param_set$values = list(method = "quantiles") + + super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set), + param_vals = param_vals, packages = private$.reg_tree$packages, tags = "encode", feature_types = c("numeric", "integer")) + } + ), + private = list( + + .reg_tree = NULL, + .encodepl_param_set = NULL, + + .get_state_dt = function(dt, levels, target) { + pv = private$.encodepl_param_set$values + numsplits = pv$quantiles_numsplits %??% 2 + + if (pv$method == "quantiles") { + bins = lapply(dt, function(d) + unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d)))) + # check that min / max is correct here (according to paper / implementation) + } else { + learner = private$.reg_tree + cols = colnames(dt) + + bins = list() + for (col in cols) { + t = TaskRegr$new(id = "binning", backend = dt[, ..col], target = task$target_names) + splits = learner$train(t)$model$splits + rules = unname(sort(splits[, which(colnames(splits) == "index")])) + bins[[col]] = c(min(dt[[col]]), rules, max(dt[[col]])) + } + } + + list(bins = bins) + }, + + .transform_dt = function(dt, levels) { + bins = self$state$bins + + cols = colnames(dt) + + for (col in cols) { + dt = cbind(dt, ple(dt[, ..col], bins[[col]])) + # do name checking ... + } + + # Drop old columns + dt[, (cols) := NULL] + dt + } + ) +) + +mlr_pipeops$add("encodepl", PipeOpEncodePL) + +# Piecewise linear encoding +ple = function(column, bins) { + n_bins = length(bins) - 1 + + dt = data.table(matrix(0, nrow(column), n_bins)) + setnames(dt, paste0(colnames(column), ".bin", seq_len(n_bins))) + + # Transform into vector for logical subsetting in data.table + vec = column[[1]] + + for (t in seq_len(n_bins)) { + lower = bins[[t]] + upper = bins[[t + 1]] + + dt[vec >= upper, colnames(dt)[[t]] := 1] + indices = vec < upper & vec >= lower + dt[indices, colnames(dt)[[t]] := (vec[indices] - lower) / (upper - lower)] + } + + dt +} From 68360d7cd5040628cc8e8acf598c4c1798554fc9 Mon Sep 17 00:00:00 2001 From: kenomersmannLaptop Date: Fri, 20 Dec 2024 17:16:26 +0100 Subject: [PATCH 2/5] small docs correction --- R/PipeOpEncode.R | 2 +- R/PipeOpQuantileBin.R | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/R/PipeOpEncode.R b/R/PipeOpEncode.R index 495451a3f..6aff9c744 100644 --- a/R/PipeOpEncode.R +++ b/R/PipeOpEncode.R @@ -28,7 +28,7 @@ #' @section Input and Output Channels: #' Input and output channels are inherited from [`PipeOpTaskPreproc`]. #' -#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` parameters encoded according to the `method` +#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` columns encoded according to the `method` #' parameter. #' #' @section State: diff --git a/R/PipeOpQuantileBin.R b/R/PipeOpQuantileBin.R index ab68c9685..ac869db60 100644 --- a/R/PipeOpQuantileBin.R +++ b/R/PipeOpQuantileBin.R @@ -58,7 +58,7 @@ PipeOpQuantileBin = R6Class("PipeOpQuantileBin", initialize = function(id = "quantilebin", param_vals = list()) { ps = ps( numsplits = p_int(lower = 2, special_vals = list(NULL), tags = "train") - ) + ) ps$values = list(numsplits = 2L) super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer")) } From 930b713fa2b076c2b073ce407d40dcc544741313 Mon Sep 17 00:00:00 2001 From: kenomersmannLaptop Date: Fri, 20 Dec 2024 18:01:58 +0100 Subject: [PATCH 3/5] switch to get_state/transform for easier handling of task --- R/PipeOpEncodePL.R | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R index 1eefdd0ad..8938a4844 100644 --- a/R/PipeOpEncodePL.R +++ b/R/PipeOpEncodePL.R @@ -47,6 +47,7 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", public = list( initialize = function(id = "encodepl", param_vals = list()) { private$.reg_tree = LearnerRegrRpart$new() + # this would only work for regr tasks, how do we handle classif tasks, esp. since we don't now task type in init? private$.encodepl_param_set = ps( method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")), @@ -56,7 +57,7 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", private$.encodepl_param_set$values = list(method = "quantiles") super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set), - param_vals = param_vals, packages = private$.reg_tree$packages, tags = "encode", feature_types = c("numeric", "integer")) + param_vals = param_vals, packages = c("stats", private$.reg_tree$packages), tags = "encode", feature_types = c("numeric", "integer")) } ), private = list( @@ -64,43 +65,50 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", .reg_tree = NULL, .encodepl_param_set = NULL, - .get_state_dt = function(dt, levels, target) { + .get_state = function(task) { + cols = private$.select_cols(task) + # do we need early exit if there are no cols? + pv = private$.encodepl_param_set$values numsplits = pv$quantiles_numsplits %??% 2 if (pv$method == "quantiles") { - bins = lapply(dt, function(d) - unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d)))) - # check that min / max is correct here (according to paper / implementation) + # TODO: check that min / max is correct here (according to paper / implementation) + bins = lapply(task$data(cols = cols), function(d) { + unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d))) + }) } else { learner = private$.reg_tree - cols = colnames(dt) bins = list() for (col in cols) { - t = TaskRegr$new(id = "binning", backend = dt[, ..col], target = task$target_names) + t = task$clone(deep = TRUE)$select(col) splits = learner$train(t)$model$splits - rules = unname(sort(splits[, which(colnames(splits) == "index")])) - bins[[col]] = c(min(dt[[col]]), rules, max(dt[[col]])) + # Get column "index" in model splits + boundaries = unname(sort(splits[, which(colnames(splits) == "index")])) + + d = task$data(cols = col) + bins[[col]] = c(min(d), boundaries, max(d)) } } list(bins = bins) }, - .transform_dt = function(dt, levels) { + .transform = function(task) { bins = self$state$bins + cols = names(bins) + if (!length(cols)) { + return(task) # early exit + } - cols = colnames(dt) - + dt = data.table() for (col in cols) { - dt = cbind(dt, ple(dt[, ..col], bins[[col]])) - # do name checking ... + dt = cbind(dt, ple(task$data(cols = col), bins[[col]])) } - # Drop old columns - dt[, (cols) := NULL] - dt + # TODO: handle name colision + task$cbind(dt) } ) ) From ae345e5712e02f8bfd2c7fff54aad370650e0dfc Mon Sep 17 00:00:00 2001 From: kenomersmannLaptop Date: Fri, 20 Dec 2024 19:56:13 +0100 Subject: [PATCH 4/5] WIP changes from code review --- R/PipeOpEncodePL.R | 73 +++++++++++++++++++++++++++------------------- R/bibentries.R | 29 +++++++++++++----- 2 files changed, 64 insertions(+), 38 deletions(-) diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R index 8938a4844..4557466c3 100644 --- a/R/PipeOpEncodePL.R +++ b/R/PipeOpEncodePL.R @@ -5,14 +5,18 @@ #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' #' @description -#' Encodes columns of type `factor` and `ordered`. +#' Encodes columns of type `numeric` and `integer`. +#' +#' #' #' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type. #' #' @section Construction: #' ``` -#' PipeOpEncodePL$new(id = "encodepl", param_vals = list()) +#' PipeOpEncodePL$new(task_type, id = "encodepl", param_vals = list()) #' ``` +#' * `task_type` :: `character(1)`\cr +#' #' * `id` :: `character(1)`\cr #' Identifier of resulting object, default `"encode"`. #' * `param_vals` :: named `list`\cr @@ -35,6 +39,9 @@ #' @section Methods: #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`]. #' +#' @references +#' `r format_bib("gorishniy_2022")` +#' #' @family PipeOps #' @template seealso_pipeopslist #' @include PipeOpTaskPreproc.R @@ -45,29 +52,38 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", inherit = PipeOpTaskPreprocSimple, public = list( - initialize = function(id = "encodepl", param_vals = list()) { - private$.reg_tree = LearnerRegrRpart$new() - # this would only work for regr tasks, how do we handle classif tasks, esp. since we don't now task type in init? + initialize = function(task_type, id = "encodepl", param_vals = list()) { + # NOTE: Might use different name, change assert, and conditions + assert_choice(task_type, mlr_reflections$task_types$task) + if (task_type == "TaskRegr") { + private$.tree_learner = LearnerRegrRpart$new() + } else if (task_type == "TaskClassif") { + private$.tree_learner = LearnerClassifRpart$new() + } else { + stopf("Task type %s not supported", task_type) + } private$.encodepl_param_set = ps( - method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")), - # cannot set init value for quantiles numsplits since it has depends, use %??% instead? then document it as default or not? + method = p_fct(levels = c("quantiles", "tree"), tags = c("train", "predict", "required")), quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles")) ) private$.encodepl_param_set$values = list(method = "quantiles") - super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set), - param_vals = param_vals, packages = c("stats", private$.reg_tree$packages), tags = "encode", feature_types = c("numeric", "integer")) + super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.tree_learner$param_set), + param_vals = param_vals, packages = c("stats", private$.tree_learner$packages), + task_type = task_type, tags = "encode", feature_types = c("numeric", "integer")) } ), private = list( - .reg_tree = NULL, + .tree_learner = NULL, .encodepl_param_set = NULL, .get_state = function(task) { cols = private$.select_cols(task) - # do we need early exit if there are no cols? + if (!length(cols)) { + return(task) # early exit + } pv = private$.encodepl_param_set$values numsplits = pv$quantiles_numsplits %??% 2 @@ -78,14 +94,14 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d))) }) } else { - learner = private$.reg_tree + learner = private$.tree_learner bins = list() for (col in cols) { t = task$clone(deep = TRUE)$select(col) splits = learner$train(t)$model$splits # Get column "index" in model splits - boundaries = unname(sort(splits[, which(colnames(splits) == "index")])) + boundaries = unname(sort(splits[, "index"])) d = task$data(cols = col) bins[[col]] = c(min(d), boundaries, max(d)) @@ -102,36 +118,33 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL", return(task) # early exit } - dt = data.table() - for (col in cols) { - dt = cbind(dt, ple(task$data(cols = col), bins[[col]])) - } + dt = task$data(cols = cols) + res = as.data.table(imap(dt, function(d, col) encode_piecewise_linear(d, col, bins[[col]]))) - # TODO: handle name colision - task$cbind(dt) + task$select(setdiff(task$feature_names, cols))$cbind(res) } ) ) -mlr_pipeops$add("encodepl", PipeOpEncodePL) +mlr_pipeops$add("encodepl", PipeOpEncodePL, list(task_type = "TaskRegr")) -# Piecewise linear encoding -ple = function(column, bins) { +# Helper function to implement piecewise linear encoding. +# * column: numeric vector +# * colname: name of `column` +# * bins as numeric vector of boundaries +encode_piecewise_linear = function(column, colname, bins) { n_bins = length(bins) - 1 - dt = data.table(matrix(0, nrow(column), n_bins)) - setnames(dt, paste0(colnames(column), ".bin", seq_len(n_bins))) - - # Transform into vector for logical subsetting in data.table - vec = column[[1]] + dt = data.table(matrix(0, length(column), n_bins)) + setnames(dt, paste0(colname, ".bin", seq_len(n_bins))) for (t in seq_len(n_bins)) { lower = bins[[t]] upper = bins[[t + 1]] - dt[vec >= upper, colnames(dt)[[t]] := 1] - indices = vec < upper & vec >= lower - dt[indices, colnames(dt)[[t]] := (vec[indices] - lower) / (upper - lower)] + dt[column >= upper, colnames(dt)[[t]] := 1] + indices = column < upper & column >= lower + dt[indices, colnames(dt)[[t]] := (column[indices] - lower) / (upper - lower)] } dt diff --git a/R/bibentries.R b/R/bibentries.R index e6079539c..642bfae4b 100644 --- a/R/bibentries.R +++ b/R/bibentries.R @@ -95,8 +95,8 @@ bibentries = c( han_2005 = bibentry("InProceedings", doi = "10.1007/11538059_91", - author = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan", - editor = "Huang, De-Shuang and Zhang, Xiao-Ping and Huang, Guang-Bin", + author = "Hui Han and Wen-Yuan Wang and Bing-Huan Mao", + editor = "De-Shuang Huang and Xiao-Ping Zhang and Guang-Bin Huang", title = "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning", booktitle = "Advances in Intelligent Computing", year = "2005", @@ -107,11 +107,24 @@ bibentries = c( ), freeman_1979 = bibentry("InCollection", - author = "Freeman III, A Myrick", - title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics", - booktitle = "The Economics of Neighborhood", - year = "1979", - publisher = "Elsevier", - pages = "191--217" + doi = "10.1016/B978-0-12-636250-3.50015-5", + author = "A Myrick Freeman III", + title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics", + booktitle = "The Economics of Neighborhood", + year = "1979", + publisher = "Elsevier", + pages = "191--217" + ), + + + gorishniy_2022 = bibentry("InProceedings", + title = "On Embeddings for Numerical Features in Tabular Deep Learning", + volume = "35", + url = "https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html", + booktitle = "Advances in Neural Information Processing Systems", + author = "Yury Gorishniy and Ivan Rubachev and Artem Babenko", + year = "2022", + pages = "24991--25004" ) + ) From 8a5e162206b60bc4d9e246ae0b382c4cdd0ea39f Mon Sep 17 00:00:00 2001 From: kenomersmannLaptop Date: Fri, 20 Dec 2024 19:56:22 +0100 Subject: [PATCH 5/5] init tests --- tests/testthat/test_pipeop_encodepl.R | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 tests/testthat/test_pipeop_encodepl.R diff --git a/tests/testthat/test_pipeop_encodepl.R b/tests/testthat/test_pipeop_encodepl.R new file mode 100644 index 000000000..ceb52e064 --- /dev/null +++ b/tests/testthat/test_pipeop_encodepl.R @@ -0,0 +1,16 @@ +context("PipeOpEncodePL") + +test_that("PipeOpEncodePL - basic properties", { + task = mlr_tasks$get("mtcars") + expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskRegr"), task = task) + + task = mlr_tasks$get("iris") + expect_datapreproc_pipeop_class(PipeOpEncodePL, task = task) + expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskClassif"), task = task) +}) + +# Tests: +# - different methods +# - with params (not all for regtree, hopefully) +# - test on tasks with simple data that behaviour is as expected (compare dts) +# - TODO: decide how to handle NAs in feature columns and test that