mlr-org · advieser · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024 · Dec 20, 2024
diff --git a/R/PipeOpEncode.R b/R/PipeOpEncode.R
@@ -28,7 +28,7 @@
 #' @section Input and Output Channels:
 #' Input and output channels are inherited from [`PipeOpTaskPreproc`].
 #'
-#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` parameters encoded according to the `method`
+#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` columns encoded according to the `method`
 #' parameter.
 #'
 #' @section State:

diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R
@@ -0,0 +1,151 @@
+#' @title Piecewise Linear Encoding
+#'
+#' @usage NULL
+#' @name mlr_pipeops_encodepl
+#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @description
+#' Encodes columns of type `numeric` and `integer` using piecewise linear encoding: each column is replaced by one column per bin.
+#'
+#'
+#'
+#' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type.
+#'
+#' @section Construction:
+#' ```
+#' PipeOpEncodePL$new(task_type, id = "encodepl", param_vals = list())
+#' ```
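+#' * `task_type` :: `character(1)`\cr
+#'   Class name of the [`Task`][mlr3::Task] type to operate on. Only `"TaskRegr"` and `"TaskClassif"` are supported.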
+#' * `id` :: `character(1)`\cr
+#'   Identifier of resulting object, default `"encodepl"`.
+#' * `param_vals` :: named `list`\cr
+#'   List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
+#'
+#' @section Input and Output Channels:
+#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
+#'
+#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns replaced by their piecewise linear encoding.
+#'
+#' @section State:
+#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
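+#' * `bins` :: named `list`\cr
+#'   Named by column; each element is the numeric vector of bin boundaries computed for that column during training.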
+#' @section Parameters:
+#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
+#' * `method`  :: `character(1)` \cr
+#'   Initialized to `"quantiles"`. One of `"quantiles"` or `"tree"`.
+#'
+#' @section Methods:
+#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @references
+#' `r format_bib("gorishniy_2022")`
+#'
+#' @family PipeOps
+#' @template seealso_pipeopslist
+#' @include PipeOpTaskPreproc.R
+#' @export
+#' @examples
+#' library("mlr3")
+#'
+PipeOpEncodePL = R6Class("PipeOpEncodePL",
+  inherit = PipeOpTaskPreprocSimple,
+  public = list(
+    initialize = function(task_type, id = "encodepl", param_vals = list()) {
+      # NOTE: Might use different name, change assert, and conditions
+      assert_choice(task_type, mlr_reflections$task_types$task)
+      if (task_type == "TaskRegr") {
+        private$.tree_learner = LearnerRegrRpart$new()
+      } else if (task_type == "TaskClassif") {
+        private$.tree_learner = LearnerClassifRpart$new()
+      } else {
+        stopf("Task type %s not supported", task_type)
+      }
+      # Own hyperparameters; they are combined with those of the tree learner below.
+      private$.encodepl_param_set = ps(
+        method = p_fct(levels = c("quantiles", "tree"), tags = c("train", "predict", "required")),
+        quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles"))
+      )
+      private$.encodepl_param_set$values = list(method = "quantiles")
+
+      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.tree_learner$param_set),
+                       param_vals = param_vals, packages = c("stats", private$.tree_learner$packages),
+                       task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
+    }
+  ),
+  private = list(
+    # rpart learner matching `task_type`; trained once per column when method is "tree".
+    .tree_learner = NULL,
+    .encodepl_param_set = NULL,
+    # Computes the bin boundaries of every selected column; returns `list(bins = <named list>)`.
+    .get_state = function(task) {
+      cols = private$.select_cols(task)
+      if (!length(cols)) {
+        return(list(bins = list()))  # early exit; the state has to be a list, not the task
+      }
+
+      pv = private$.encodepl_param_set$values
+      numsplits = pv$quantiles_numsplits %??% 2
+
+      if (pv$method == "quantiles") {
+        # TODO: check that min / max is correct here (according to paper / implementation)
+        bins = lapply(task$data(cols = cols), function(d) {
+          unique(c(min(d, na.rm = TRUE), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d, na.rm = TRUE)))
+        })
+      } else {
+        learner = private$.tree_learner
+        # Fit a tree on each column alone and use its split points as inner boundaries.
+        bins = list()
+        for (col in cols) {
+          col_task = task$clone(deep = TRUE)$select(col)
+          splits = learner$train(col_task)$model$splits
+          # `splits` is NULL when the tree did not split at all; only min and max remain then
+          boundaries = if (is.null(splits)) numeric(0) else unname(sort(splits[, "index"]))
+
+          d = task$data(cols = col)[[col]]
+          bins[[col]] = unique(c(min(d, na.rm = TRUE), boundaries, max(d, na.rm = TRUE)))
+        }
+      }
+
+      list(bins = bins)
+    },
+    # Replaces every column that has bins in the state by its piecewise linear encoding.
+    .transform = function(task) {
+      bins = self$state$bins
+      cols = names(bins)
+      if (!length(cols)) {
+        return(task)  # early exit
+      }
+
+      dt = task$data(cols = cols)
+      res = as.data.table(imap(dt, function(d, col) encode_piecewise_linear(d, col, bins[[col]])))
+
+      task$select(setdiff(task$feature_names, cols))$cbind(res)
+    }
+  )
+)
+
+mlr_pipeops$add("encodepl", PipeOpEncodePL, list(task_type = "TaskRegr"))
+
+# Helper function to implement piecewise linear encoding; returns a data.table with one column per bin.
+# * column: numeric vector; `NA` entries are returned as `NA` in every bin column
+# * colname: name of `column`, used as prefix of the generated column names (`<colname>.bin<t>`)
+# * bins as numeric vector of boundaries (sorted, one more element than there are bins)
+encode_piecewise_linear = function(column, colname, bins) {
+  n_bins = length(bins) - 1
+  missing = is.na(column)
+
+  encoded = lapply(seq_len(n_bins), function(t) {
+    lower = bins[[t]]
+    upper = bins[[t + 1]]
+    # 1 at or above the upper boundary, 0 everywhere else for now
+    value = as.numeric(!missing & column >= upper)
+    # linear interpolation inside the bin; which() drops NA comparisons so the indices stay aligned
+    inside = which(column >= lower & column < upper)
+    value[inside] = (column[inside] - lower) / (upper - lower)
+    value[missing] = NA_real_
+    value
+  })
+  names(encoded) = paste0(colname, ".bin", seq_len(n_bins))
+  as.data.table(encoded)
+}
diff --git a/R/PipeOpQuantileBin.R b/R/PipeOpQuantileBin.R
@@ -58,7 +58,7 @@ PipeOpQuantileBin = R6Class("PipeOpQuantileBin",
     initialize = function(id = "quantilebin", param_vals = list()) {
       ps = ps(
         numsplits = p_int(lower = 2, special_vals = list(NULL), tags = "train")
-        )
+      )
       ps$values = list(numsplits = 2L)
       super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer"))
     }

diff --git a/R/bibentries.R b/R/bibentries.R
@@ -95,8 +95,8 @@ bibentries = c(
 
   han_2005    = bibentry("InProceedings",
     doi       = "10.1007/11538059_91",
-    author    = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
-    editor    = "Huang, De-Shuang and Zhang, Xiao-Ping and Huang, Guang-Bin",
+    author    = "Hui Han and Wen-Yuan Wang and Bing-Huan Mao",
+    editor    = "De-Shuang Huang and Xiao-Ping Zhang and Guang-Bin Huang",
     title     = "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning",
     booktitle = "Advances in Intelligent Computing",
     year      = "2005",
@@ -107,11 +107,24 @@ bibentries = c(
   ),
 
   freeman_1979 = bibentry("InCollection",
-    author    = "Freeman III, A Myrick",
-    title     = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
-    booktitle = "The Economics of Neighborhood",
-    year      = "1979",
-    publisher = "Elsevier",
-    pages     = "191--217"
+    doi        = "10.1016/B978-0-12-636250-3.50015-5",
+    author     = "A Myrick Freeman III",
+    title      = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
+    booktitle  = "The Economics of Neighborhood",
+    year       = "1979",
+    publisher  = "Elsevier",
+    pages      = "191--217"
+  ),
+
+
+  gorishniy_2022 = bibentry("InProceedings",
+    title        = "On Embeddings for Numerical Features in Tabular Deep Learning",
+    volume       = "35",
+    url          = "https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html",
+    booktitle    = "Advances in Neural Information Processing Systems",
+    author       = "Yury Gorishniy and Ivan Rubachev and Artem Babenko",
+    year         = "2022",
+    pages        = "24991--25004"
   )
+
 )
diff --git a/tests/testthat/test_pipeop_encodepl.R b/tests/testthat/test_pipeop_encodepl.R
@@ -0,0 +1,16 @@
+context("PipeOpEncodePL")
+
+test_that("PipeOpEncodePL - basic properties", {
+  task = mlr_tasks$get("mtcars")
+  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskRegr"), task = task)
+
+  task = mlr_tasks$get("iris")
+  # `task_type` has no default in `initialize()`, so it always has to be passed via `constargs`
+  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskClassif"), task = task)
+})
+
+# Tests:
+# - different methods
+#    - with params (not all for regtree, hopefully)
+# - test on tasks with simple data that behaviour is as expected (compare dts)
+# - TODO: decide how to handle NAs in feature columns and test that