From 46e82d83aa68104c4fd2e12692ea8641bfa9e887 Mon Sep 17 00:00:00 2001
From: kenomersmannLaptop <advieser@gmail.com>
Date: Fri, 20 Dec 2024 17:16:08 +0100
Subject: [PATCH 1/5] init PipeOpEncodePL

---
 R/PipeOpEncodePL.R | 130 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 R/PipeOpEncodePL.R

diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R
new file mode 100644
index 000000000..1eefdd0ad
--- /dev/null
+++ b/R/PipeOpEncodePL.R
@@ -0,0 +1,130 @@
+#' @title Piecewise Linear Encoding
+#'
+#' @usage NULL
+#' @name mlr_pipeops_encodepl
+#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @description
+#' Encodes columns of type `factor` and `ordered`.
+#'
+#' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type.
+#'
+#' @section Construction:
+#' ```
+#' PipeOpEncodePL$new(id = "encodepl", param_vals = list())
+#' ```
+#' * `id` :: `character(1)`\cr
+#'   Identifier of resulting object, default `"encode"`.
+#' * `param_vals` :: named `list`\cr
+#'   List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
+#'
+#' @section Input and Output Channels:
+#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
+#'
+#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns replaced by their piecewise linear encoding.
+#'
+#' @section State:
+#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
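+#' * `bins` :: named `list`\cr
+#'   Bin boundaries computed during training, one numeric vector per affected column, named by column.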
+#' @section Parameters:
+#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
+#' * `method`  :: `character(1)` \cr
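+#'   Initialized to `"quantiles"`. Selects how bin boundaries are computed: from quantiles of each column, or from the split points of a decision tree fitted on it.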
+#'
+#' @section Methods:
+#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
+#'
+#' @family PipeOps
+#' @template seealso_pipeopslist
+#' @include PipeOpTaskPreproc.R
+#' @export
+#' @examples
+#' library("mlr3")
+#'
+PipeOpEncodePL = R6Class("PipeOpEncodePL",
+  inherit = PipeOpTaskPreprocSimple,
+  public = list(
+    initialize = function(id = "encodepl", param_vals = list()) {
+      private$.reg_tree = LearnerRegrRpart$new()
+
+      private$.encodepl_param_set = ps(
+        method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")),
+        # cannot set init value for quantiles numsplits since it has depends, use %??% instead? then document it as default or not?
+        quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles"))
+      )
+      private$.encodepl_param_set$values = list(method = "quantiles")
+
+      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set),
+                       param_vals = param_vals, packages = private$.reg_tree$packages, tags = "encode", feature_types = c("numeric", "integer"))
+    }
+  ),
+  private = list(
+
+    .reg_tree = NULL,
+    .encodepl_param_set = NULL,
+
+    .get_state_dt = function(dt, levels, target) {
+      pv = private$.encodepl_param_set$values
+      numsplits = pv$quantiles_numsplits %??% 2
+
+      if (pv$method == "quantiles") {
+        bins = lapply(dt, function(d)
+          unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d))))
+          # check that min / max is correct here (according to paper / implementation)
+      } else {
+        learner = private$.reg_tree
+        cols = colnames(dt)
+
+        bins = list()
+        for (col in cols) {
+          t = TaskRegr$new(id = "binning", backend = dt[, ..col], target = task$target_names)
+          splits = learner$train(t)$model$splits
+          rules = unname(sort(splits[, which(colnames(splits) == "index")]))
+          bins[[col]] = c(min(dt[[col]]), rules, max(dt[[col]]))
+        }
+      }
+
+      list(bins = bins)
+    },
+
+    .transform_dt = function(dt, levels) {
+      bins = self$state$bins
+
+      cols = colnames(dt)
+
+      for (col in cols) {
+        dt = cbind(dt, ple(dt[, ..col], bins[[col]]))
+        # do name checking ...
+      }
+
+      # Drop old columns
+      dt[, (cols) := NULL]
+      dt
+    }
+  )
+)
+
+mlr_pipeops$add("encodepl", PipeOpEncodePL)
+
+# Piecewise linear encoding
+ple = function(column, bins) {
+  n_bins = length(bins) - 1
+
+  dt = data.table(matrix(0, nrow(column), n_bins))
+  setnames(dt, paste0(colnames(column), ".bin", seq_len(n_bins)))
+
+  # Transform into vector for logical subsetting in data.table
+  vec = column[[1]]
+
+  for (t in seq_len(n_bins)) {
+    lower = bins[[t]]
+    upper = bins[[t + 1]]
+
+    dt[vec >= upper, colnames(dt)[[t]] := 1]
+    indices = vec < upper & vec >= lower
+    dt[indices, colnames(dt)[[t]] := (vec[indices] - lower) / (upper - lower)]
+  }
+
+  dt
+}

From 68360d7cd5040628cc8e8acf598c4c1798554fc9 Mon Sep 17 00:00:00 2001
From: kenomersmannLaptop <advieser@gmail.com>
Date: Fri, 20 Dec 2024 17:16:26 +0100
Subject: [PATCH 2/5] small docs correction

---
 R/PipeOpEncode.R      | 2 +-
 R/PipeOpQuantileBin.R | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/R/PipeOpEncode.R b/R/PipeOpEncode.R
index 495451a3f..6aff9c744 100644
--- a/R/PipeOpEncode.R
+++ b/R/PipeOpEncode.R
@@ -28,7 +28,7 @@
 #' @section Input and Output Channels:
 #' Input and output channels are inherited from [`PipeOpTaskPreproc`].
 #'
-#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` parameters encoded according to the `method`
+#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` columns encoded according to the `method`
 #' parameter.
 #'
 #' @section State:
diff --git a/R/PipeOpQuantileBin.R b/R/PipeOpQuantileBin.R
index ab68c9685..ac869db60 100644
--- a/R/PipeOpQuantileBin.R
+++ b/R/PipeOpQuantileBin.R
@@ -58,7 +58,7 @@ PipeOpQuantileBin = R6Class("PipeOpQuantileBin",
     initialize = function(id = "quantilebin", param_vals = list()) {
       ps = ps(
         numsplits = p_int(lower = 2, special_vals = list(NULL), tags = "train")
-        )
+      )
       ps$values = list(numsplits = 2L)
       super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer"))
     }

From 930b713fa2b076c2b073ce407d40dcc544741313 Mon Sep 17 00:00:00 2001
From: kenomersmannLaptop <advieser@gmail.com>
Date: Fri, 20 Dec 2024 18:01:58 +0100
Subject: [PATCH 3/5] switch to get_state/transform for easier handling of task

---
 R/PipeOpEncodePL.R | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R
index 1eefdd0ad..8938a4844 100644
--- a/R/PipeOpEncodePL.R
+++ b/R/PipeOpEncodePL.R
@@ -47,6 +47,7 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
   public = list(
     initialize = function(id = "encodepl", param_vals = list()) {
       private$.reg_tree = LearnerRegrRpart$new()
+      # this would only work for regr tasks, how do we handle classif tasks, esp. since we don't now task type in init?
 
       private$.encodepl_param_set = ps(
         method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")),
@@ -56,7 +57,7 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
       private$.encodepl_param_set$values = list(method = "quantiles")
 
       super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set),
-                       param_vals = param_vals, packages = private$.reg_tree$packages, tags = "encode", feature_types = c("numeric", "integer"))
+                       param_vals = param_vals, packages = c("stats", private$.reg_tree$packages), tags = "encode", feature_types = c("numeric", "integer"))
     }
   ),
   private = list(
@@ -64,43 +65,50 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
     .reg_tree = NULL,
     .encodepl_param_set = NULL,
 
-    .get_state_dt = function(dt, levels, target) {
+    .get_state = function(task) {
+      cols = private$.select_cols(task)
+      # do we need early exit if there are no cols?
+
       pv = private$.encodepl_param_set$values
       numsplits = pv$quantiles_numsplits %??% 2
 
       if (pv$method == "quantiles") {
-        bins = lapply(dt, function(d)
-          unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d))))
-          # check that min / max is correct here (according to paper / implementation)
+        # TODO: check that min / max is correct here (according to paper / implementation)
+        bins = lapply(task$data(cols = cols), function(d) {
+          unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d)))
+        })
       } else {
         learner = private$.reg_tree
-        cols = colnames(dt)
 
         bins = list()
         for (col in cols) {
-          t = TaskRegr$new(id = "binning", backend = dt[, ..col], target = task$target_names)
+          t = task$clone(deep = TRUE)$select(col)
           splits = learner$train(t)$model$splits
-          rules = unname(sort(splits[, which(colnames(splits) == "index")]))
-          bins[[col]] = c(min(dt[[col]]), rules, max(dt[[col]]))
+          # Get column "index" in model splits
+          boundaries = unname(sort(splits[, which(colnames(splits) == "index")]))
+
+          d = task$data(cols = col)
+          bins[[col]] = c(min(d), boundaries, max(d))
         }
       }
 
       list(bins = bins)
     },
 
-    .transform_dt = function(dt, levels) {
+    .transform = function(task) {
       bins = self$state$bins
+      cols = names(bins)
+      if (!length(cols)) {
+        return(task)  # early exit
+      }
 
-      cols = colnames(dt)
-
+      dt = data.table()
       for (col in cols) {
-        dt = cbind(dt, ple(dt[, ..col], bins[[col]]))
-        # do name checking ...
+        dt = cbind(dt, ple(task$data(cols = col), bins[[col]]))
       }
 
-      # Drop old columns
-      dt[, (cols) := NULL]
-      dt
+      # TODO: handle name colision
+      task$cbind(dt)
     }
   )
 )

From ae345e5712e02f8bfd2c7fff54aad370650e0dfc Mon Sep 17 00:00:00 2001
From: kenomersmannLaptop <advieser@gmail.com>
Date: Fri, 20 Dec 2024 19:56:13 +0100
Subject: [PATCH 4/5] WIP changes from code review

---
 R/PipeOpEncodePL.R | 73 +++++++++++++++++++++++++++-------------------
 R/bibentries.R     | 29 +++++++++++++-----
 2 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/R/PipeOpEncodePL.R b/R/PipeOpEncodePL.R
index 8938a4844..4557466c3 100644
--- a/R/PipeOpEncodePL.R
+++ b/R/PipeOpEncodePL.R
@@ -5,14 +5,18 @@
 #' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
 #'
 #' @description
-#' Encodes columns of type `factor` and `ordered`.
+#' Encodes columns of type `numeric` and `integer`.
+#'
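+#' Each column is split into bins and replaced by one new column per bin, holding 0 for values below the bin, 1 for values above it, and the relative position for values inside it.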
 #'
 #' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type.
 #'
 #' @section Construction:
 #' ```
-#' PipeOpEncodePL$new(id = "encodepl", param_vals = list())
+#' PipeOpEncodePL$new(task_type, id = "encodepl", param_vals = list())
 #' ```
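+#' * `task_type` :: `character(1)`\cr
+#'   Class name of the tasks to operate on; must be `"TaskRegr"` or `"TaskClassif"`, which selects the regression or classification tree learner.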
 #' * `id` :: `character(1)`\cr
 #'   Identifier of resulting object, default `"encode"`.
 #' * `param_vals` :: named `list`\cr
@@ -35,6 +39,9 @@
 #' @section Methods:
 #' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
 #'
+#' @references
+#' `r format_bib("gorishniy_2022")`
+#'
 #' @family PipeOps
 #' @template seealso_pipeopslist
 #' @include PipeOpTaskPreproc.R
@@ -45,29 +52,38 @@
 PipeOpEncodePL = R6Class("PipeOpEncodePL",
   inherit = PipeOpTaskPreprocSimple,
   public = list(
-    initialize = function(id = "encodepl", param_vals = list()) {
-      private$.reg_tree = LearnerRegrRpart$new()
-      # this would only work for regr tasks, how do we handle classif tasks, esp. since we don't now task type in init?
+    initialize = function(task_type, id = "encodepl", param_vals = list()) {
+      # NOTE: Might use different name, change assert, and conditions
+      assert_choice(task_type, mlr_reflections$task_types$task)
+      if (task_type == "TaskRegr") {
+        private$.tree_learner = LearnerRegrRpart$new()
+      } else if (task_type == "TaskClassif") {
+        private$.tree_learner = LearnerClassifRpart$new()
+      } else {
+        stopf("Task type %s not supported", task_type)
+      }
 
       private$.encodepl_param_set = ps(
-        method = p_fct(levels = c("quantiles", "regtree"), tags = c("train", "predict", "required")),
-        # cannot set init value for quantiles numsplits since it has depends, use %??% instead? then document it as default or not?
+        method = p_fct(levels = c("quantiles", "tree"), tags = c("train", "predict", "required")),
         quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles"))
       )
       private$.encodepl_param_set$values = list(method = "quantiles")
 
-      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.reg_tree$param_set),
-                       param_vals = param_vals, packages = c("stats", private$.reg_tree$packages), tags = "encode", feature_types = c("numeric", "integer"))
+      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.tree_learner$param_set),
+                       param_vals = param_vals, packages = c("stats", private$.tree_learner$packages),
+                       task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
     }
   ),
   private = list(
 
-    .reg_tree = NULL,
+    .tree_learner = NULL,
     .encodepl_param_set = NULL,
 
     .get_state = function(task) {
       cols = private$.select_cols(task)
-      # do we need early exit if there are no cols?
+      if (!length(cols)) {
+        return(list(bins = list()))  # early exit: state must be a list; empty bins make .transform() return the task as-is
+      }
 
       pv = private$.encodepl_param_set$values
       numsplits = pv$quantiles_numsplits %??% 2
@@ -78,14 +94,14 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
           unique(c(min(d), stats::quantile(d, seq(1, numsplits - 1) / numsplits, na.rm = TRUE), max(d)))
         })
       } else {
-        learner = private$.reg_tree
+        learner = private$.tree_learner
 
         bins = list()
         for (col in cols) {
           t = task$clone(deep = TRUE)$select(col)
           splits = learner$train(t)$model$splits
           # Get column "index" in model splits
-          boundaries = unname(sort(splits[, which(colnames(splits) == "index")]))
+          boundaries = unname(sort(splits[, "index"]))
 
           d = task$data(cols = col)
           bins[[col]] = c(min(d), boundaries, max(d))
@@ -102,36 +118,33 @@ PipeOpEncodePL = R6Class("PipeOpEncodePL",
         return(task)  # early exit
       }
 
-      dt = data.table()
-      for (col in cols) {
-        dt = cbind(dt, ple(task$data(cols = col), bins[[col]]))
-      }
+      dt = task$data(cols = cols)
+      res = as.data.table(imap(dt, function(d, col) encode_piecewise_linear(d, col, bins[[col]])))
 
-      # TODO: handle name colision
-      task$cbind(dt)
+      task$select(setdiff(task$feature_names, cols))$cbind(res)
     }
   )
 )
 
-mlr_pipeops$add("encodepl", PipeOpEncodePL)
+mlr_pipeops$add("encodepl", PipeOpEncodePL, list(task_type = "TaskRegr"))
 
-# Piecewise linear encoding
-ple = function(column, bins) {
+# Helper function to implement piecewise linear encoding.
+# * column: numeric vector
+# * colname: name of `column`
+# * bins as numeric vector of boundaries
+encode_piecewise_linear = function(column, colname, bins) {
   n_bins = length(bins) - 1
 
-  dt = data.table(matrix(0, nrow(column), n_bins))
-  setnames(dt, paste0(colnames(column), ".bin", seq_len(n_bins)))
-
-  # Transform into vector for logical subsetting in data.table
-  vec = column[[1]]
+  dt = data.table(matrix(0, length(column), n_bins))
+  setnames(dt, paste0(colname, ".bin", seq_len(n_bins)))
 
   for (t in seq_len(n_bins)) {
     lower = bins[[t]]
     upper = bins[[t + 1]]
 
-    dt[vec >= upper, colnames(dt)[[t]] := 1]
-    indices = vec < upper & vec >= lower
-    dt[indices, colnames(dt)[[t]] := (vec[indices] - lower) / (upper - lower)]
+    dt[column >= upper, colnames(dt)[[t]] := 1]
+    indices = column < upper & column >= lower
+    dt[indices, colnames(dt)[[t]] := (column[indices] - lower) / (upper - lower)]
   }
 
   dt
diff --git a/R/bibentries.R b/R/bibentries.R
index e6079539c..642bfae4b 100644
--- a/R/bibentries.R
+++ b/R/bibentries.R
@@ -95,8 +95,8 @@ bibentries = c(
 
   han_2005    = bibentry("InProceedings",
     doi       = "10.1007/11538059_91",
-    author    = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
-    editor    = "Huang, De-Shuang and Zhang, Xiao-Ping and Huang, Guang-Bin",
+    author    = "Hui Han and Wen-Yuan Wang and Bing-Huan Mao",
+    editor    = "De-Shuang Huang and Xiao-Ping Zhang and Guang-Bin Huang",
     title     = "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning",
     booktitle = "Advances in Intelligent Computing",
     year      = "2005",
@@ -107,11 +107,24 @@ bibentries = c(
   ),
 
   freeman_1979 = bibentry("InCollection",
-    author    = "Freeman III, A Myrick",
-    title     = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
-    booktitle = "The Economics of Neighborhood",
-    year      = "1979",
-    publisher = "Elsevier",
-    pages     = "191--217"
+    doi        = "10.1016/B978-0-12-636250-3.50015-5",
+    author     = "A Myrick Freeman III",
+    title      = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
+    booktitle  = "The Economics of Neighborhood",
+    year       = "1979",
+    publisher  = "Elsevier",
+    pages      = "191--217"
+  ),
+
+
+  gorishniy_2022 = bibentry("InProceedings",
+    title        = "On Embeddings for Numerical Features in Tabular Deep Learning",
+    volume       = "35",
+    url          = "https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html",
+    booktitle    = "Advances in Neural Information Processing Systems",
+    author       = "Yury Gorishniy and Ivan Rubachev and Artem Babenko",
+    year         = "2022",
+    pages        = "24991--25004"
   )
+
 )

From 8a5e162206b60bc4d9e246ae0b382c4cdd0ea39f Mon Sep 17 00:00:00 2001
From: kenomersmannLaptop <advieser@gmail.com>
Date: Fri, 20 Dec 2024 19:56:22 +0100
Subject: [PATCH 5/5] init tests

---
 tests/testthat/test_pipeop_encodepl.R | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)
 create mode 100644 tests/testthat/test_pipeop_encodepl.R

diff --git a/tests/testthat/test_pipeop_encodepl.R b/tests/testthat/test_pipeop_encodepl.R
new file mode 100644
index 000000000..ceb52e064
--- /dev/null
+++ b/tests/testthat/test_pipeop_encodepl.R
@@ -0,0 +1,16 @@
+context("PipeOpEncodePL")
+
+test_that("PipeOpEncodePL - basic properties", {
+  # task_type has no default in the constructor, so every check must supply it through constargs
+  task = mlr_tasks$get("mtcars")
+  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskRegr"), task = task)
+
+  task = mlr_tasks$get("iris")
+  expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskClassif"), task = task)
+})
+
+# Tests:
+# - different methods
+#    - with params (not all for tree, hopefully)
+# - test on tasks with simple data that behaviour is as expected (compare dts)
+# - TODO: decide how to handle NAs in feature columns and test that