Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New PipeOpEncodePL for Piecewise Linear Encoding #861

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion R/PipeOpEncode.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` parameters encoded according to the `method`
#' The output is the input [`Task`][mlr3::Task] with all affected `factor` and `ordered` columns encoded according to the `method`
#' parameter.
#'
#' @section State:
Expand Down
151 changes: 151 additions & 0 deletions R/PipeOpEncodePL.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#' @title Piecewise Linear Encoding
#'
#' @usage NULL
#' @name mlr_pipeops_encodepl
#' @format [`R6Class`][R6::R6Class] object inheriting from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @description
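#' Encodes columns of type `numeric` and `integer` using piecewise linear encoding.
#'
#' Each affected column is split into bins and replaced by one new column per bin, named `<column>.bin<k>`.
#' For a given bin, a value below the bin's lower boundary is encoded as 0, a value at or above its upper boundary
#' as 1, and a value inside the bin as the fraction `(value - lower) / (upper - lower)`.
#' Bin boundaries are derived during training according to the `method` parameter.
#'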
#'
#' Use the [`PipeOpTaskPreproc`] `$affect_columns` functionality to only encode a subset of columns, or only encode columns of a certain type.
#'
#' @section Construction:
#' ```
#' PipeOpEncodePL$new(task_type, id = "encodepl", param_vals = list())
#' ```
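#' * `task_type` :: `character(1)`\cr
#' Task class name, either `"TaskRegr"` or `"TaskClassif"`. Determines which rpart learner is used to find the bin boundaries when `method` is `"tree"`.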
#' * `id` :: `character(1)`\cr
#' Identifier of resulting object, default `"encodepl"`.
#' * `param_vals` :: named `list`\cr
#' List of hyperparameter settings, overwriting the hyperparameter settings that would otherwise be set during construction. Default `list()`.
#'
#' @section Input and Output Channels:
#' Input and output channels are inherited from [`PipeOpTaskPreproc`].
#'
#' The output is the input [`Task`][mlr3::Task] with all affected `numeric` and `integer` columns replaced by their piecewise linear encoding, i.e. one `<column>.bin<k>` column per bin.
#'
#' @section State:
#' The `$state` is a named `list` with the `$state` elements inherited from [`PipeOpTaskPreproc`], as well as:
#' * `bins` :: named `list`\cr
#' Numeric vectors of bin boundaries, named by the affected column they were computed for.
#'
#' @section Parameters:
#' The parameters are the parameters inherited from [`PipeOpTaskPreproc`], as well as:
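#' * `method` :: `character(1)` \cr
#' Method used to determine the bin boundaries. Initialized to `"quantiles"`. One of:
#'   * `"quantiles"`: boundaries are the minimum, the `quantiles_numsplits - 1` equally spaced inner quantiles, and the maximum of the column.
#'   * `"tree"`: boundaries are the minimum, the split points of a decision tree (`LearnerRegrRpart` / `LearnerClassifRpart`) trained on the column alone, and the maximum of the column.
#' * `quantiles_numsplits` :: `integer(1)` \cr
#' Number of quantile bins to create when `method` is `"quantiles"`. Must be at least 2; default is 2.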
#'
#' @section Methods:
#' Only methods inherited from [`PipeOpTaskPreprocSimple`]/[`PipeOpTaskPreproc`]/[`PipeOp`].
#'
#' @references
#' `r format_bib("gorishniy_2022")`
#'
#' @family PipeOps
#' @template seealso_pipeopslist
#' @include PipeOpTaskPreproc.R
#' @export
#' @examples
#' library("mlr3")
#'
PipeOpEncodePL = R6Class("PipeOpEncodePL",
  inherit = PipeOpTaskPreprocSimple,
  public = list(
    # Construct the PipeOp.
    # * task_type: "TaskRegr" or "TaskClassif"; selects the rpart learner used for method = "tree"
    # * id: identifier of the resulting object
    # * param_vals: hyperparameter values overriding the ones set during construction
    initialize = function(task_type, id = "encodepl", param_vals = list()) {
      # NOTE: Might use different name, change assert, and conditions
      # NOTE(review): `task_type` is only needed for method = "tree". For
      # method = "quantiles" it could become optional (e.g. default to "Task"),
      # but mlr_reflections$task_types does not list "Task" as an acceptable
      # name, so the assert below would have to change as well.
      assert_choice(task_type, mlr_reflections$task_types$task)
      if (task_type == "TaskRegr") {
        private$.tree_learner = LearnerRegrRpart$new()
      } else if (task_type == "TaskClassif") {
        private$.tree_learner = LearnerClassifRpart$new()
      } else {
        stopf("Task type %s not supported", task_type)
      }

      private$.encodepl_param_set = ps(
        method = p_fct(levels = c("quantiles", "tree"), tags = c("train", "predict", "required")),
        quantiles_numsplits = p_int(lower = 2, default = 2, tags = c("train", "predict"), depends = quote(method == "quantiles"))
      )
      private$.encodepl_param_set$values = list(method = "quantiles")

      # The PipeOp's parameter set combines its own parameters with those of the tree learner.
      super$initialize(id, param_set = alist(encodepl = private$.encodepl_param_set, private$.tree_learner$param_set),
        param_vals = param_vals, packages = c("stats", private$.tree_learner$packages),
        task_type = task_type, tags = "encode", feature_types = c("numeric", "integer"))
    }
  ),
  private = list(

    .tree_learner = NULL,
    .encodepl_param_set = NULL,

    # Compute the bin boundaries for every selected column.
    # Returns `list(bins = <named list of numeric boundary vectors>)`; `bins` is
    # an empty list when no column is selected, so that `.transform()` always
    # finds a state of the same shape.
    .get_state = function(task) {
      cols = private$.select_cols(task)
      if (!length(cols)) {
        # Early exit: return an empty state rather than the task itself.
        return(list(bins = list()))
      }

      pv = private$.encodepl_param_set$values

      if (pv$method == "quantiles") {
        numsplits = pv$quantiles_numsplits %??% 2
        probs = seq(1, numsplits - 1) / numsplits
        # TODO: check that min / max is correct here (according to paper / implementation)
        bins = lapply(task$data(cols = cols), function(d) {
          # min / max ignore NAs just like quantile() does; otherwise a single
          # missing value would turn the outer boundaries into NA.
          unique(c(min(d, na.rm = TRUE), stats::quantile(d, probs, na.rm = TRUE), max(d, na.rm = TRUE)))
        })
      } else {
        learner = private$.tree_learner

        bins = list()
        for (col in cols) {
          # Train the tree on a task that contains only the current feature.
          t = task$clone(deep = TRUE)$select(col)
          # Column "index" of the model's split matrix holds the split points.
          boundaries = unname(sort(learner$train(t)$model$splits[, "index"]))

          d = task$data(cols = col)[[col]]
          bins[[col]] = c(min(d, na.rm = TRUE), boundaries, max(d, na.rm = TRUE))
        }
      }

      list(bins = bins)
    },

    # Replace every column that has stored bin boundaries by its piecewise
    # linear encoding; all other features are kept unchanged.
    .transform = function(task) {
      bins = self$state$bins
      cols = names(bins)
      if (!length(cols)) {
        return(task) # early exit
      }

      dt = task$data(cols = cols)
      res = as.data.table(imap(dt, function(d, col) encode_piecewise_linear(d, col, bins[[col]])))

      task$select(setdiff(task$feature_names, cols))$cbind(res)
    }
  )
)

# Register the PipeOp class with `mlr_pipeops` under the key "encodepl".
# NOTE(review): the third argument presumably supplies the constructor arguments
# (`task_type` has no default) used whenever the class has to be instantiated
# without user input -- confirm against the `mlr_pipeops$add()` documentation.
mlr_pipeops$add("encodepl", PipeOpEncodePL, list(task_type = "TaskRegr"))

# Helper function to implement piecewise linear encoding.
# * column: numeric vector
# * colname: name of `column`
# * bins as numeric vector of boundaries
#
# Returns a data.table with `length(column)` rows and one column per bin
# (`length(bins) - 1` columns), named `<colname>.bin<t>`. For bin `t` with
# boundaries `lower = bins[[t]]` and `upper = bins[[t + 1]]` an entry is
# * 0 if the value is below `lower`,
# * 1 if the value is at or above `upper`,
# * (value - lower) / (upper - lower) otherwise,
# * NA if the value itself is missing.
encode_piecewise_linear = function(column, colname, bins) {
  n_bins = length(bins) - 1

  # Start from all zeros; this is already the result for values below a bin.
  dt = data.table(matrix(0, length(column), n_bins))
  setnames(dt, paste0(colname, ".bin", seq_len(n_bins)))

  # Row indices are computed with which(), which drops NA comparisons. Indexing
  # `column` with a logical vector containing NA would instead yield extra NA
  # entries and make the assigned value longer than the selected rows.
  missing_rows = which(is.na(column))

  for (t in seq_len(n_bins)) {
    lower = bins[[t]]
    upper = bins[[t + 1]]

    full_rows = which(column >= upper)
    partial_rows = which(column >= lower & column < upper)

    if (length(full_rows)) {
      set(dt, i = full_rows, j = t, value = 1)
    }
    if (length(partial_rows)) {
      set(dt, i = partial_rows, j = t, value = (column[partial_rows] - lower) / (upper - lower))
    }
    if (length(missing_rows)) {
      # Keep missing values missing instead of encoding them like a value
      # below the lowest boundary.
      set(dt, i = missing_rows, j = t, value = NA_real_)
    }
  }

  dt
}
2 changes: 1 addition & 1 deletion R/PipeOpQuantileBin.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ PipeOpQuantileBin = R6Class("PipeOpQuantileBin",
initialize = function(id = "quantilebin", param_vals = list()) {
ps = ps(
numsplits = p_int(lower = 2, special_vals = list(NULL), tags = "train")
)
)
ps$values = list(numsplits = 2L)
super$initialize(id, param_set = ps, param_vals = param_vals, packages = "stats", feature_types = c("numeric", "integer"))
}
Expand Down
29 changes: 21 additions & 8 deletions R/bibentries.R
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@ bibentries = c(

han_2005 = bibentry("InProceedings",
doi = "10.1007/11538059_91",
author = "Han, Hui and Wang, Wen-Yuan and Mao, Bing-Huan",
editor = "Huang, De-Shuang and Zhang, Xiao-Ping and Huang, Guang-Bin",
author = "Hui Han and Wen-Yuan Wang and Bing-Huan Mao",
editor = "De-Shuang Huang and Xiao-Ping Zhang and Guang-Bin Huang",
title = "Borderline-SMOTE: A New Over-Sampling Method in Imbalanced Data Sets Learning",
booktitle = "Advances in Intelligent Computing",
year = "2005",
Expand All @@ -107,11 +107,24 @@ bibentries = c(
),

freeman_1979 = bibentry("InCollection",
author = "Freeman III, A Myrick",
title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
booktitle = "The Economics of Neighborhood",
year = "1979",
publisher = "Elsevier",
pages = "191--217"
doi = "10.1016/B978-0-12-636250-3.50015-5",
author = "A Myrick Freeman III",
title = "The Hedonic Price Approach to Measuring Demand for Neighborhood Characteristics",
booktitle = "The Economics of Neighborhood",
year = "1979",
publisher = "Elsevier",
pages = "191--217"
),


gorishniy_2022 = bibentry("InProceedings",
title = "On Embeddings for Numerical Features in Tabular Deep Learning",
volume = "35",
url = "https://proceedings.neurips.cc/paper_files/paper/2022/hash/9e9f0ffc3d836836ca96cbf8fe14b105-Abstract-Conference.html",
booktitle = "Advances in Neural Information Processing Systems",
author = "Yury Gorishniy and Ivan Rubachev and Artem Babenko",
year = "2022",
pages = "24991--25004"
)

)
16 changes: 16 additions & 0 deletions tests/testthat/test_pipeop_encodepl.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
context("PipeOpEncodePL")

# Runs the `expect_datapreproc_pipeop_class` helper against PipeOpEncodePL on
# the "mtcars" task (constructed with task_type = "TaskRegr") and on the "iris"
# task (constructed with task_type = "TaskClassif").
test_that("PipeOpEncodePL - basic properties", {
task = mlr_tasks$get("mtcars")
expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskRegr"), task = task)

task = mlr_tasks$get("iris")
# NOTE(review): no `task_type` is passed here although `PipeOpEncodePL$new()`
# declares it without a default -- presumably this call fails at construction
# time; confirm, then either pass `constargs` or drop the call.
expect_datapreproc_pipeop_class(PipeOpEncodePL, task = task)
expect_datapreproc_pipeop_class(PipeOpEncodePL, constargs = list(task_type = "TaskClassif"), task = task)
})

# Tests:
# - different methods
# - with params (not all for regtree, hopefully)
# - test on tasks with simple data that behaviour is as expected (compare dts)
# - TODO: decide how to handle NAs in feature columns and test that
Loading