From 54472222ae82818139f44a32134e19ce001e0cbd Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Wed, 16 Apr 2025 06:19:28 +0200 Subject: [PATCH 1/6] ... --- DESCRIPTION | 1 + NAMESPACE | 3 + NEWS.md | 2 + R/DataBackendLazyTensors.R | 105 +++++++++++++++++++ R/DataDescriptor.R | 14 +-- R/LearnerTorch.R | 1 + R/lazy_tensor.R | 13 +++ R/materialize.R | 10 +- R/shape.R | 2 +- R/task_dataset.R | 12 ++- R/utils.R | 12 +++ TODO.md | 22 ++++ tests/testthat/test_DataBackendLazyTensors.R | 41 ++++++++ tests/testthat/test_lazy_tensor.R | 2 - tests/testthat/test_materialize.R | 9 +- tests/testthat/test_shape.R | 2 + tests/testthat/test_utils.R | 1 + 17 files changed, 230 insertions(+), 22 deletions(-) create mode 100644 R/DataBackendLazyTensors.R create mode 100644 TODO.md create mode 100644 tests/testthat/test_DataBackendLazyTensors.R diff --git a/DESCRIPTION b/DESCRIPTION index b6dd4ba0d..e41e48f24 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -91,6 +91,7 @@ Collate: 'CallbackSetUnfreeze.R' 'ContextTorch.R' 'DataBackendLazy.R' + 'DataBackendLazyTensors.R' 'utils.R' 'DataDescriptor.R' 'LearnerTorch.R' diff --git a/NAMESPACE b/NAMESPACE index 0d0ebe087..5d6fbe5c1 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -12,6 +12,7 @@ S3method(as_lazy_tensor,DataDescriptor) S3method(as_lazy_tensor,dataset) S3method(as_lazy_tensor,numeric) S3method(as_lazy_tensor,torch_tensor) +S3method(as_lazy_tensors,dataset) S3method(as_torch_callback,R6ClassGenerator) S3method(as_torch_callback,TorchCallback) S3method(as_torch_callback,character) @@ -27,6 +28,7 @@ S3method(as_torch_optimizer,character) S3method(as_torch_optimizer,torch_optimizer_generator) S3method(c,lazy_tensor) S3method(col_info,DataBackendLazy) +S3method(col_info,DataBackendLazyTensors) S3method(format,lazy_tensor) S3method(hash_input,TorchIngressToken) S3method(hash_input,lazy_tensor) @@ -158,6 +160,7 @@ export(TorchLoss) export(TorchOptimizer) export(as_data_descriptor) export(as_lazy_tensor) +export(as_lazy_tensors) export(as_lr_scheduler) export(as_torch_callback) export(as_torch_callbacks) diff --git a/NEWS.md b/NEWS.md index a046df6ca..1999928f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -5,6 +5,8 @@ * feat: `TorchIngressToken` now also can take a `Selector` as argument `features`. * feat: Added encoders for numericals and categoricals * feat: Added `po("nn_fn")` for calling custom functions in a network. +* fix: `NA` is now a valid shape for lazy tensors. +* feat: `lazy_tensor`s of length 0 can now be materialized. # mlr3torch 0.2.1 diff --git a/R/DataBackendLazyTensors.R b/R/DataBackendLazyTensors.R new file mode 100644 index 000000000..8fb94f626 --- /dev/null +++ b/R/DataBackendLazyTensors.R @@ -0,0 +1,105 @@ + +#' @title Data Backend for Lazy Tensors +#' @description +#' Special **experimental** data backend that converts [`lazy_tensor`] columns to their R representation. +#' However, [`LearnerTorch`] can directly operate on the lazy tensors. +#' @export +DataBackendLazyTensors = R6Class("DataBackendLazyTensors", + cloneable = FALSE, + inherit = DataBackendDataTable, + public = list( + #' @description + #' Create a new instance of this [R6][R6::R6Class] class. + #' @param data (`data.table`)\cr + #' Data containing (among others) [`lazy_tensor`] columns. + #' @param primary_key (`character(1)`)\cr + #' Name of the column used as primary key. + #' @param converter (named `list()` of `function`s)\cr + #' A named list of functions that convert the lazy tensor columns to their R representation. + #' The names must be the names of the columns that need conversion. + #' @param cache (`character()`)\cr + #' Names of the columns that should be cached. + #' Per default, all columns that are converted are cached. + initialize = function(data, primary_key, converter, cache = names(converter)) { + private$.converter = assert_list(converter, types = "function", any.missing = FALSE) + assert_subset(names(converter), colnames(data)) + private$.cached_cols = assert_subset(cache, names(converter)) + walk(names(private$.converter), function(nm) { + if (!inherits(data[[nm]], "lazy_tensor")) { + stopf("Column '%s' is not a lazy tensor.", nm) + } + }) + super$initialize(data, primary_key) + # select the column whose name is stored in primary_key from private$.data but keep its name + private$.data_cache = private$.data[, primary_key, with = FALSE] + }, + data = function(rows, cols) { + rows = assert_integerish(rows, coerce = TRUE) + assert_names(cols, type = "unique") + + if (getOption("mlr3torch.data_loading", FALSE)) { + # no caching, no materialization as this is called in the training loop + return(super$data(rows, cols)) + } + if (all(cols %in% names(private$.data_cache))) { + cache_hit = private$.data_cache[list(rows), cols, on = self$primary_key, with = FALSE] + complete = complete.cases(cache_hit) + cache_hit = cache_hit[complete] + if (nrow(cache_hit) == length(rows)) { + return(cache_hit) + } + combined = rbindlist(list(cache_hit, private$.load_and_cache(rows[!complete], cols))) + reorder = vector("integer", nrow(combined)) + reorder[complete] = seq_len(nrow(cache_hit)) + reorder[!complete] = nrow(cache_hit) + seq_len(nrow(combined) - nrow(cache_hit)) + return(combined[reorder]) + } + + private$.load_and_cache(rows, cols) + }, + head = function(n = 6L) { + if (getOption("mlr3torch.data_loading", FALSE)) { + return(super$head(n)) + } + + self$data(n, self$colnames) + }, + missings = function(rows, cols) { + set_names(rep(0L, length(cols)), cols) + } + ), + private = list( + # call this function only with rows that are not in the cache yet + .load_and_cache = function(rows, cols) { + # Process columns that need conversion + tbl = super$data(rows, cols) + for (nm in intersect(names(private$.converter), names(tbl))) { + converted = private$.converter[[nm]](materialize(tbl[[nm]], rbind = TRUE)) + tbl[[nm]] = converted + + if (nm %in% private$.cached_cols) { + set(private$.data_cache, i = rows, j = nm, value = converted) + } + } + return(tbl) + }, + .data_cache = NULL, + .converter = NULL, + .cached_cols = NULL + ) +) + +#' @export +col_info.DataBackendLazyTensors = function(x, ...) { # nolint + first_row = x$head(1L) + types = map_chr(first_row, function(x) class(x)[1L]) + discrete = setdiff(names(types)[types %chin% c("factor", "ordered")], x$primary_key) + levels = insert_named(named_list(names(types)), map(first_row[discrete], levels)) + data.table(id = names(types), type = unname(types), levels = levels, key = "id") +} + +#' @export +as_data_backend.dataset = function(x, dataset_shapes, primary_key ...) { + + +} \ No newline at end of file diff --git a/R/DataDescriptor.R b/R/DataDescriptor.R index 1bf3cd68d..11410b480 100644 --- a/R/DataDescriptor.R +++ b/R/DataDescriptor.R @@ -60,14 +60,7 @@ DataDescriptor = R6Class("DataDescriptor", # For simplicity we here require the first dimension of the shape to be NA so we don't have to deal with it, # e.g. during subsetting - if (is.null(dataset_shapes)) { - if (is.null(dataset$.getbatch)) { - stopf("dataset_shapes must be provided if dataset does not have a `.getbatch` method.") - } - dataset_shapes = infer_shapes_from_getbatch(dataset) - } else { - assert_compatible_shapes(dataset_shapes, dataset) - } + dataset_shapes = get_or_check_dataset_shapes(dataset, dataset_shapes) if (is.null(graph)) { # avoid name conflicts @@ -84,8 +77,7 @@ DataDescriptor = R6Class("DataDescriptor", assert_true(length(graph$pipeops) >= 1L) } # no preprocessing, dataset returns only a single element (there we can infer a lot) - simple_case = length(graph$pipeops) == 1L && inherits(graph$pipeops[[1L]], "PipeOpNOP") && - length(dataset_shapes) == 1L + simple_case = (length(graph$pipeops) == 1L) && inherits(graph$pipeops[[1L]], "PipeOpNOP") if (is.null(input_map) && nrow(graph$input) == 1L && length(dataset_shapes) == 1L) { input_map = names(dataset_shapes) @@ -100,7 +92,7 @@ DataDescriptor = R6Class("DataDescriptor", assert_choice(pointer[[2]], graph$pipeops[[pointer[[1]]]]$output$name) } if (is.null(pointer_shape) && simple_case) { - pointer_shape = dataset_shapes[[1L]] + pointer_shape = dataset_shapes[[input_map]] } else { assert_shape(pointer_shape, null_ok = TRUE) } diff --git a/R/LearnerTorch.R b/R/LearnerTorch.R index 865ac733e..7829c91f3 100644 --- a/R/LearnerTorch.R +++ b/R/LearnerTorch.R @@ -93,6 +93,7 @@ #' * `.index` are the indices of the batch (`integer()` or a `torch_int()`). #' #' Moreover, one needs to pay attention respect the row ids of the provided task. +#' It is strongly recommended to use the [`task_dataset`] class to create the dataset. #' #' It is also possible to overwrite the private `.dataloader()` method. #' This must respect the dataloader parameters from the [`ParamSet`][paradox::ParamSet]. diff --git a/R/lazy_tensor.R b/R/lazy_tensor.R index d050f8545..079cf436d 100644 --- a/R/lazy_tensor.R +++ b/R/lazy_tensor.R @@ -197,6 +197,19 @@ as_lazy_tensor.torch_tensor = function(x, ...) { # nolint as_lazy_tensor(ds, dataset_shapes = list(x = c(NA, dim(x)[-1]))) } +#' @export +as_lazy_tensors = function(x, ...) { + UseMethod("as_lazy_tensors") +} + +#' @export +as_lazy_tensors.dataset = function(x, dataset_shapes = NULL, ...) { + dataset_shapes = get_or_check_dataset_shapes(x, dataset_shapes) + set_names(map_dtc(names(dataset_shapes), function(shape) { + as_lazy_tensor(x, dataset_shapes = dataset_shapes, input_map = shape) + }), names(dataset_shapes)) +} + #' Assert Lazy Tensor #' #' Asserts whether something is a lazy tensor. diff --git a/R/materialize.R b/R/materialize.R index 849024ad4..185baa5b2 100644 --- a/R/materialize.R +++ b/R/materialize.R @@ -44,6 +44,13 @@ materialize = function(x, device = "cpu", rbind = FALSE, ...) { assert_choice(device, mlr_reflections$torch$devices) assert_flag(rbind) + if (length(x) == 0L) { + if (rbind) { + return(torch_empty(0L)) + } else { + return(list()) + } + } UseMethod("materialize") } @@ -154,9 +161,6 @@ get_output = function(input, graph, varying_shapes, rbind, device) { #' @return [`lazy_tensor()`] #' @keywords internal materialize_internal = function(x, device = "cpu", cache = NULL, rbind) { - if (!length(x)) { - stopf("Cannot materialize lazy tensor of length 0.") - } do_caching = !is.null(cache) ids = map_int(x, 1) diff --git a/R/shape.R b/R/shape.R index d1fdda83d..7970c37ec 100644 --- a/R/shape.R +++ b/R/shape.R @@ -30,7 +30,7 @@ test_shape = function(shape, null_ok = FALSE, unknown_batch = NULL, len = NULL) if (is.null(shape) && null_ok) { return(TRUE) } - ok = test_integerish(shape, min.len = 2L, all.missing = FALSE, any.missing = TRUE, len = len) + ok = test_integerish(shape, min.len = 1L, any.missing = TRUE, len = len) if (!ok) { return(FALSE) diff --git a/R/task_dataset.R b/R/task_dataset.R index af52519ca..07d373465 100644 --- a/R/task_dataset.R +++ b/R/task_dataset.R @@ -81,13 +81,21 @@ task_dataset = dataset("task_dataset", .getbatch = function(index) { cache = if (self$cache_lazy_tensors) new.env() - datapool = self$task$data(rows = self$task$row_ids[index], cols = self$all_features) + datapool = withr::with_options(list(mlr3torch.data_loading = TRUE), { + self$task$data(rows = self$task$row_ids[index], cols = self$all_features) + }) + x = lapply(self$feature_ingress_tokens, function(it) { it$batchgetter(datapool[, it$features, with = FALSE], cache = cache) }) y = if (!is.null(self$target_batchgetter)) { - self$target_batchgetter(datapool[, self$task$target_names, with = FALSE]) + target = datapool[, self$task$target_names, with = FALSE] + if (!inherits(target[[1L]], "lazy_tensor")) { + self$target_batchgetter(target) + } else { + materialize(target[[1L]], rbind = TRUE) + } } out = list(x = x, .index = torch_tensor(index, dtype = torch_long())) if (!is.null(y)) out$y = y diff --git a/R/utils.R b/R/utils.R index 74ae570e3..684d8993b 100644 --- a/R/utils.R +++ b/R/utils.R @@ -307,3 +307,15 @@ infer_shapes = function(shapes_in, param_vals, output_names, fn, rowwise, id) { set_names(list(sout), output_names) } + +get_or_check_dataset_shapes = function(dataset, dataset_shapes) { + if (is.null(dataset_shapes)) { + if (is.null(dataset$.getbatch)) { + stopf("dataset_shapes must be provided if dataset does not have a `.getbatch` method.") + } + dataset_shapes = infer_shapes_from_getbatch(dataset) + } else { + assert_compatible_shapes(dataset_shapes, dataset) + } + dataset_shapes +} \ No newline at end of file diff --git a/TODO.md b/TODO.md new file mode 100644 index 000000000..cb92919fe --- /dev/null +++ b/TODO.md @@ -0,0 +1,22 @@ +* Add `as_lazy_tensors()` +* Make it easier to se +* Fix the bug that the shapes are reported as unknown below and make the code easier. + ```r + ds = dataset("test", + initialize = function() { + self$x = torch_randn(100, 10) + self$y = torch_randn(100, 1) + }, + .getitem = function(i) { + list(x = self$x[i, ], y = self$y[i]) + }, + .length = function() { + nrow(self$x) + } + )() + x_lt = as_lazy_tensor(ds, list(x = c(NA, 10), y = c(NA, 1)), input_map = "x") + y_lt = as_lazy_tensor(ds, list(x = c(NA, 10), y = c(NA, 1)), input_map = "y") + + tbl = data.table(x = x_lt, y = y_lt) + ``` +* Add checks on usage of `DataBackendLazyTensors` in `task_dataset` \ No newline at end of file diff --git a/tests/testthat/test_DataBackendLazyTensors.R b/tests/testthat/test_DataBackendLazyTensors.R new file mode 100644 index 000000000..614ee5fd4 --- /dev/null +++ b/tests/testthat/test_DataBackendLazyTensors.R @@ -0,0 +1,41 @@ +test_that("DataBackendDataset", { + ds = dataset( + initialize = function() { + self$x = torch_randn(100, 10) + self$y = torch_randn(100, 1) + }, + .getitem = function(i) { + list(x = self$x[i, ], y = self$y[i]) + }, + .length = function() { + nrow(self$x) + } + )() + + tbl = as_lazy_tensors(ds, list(x = c(NA, 10), y = c(NA, 1))) + tbl$row_id = 1:100 + + be = DataBackendLazyTensors$new(tbl, primary_key = "row_id", converter = list(y = as.numeric), cache = "y") + + expect_data_backend(be) + + be$data(1, "y") + + be$data(2, c("x", "y")) + + be$head() + + + withr::with_options(list(mlr3torch.data_loading = TRUE), { + be$data(1, c("x", "y")) + }) + + learner = lrn("regr.mlp", batch_size = 32, epochs = 1) + + task = as_task_regr(be, target = "y") + learner$train(task) +}) + +test_that("mlp works with it", { + learner = lrn("classif.mlp") +}) diff --git a/tests/testthat/test_lazy_tensor.R b/tests/testthat/test_lazy_tensor.R index b74208f8c..95881b083 100644 --- a/tests/testthat/test_lazy_tensor.R +++ b/tests/testthat/test_lazy_tensor.R @@ -3,8 +3,6 @@ test_that("prototype", { expect_class(proto, "lazy_tensor") expect_true(length(proto) == 0L) expect_error(dd(proto)) - - expect_error(materialize(lazy_tensor()), "Cannot materialize") }) test_that("input checks", { diff --git a/tests/testthat/test_materialize.R b/tests/testthat/test_materialize.R index 170f673a4..f1fc5a8ae 100644 --- a/tests/testthat/test_materialize.R +++ b/tests/testthat/test_materialize.R @@ -17,8 +17,6 @@ test_that("materialize works on lazy_tensor", { expect_equal(torch_cat(map(output_meta_list, function(x) x$unsqueeze(1)), dim = 1L)$shape, output_meta_tnsr$shape) expect_true(output_meta_tnsr$device == torch_device("meta")) - - expect_error(materialize(lazy_tensor()), "Cannot materialize ") }) test_that("materialize works with differing shapes (hence uses .getitem)", { @@ -75,7 +73,7 @@ test_that("materialize works with same shapes and .getitem method", { }) test_that("materialize_internal works", { - expect_error(materialize_internal(lazy_tensor()), "Cannot materialize ") + expect_error(materialize_internal(lazy_tensor()), "Cannot access data descriptor") task = tsk("lazy_iris") x = task$data(1:2, cols = "x")[[1L]] res1 = materialize(x) @@ -184,3 +182,8 @@ test_that("PipeOpFeatureUnion can properly check whether two lazy tensors are id expect_error(graph$train(task), "cannot aggregate different features sharing") }) + +test_that("0-length", { + expect_equal(torch_empty(0L), materialize(lazy_tensor(), rbind = TRUE)) + expect_equal(list(), materialize(lazy_tensor(), rbind = FALSE)) +}) diff --git a/tests/testthat/test_shape.R b/tests/testthat/test_shape.R index dbafcb4dc..b1670a96e 100644 --- a/tests/testthat/test_shape.R +++ b/tests/testthat/test_shape.R @@ -21,4 +21,6 @@ test_that("assert_shape and friends", { expect_error(assert_shape(c(NA, 1, 2), len = 2)) # NULL is ok even when len is specified expect_true(check_shape(NULL, null_ok = TRUE, len = 2)) + # NA is valid shape + expect_true(check_shape(NA)) }) diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R index bfd20f409..fdc4f13f4 100644 --- a/tests/testthat/test_utils.R +++ b/tests/testthat/test_utils.R @@ -61,6 +61,7 @@ test_that("order_named_args works", { expect_error(order_named_args(function(..., x) NULL, list(2, 3, x = 1)), regexp = "`...` must") expect_error(order_named_args(function(y, ..., x) NULL, list(y = 4, 2, 3, x = 1)), regexp = "`...` must") }) + test_that("shape_to_str works", { expect_equal(shape_to_str(1), "(1)") expect_equal(shape_to_str(c(1, 2)), "(1,2)") From b012de9e51bb9349809076814d06dadb7ce234f3 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Wed, 16 Apr 2025 10:17:19 +0200 Subject: [PATCH 2/6] ... --- TODO.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TODO.md b/TODO.md index cb92919fe..ac96fb010 100644 --- a/TODO.md +++ b/TODO.md @@ -19,4 +19,6 @@ tbl = data.table(x = x_lt, y = y_lt) ``` -* Add checks on usage of `DataBackendLazyTensors` in `task_dataset` \ No newline at end of file +* Add checks on usage of `DataBackendLazyTensors` in `task_dataset` +* Add optimization that truths values don't have to be loaded twice during resampling, i.e. + once for making the predictions and once for retrieving the truth column. \ No newline at end of file From 521fdd7905686dda67d4dd9ae2c62bab28a83c1e Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Wed, 16 Apr 2025 17:06:54 +0200 Subject: [PATCH 3/6] ... --- NAMESPACE | 3 + R/DataBackendLazyTensors.R | 71 +++++-- R/lazy_tensor.R | 10 + R/materialize.R | 28 ++- R/utils.R | 5 +- man/DataBackendLazyTensors.Rd | 104 +++++++++ man/mlr_learners_torch.Rd | 1 + tests/testthat/test_DataBackendLazyTensors.R | 210 +++++++++++++++++-- 8 files changed, 394 insertions(+), 38 deletions(-) create mode 100644 man/DataBackendLazyTensors.Rd diff --git a/NAMESPACE b/NAMESPACE index 5d6fbe5c1..194cbde74 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ S3method("[[<-",lazy_tensor) S3method(as.data.table,DictionaryMlr3torchCallbacks) S3method(as.data.table,DictionaryMlr3torchLosses) S3method(as.data.table,DictionaryMlr3torchOptimizers) +S3method(as_data_backend,dataset) S3method(as_data_descriptor,dataset) S3method(as_lazy_tensor,DataDescriptor) S3method(as_lazy_tensor,dataset) @@ -29,6 +30,7 @@ S3method(as_torch_optimizer,torch_optimizer_generator) S3method(c,lazy_tensor) S3method(col_info,DataBackendLazy) S3method(col_info,DataBackendLazyTensors) +S3method(distinct_values,lazy_tensor) S3method(format,lazy_tensor) S3method(hash_input,TorchIngressToken) S3method(hash_input,lazy_tensor) @@ -71,6 +73,7 @@ export(CallbackSetTB) export(CallbackSetUnfreeze) export(ContextTorch) export(DataBackendLazy) +export(DataBackendLazyTensors) export(DataDescriptor) export(LearnerTorch) export(LearnerTorchFeatureless) diff --git a/R/DataBackendLazyTensors.R b/R/DataBackendLazyTensors.R index 8fb94f626..d02897323 100644 --- a/R/DataBackendLazyTensors.R +++ b/R/DataBackendLazyTensors.R @@ -1,9 +1,37 @@ -#' @title Data Backend for Lazy Tensors +#' @title Special Backend for Lazy Tensors #' @description -#' Special **experimental** data backend that converts [`lazy_tensor`] columns to their R representation. -#' However, [`LearnerTorch`] can directly operate on the lazy tensors. +#' This backend essentially allows you to use a [`torch::dataset`] directly with +#' an [`mlr3::Learner`]. +#' +#' * The data cannot contain missing values, as [`lazy_tensor`]s do not support them. +#' For this reason, calling `$missings()` will always return `0` for all columns. +#' * The `$distinct()` method will consider two lazy tensors that refer to the same element of a +#' [`DataDescriptor`] to be identical. +#' This means, that it might be underreporting the number of distinct values of lazy tensor columns. +#' #' @export +#' @examplesIf torch::torch_is_installed() +#' # used as feature in all backends +#' x = torch_randn(100, 10) +#' # regression +#' ds_regr = tensor_dataset(x = x, y = torch_randn(100, 1)) +#' be_regr = as_data_backend(ds_regr, converter = list(y = as.numeric)) +#' be_regr$head() +#' +#' +#' # binary classification: underlying target tensor must be float in [0, 1] +#' ds_binary = tensor_dataset(x = x, y = torch_randint(0, 2, c(100, 1))$float()) +#' be_binary = as_data_backend(ds_binary, converter = list( +#' y = function(x) factor(as.integer(x), levels = c(0, 1), labels = c("A", "yes")) +#' )) +#' be_binary$head() +#' +#' # multi-class classification: underlying target tensor must be integer in [1, K] +#' ds_multiclass = tensor_dataset(x = x, y = torch_randint(1, 4, size = c(100, 1))) +#' be_multiclass = as_data_backend(ds_multiclass, converter = list(y = as.numeric)) +#' be_multiclass$head() + DataBackendLazyTensors = R6Class("DataBackendLazyTensors", cloneable = FALSE, inherit = DataBackendDataTable, @@ -62,7 +90,7 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", return(super$head(n)) } - self$data(n, self$colnames) + self$data(seq_len(n), self$colnames) }, missings = function(rows, cols) { set_names(rep(0L, length(cols)), cols) @@ -73,8 +101,14 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", .load_and_cache = function(rows, cols) { # Process columns that need conversion tbl = super$data(rows, cols) - for (nm in intersect(names(private$.converter), names(tbl))) { - converted = private$.converter[[nm]](materialize(tbl[[nm]], rbind = TRUE)) + cols_to_convert = intersect(names(private$.converter), names(tbl)) + tbl_to_mat = tbl[, cols_to_convert, with = FALSE] + tbl_mat = materialize(tbl_to_mat, rbind = TRUE) + + if (!length(rows)) browser() + + for (nm in cols_to_convert) { + converted = private$.converter[[nm]](tbl_mat[[nm]]) tbl[[nm]] = converted if (nm %in% private$.cached_cols) { @@ -90,16 +124,27 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", ) #' @export -col_info.DataBackendLazyTensors = function(x, ...) { # nolint - first_row = x$head(1L) - types = map_chr(first_row, function(x) class(x)[1L]) - discrete = setdiff(names(types)[types %chin% c("factor", "ordered")], x$primary_key) - levels = insert_named(named_list(names(types)), map(first_row[discrete], levels)) - data.table(id = names(types), type = unname(types), levels = levels, key = "id") +as_data_backend.dataset = function(x, dataset_shapes, ...) { + tbl = as_lazy_tensors(x, dataset_shapes, ...) + tbl$row_id = seq_len(nrow(tbl)) + DataBackendLazyTensors$new(tbl, primary_key = "row_id", ...) } #' @export -as_data_backend.dataset = function(x, dataset_shapes, primary_key ...) { +as_task_classif.dataset = function(x, dataset_shapes, target, ...) { + # TODO +} +#' @export +as_task_regr.dataset = function(x, dataset_shapes, target, converter, ...) { + # TODO +} +#' @export +col_info.DataBackendLazyTensors = function(x, ...) { # nolint + first_row = x$head(1L) + types = map_chr(first_row, function(x) class(x)[1L]) + discrete = setdiff(names(types)[types %chin% c("factor", "ordered")], x$primary_key) + levels = insert_named(named_list(names(types)), map(first_row[, discrete, with = FALSE], levels)) + data.table(id = names(types), type = unname(types), levels = levels, key = "id") } \ No newline at end of file diff --git a/R/lazy_tensor.R b/R/lazy_tensor.R index 079cf436d..00b397575 100644 --- a/R/lazy_tensor.R +++ b/R/lazy_tensor.R @@ -352,3 +352,13 @@ rep.lazy_tensor = function(x, ...) { rep_len.lazy_tensor = function(x, ...) { set_class(NextMethod(), c("lazy_tensor", "list")) } + + +#' @export +distinct_values.lazy_tensor = function(x, drop = TRUE, na_rm = TRUE) { + if (!length(x)) { + return(x) + } + ids = distinct_values(map_int(x, 1)) + lazy_tensor(dd(x), ids) +} \ No newline at end of file diff --git a/R/materialize.R b/R/materialize.R index 185baa5b2..a2294b47a 100644 --- a/R/materialize.R +++ b/R/materialize.R @@ -44,13 +44,6 @@ materialize = function(x, device = "cpu", rbind = FALSE, ...) { assert_choice(device, mlr_reflections$torch$devices) assert_flag(rbind) - if (length(x) == 0L) { - if (rbind) { - return(torch_empty(0L)) - } else { - return(list()) - } - } UseMethod("materialize") } @@ -70,6 +63,13 @@ materialize.list = function(x, device = "cpu", rbind = FALSE, cache = "auto", .. map(x, function(col) { if (is_lazy_tensor(col)) { + if (length(col) == 0L) { + if (rbind) { + return(torch_empty(0L)) + } else { + return(list()) + } + } materialize_internal(col, device = device, cache = cache, rbind = rbind) } else { col @@ -83,12 +83,26 @@ materialize.list = function(x, device = "cpu", rbind = FALSE, cache = "auto", .. #' @method materialize data.frame #' @export materialize.data.frame = function(x, device = "cpu", rbind = FALSE, cache = "auto", ...) { # nolint + if (nrow(x) == 0L) { + if (rbind) { + set_names(replicate(ncol(x), torch_empty(0L)), names(x)) + } else { + set_names(replicate(ncol(x), list()), names(x)) + } + } materialize(as.list(x), device = device, rbind = rbind, cache = cache) } #' @export materialize.lazy_tensor = function(x, device = "cpu", rbind = FALSE, ...) { # nolint + if (length(x) == 0L) { + if (rbind) { + return(torch_empty(0L)) + } else { + return(list()) + } + } materialize_internal(x = x, device = device, cache = NULL, rbind = rbind) } diff --git a/R/utils.R b/R/utils.R index 684d8993b..4ec8f8030 100644 --- a/R/utils.R +++ b/R/utils.R @@ -198,7 +198,10 @@ list_to_batch = function(tensors) { } auto_cache_lazy_tensors = function(lts) { - any(duplicated(map_chr(lts, function(x) dd(x)$dataset_hash))) + if (length(lts) <= 1L) { + FALSE + } + anyDuplicated(unlist(map_if(lts, function(x) length(x) > 0, function(x) dd(x)$dataset_hash))) } #' Replace the head of a network diff --git a/man/DataBackendLazyTensors.Rd b/man/DataBackendLazyTensors.Rd new file mode 100644 index 000000000..6ae1817a4 --- /dev/null +++ b/man/DataBackendLazyTensors.Rd @@ -0,0 +1,104 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/DataBackendLazyTensors.R +\name{DataBackendLazyTensors} +\alias{DataBackendLazyTensors} +\title{Data Backend for Lazy Tensors} +\description{ +Special \strong{experimental} data backend that converts \code{\link{lazy_tensor}} columns to their R representation. +However, \code{\link{LearnerTorch}} can directly operate on the lazy tensors. +\itemize{ +\item The data cannot contain missing values, as \code{\link{lazy_tensor}}s do not support them. +For this reason, calling \verb{$missings()} will always return \code{0} for all columns. +\item The \verb{$distinct()} method will consider two lazy tensors that refer to the same element of a +\code{\link{DataDescriptor}} to be identical. +This means, that it might be underreporting the number of distinct values of lazy tensor columns. +} +} +\examples{ +# regression +ds = tensor_dataset(x = torch_randn(100, 10), y = torch_randn(100, 1)) +tbl = as +} +\section{Super classes}{ +\code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{\link[mlr3:DataBackendDataTable]{mlr3::DataBackendDataTable}} -> \code{DataBackendLazyTensors} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-DataBackendLazyTensors-new}{\code{DataBackendLazyTensors$new()}} +\item \href{#method-DataBackendLazyTensors-data}{\code{DataBackendLazyTensors$data()}} +\item \href{#method-DataBackendLazyTensors-head}{\code{DataBackendLazyTensors$head()}} +\item \href{#method-DataBackendLazyTensors-missings}{\code{DataBackendLazyTensors$missings()}} +} +} +\if{html}{\out{ +
Inherited methods + +
+}} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendLazyTensors-new}{}}} +\subsection{Method \code{new()}}{ +Create a new instance of this \link[R6:R6Class]{R6} class. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendLazyTensors$new( + data, + primary_key, + converter, + cache = names(converter) +)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{data}}{(\code{data.table})\cr +Data containing (among others) \code{\link{lazy_tensor}} columns.} + +\item{\code{primary_key}}{(\code{character(1)})\cr +Name of the column used as primary key.} + +\item{\code{converter}}{(named \code{list()} of \code{function}s)\cr +A named list of functions that convert the lazy tensor columns to their R representation. +The names must be the names of the columns that need conversion.} + +\item{\code{cache}}{(\code{character()})\cr +Names of the columns that should be cached. +Per default, all columns that are converted are cached.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendLazyTensors-data}{}}} +\subsection{Method \code{data()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendLazyTensors$data(rows, cols)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendLazyTensors-head}{}}} +\subsection{Method \code{head()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendLazyTensors$head(n = 6L)}\if{html}{\out{
}} +} + +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-DataBackendLazyTensors-missings}{}}} +\subsection{Method \code{missings()}}{ +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{DataBackendLazyTensors$missings(rows, cols)}\if{html}{\out{
}} +} + +} +} diff --git a/man/mlr_learners_torch.Rd b/man/mlr_learners_torch.Rd index 3f1d3e6d0..cbb497d59 100644 --- a/man/mlr_learners_torch.Rd +++ b/man/mlr_learners_torch.Rd @@ -183,6 +183,7 @@ For networks with more than one input, the names must correspond to the inputs o } Moreover, one needs to pay attention respect the row ids of the provided task. +It is strongly recommended to use the \code{\link{task_dataset}} class to create the dataset. } It is also possible to overwrite the private \code{.dataloader()} method. diff --git a/tests/testthat/test_DataBackendLazyTensors.R b/tests/testthat/test_DataBackendLazyTensors.R index 614ee5fd4..c248c9b67 100644 --- a/tests/testthat/test_DataBackendLazyTensors.R +++ b/tests/testthat/test_DataBackendLazyTensors.R @@ -1,41 +1,217 @@ -test_that("DataBackendDataset", { +test_that("correct input checks", { + +}) + +test_that("main API works", { + # regression target + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1)) + ) + + be = as_data_backend(ds, converter = list(y = as.numeric), dataset_shapes = list(x = c(NA, 1), y = c(NA, 1))) + + # converted data + + batch1 = be$data(1, c("x", "y")) + expect_class(batch1$x, "lazy_tensor") + expect_equal(length(batch1$x), 1) + expect_equal(materialize(batch1$x, rbind = TRUE), torch_tensor(matrix(100L, nrow = 1, ncol = 1))) + expect_equal(batch1$y, 1) + + batch2 = be$data(2:1, c("x", "y")) + expect_class(batch2$x, "lazy_tensor") + expect_equal(length(batch2$x), 2) + expect_equal(materialize(batch2$x, rbind = TRUE), torch_tensor(matrix(100:99, nrow = 2, ncol = 1))) + expect_equal(batch2$y, c(2, 1)) + + # lt data + batch_lt1 = withr::with_options(list(mlr3torch.data_loading = TRUE), { + be$data(1, c("x", "y")) + }) + expect_class(batch_lt1$x, "lazy_tensor") + expect_equal(length(batch_lt1$x), 1) + expect_equal(materialize(batch_lt1$x, rbind = TRUE), torch_tensor(matrix(100L, nrow = 1, ncol = 1))) + # y is still a lazy tensor + expect_class(batch_lt1$y, "lazy_tensor") + expect_equal(length(batch_lt1$y), 1) + + batch_lt2 = withr::with_options(list(mlr3torch.data_loading = TRUE), { + be$data(2:1, c("x", "y")) + }) + expect_class(batch_lt2$x, "lazy_tensor") + expect_equal(length(batch_lt2$x), 2) + expect_equal(materialize(batch_lt2$x, rbind = TRUE), torch_tensor(matrix(100:99, nrow = 2, ncol = 1))) + # y is still a lazy tensor + expect_class(batch_lt2$y, "lazy_tensor") + expect_equal(length(batch_lt2$y), 2) + + # missings + expect_equal(be$missings(1:100, c("y", "x")), c(y = 0, x = 0)) + expect_equal(be$missings(1:100, "y"), c(y = 0)) + expect_equal(be$missings(1:100, "x"), c(x = 0)) + + # head + tbl = be$head(n = 3) + expect_data_table(tbl, nrow = 3, ncol = 3) + expect_class(tbl$x, "lazy_tensor") + expect_equal(materialize(tbl$x, rbind = TRUE), torch_tensor(matrix(100:98, nrow = 3, ncol = 1))) + expect_class(tbl$y, "numeric") + expect_equal(tbl$row_id, as.numeric(1:3)) + expect_class(tbl$row_id, "integer") + expect_equal(tbl$row_id, 1:3) + + # distinct values: this can be expensive + dist = be$distinct(1:3, c("x", "y", "row_id")) + expect_list(dist, len = 3) + expect_equal(materialize(dist$x, rbind = TRUE), torch_tensor(matrix(100:98, nrow = 3, ncol = 1))) + expect_equal(dist$y, c(1, 2, 3)) + expect_equal(dist$row_id, 1:3) +}) + +test_that("classif target works", { ds = dataset( initialize = function() { - self$x = torch_randn(100, 10) - self$y = torch_randn(100, 1) + self$x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)) + self$y = torch_tensor(matrix(rep(c(0, 1), each = 50), nrow = 100, ncol = 1)) }, .getitem = function(i) { - list(x = self$x[i, ], y = self$y[i]) + list(x = self$x[i], y = self$y[i]) }, .length = function() { nrow(self$x) } )() - tbl = as_lazy_tensors(ds, list(x = c(NA, 10), y = c(NA, 1))) + tbl = as_lazy_tensors(ds, list(x = c(NA, 1), y = c(NA, 1))) tbl$row_id = 1:100 - be = DataBackendLazyTensors$new(tbl, primary_key = "row_id", converter = list(y = as.numeric), cache = "y") + be = DataBackendLazyTensors$new(tbl, primary_key = "row_id", converter = list( + y = function(x) factor(as.integer(x), levels = c(0, 1), labels = c("yes", "no")) + )) + batch = be$data(c(1, 2, 51, 52), c("x", "y", "row_id")) + expect_class(batch$y, "factor") + expect_equal(batch$y, factor(c("yes", "yes", "no", "no"), levels = c("yes", "no"))) - expect_data_backend(be) + batch_lt = withr::with_options(list(mlr3torch.data_loading = TRUE), { + be$data(c(1, 2, 51, 52), c("x", "y", "row_id")) + }) + expect_class(batch_lt$y, "lazy_tensor") + expect_equal(length(batch_lt$y), 4) + expect_equal(materialize(batch_lt$y, rbind = TRUE), torch_tensor(matrix(c(1, 1, 0, 0), nrow = 4, ncol = 1))) +}) - be$data(1, "y") +test_that("errors when weird preprocessing", { + # test following example pipeops: + # - target trafo + # - fix factors + # - smote - be$data(2, c("x", "y")) +}) - be$head() +test_that("caching works", { + dsc = dataset( + initialize = function() { + self$x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)) + self$y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1)) + self$counter = 0 + }, + .getitem = function(i) { + self$counter = self$counter + 1 + list(x = self$x[i], y = self$y[i]) + }, + .length = function() { + nrow(self$x) + } + ) + ds = dsc() - withr::with_options(list(mlr3torch.data_loading = TRUE), { - be$data(1, c("x", "y")) - }) + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.integer), cache = "y") + + check = function(be, ds, rows, cols, n) { + counter_prev = ds$counter + tbl = be$data(rows, cols) + observed_n = ds$counter - counter_prev + expect_equal(observed_n, n) + + expect_equal(materialize(tbl$x, rbind = TRUE), ds$x[rows]) + expect_equal(tbl$y, as.integer(ds$y[rows])) + } + check(be, ds, 1, c("x", "y"), 1) + # y is no in the cache, so .getitem() is not called on $data() + check(be, ds, 1, "y", 0) + + # but x is not cached, so we still need to call .getitem below + check(be, ds, 1, c("x", "y"), 1) + + # more than one row also works + check(be, ds, 2:1, "y", 1) + check(be, ds, c(3, 1), "y", 1) + check(be, ds, 1:3, "y", 0) + + # when caching more than one, we materialize only once per batch + be2 = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.integer, x = as.integer), cache = c("y", "x")) - learner = lrn("regr.mlp", batch_size = 32, epochs = 1) + check2 = function(be, ds, rows, cols, n) { + counter_prev = ds$counter + tbl = be$data(rows, cols) + observed_n = ds$counter - counter_prev + expect_equal(observed_n, n) + expect_equal(tbl$y, as.integer(ds$y[rows])) + expect_equal(tbl$x, as.integer(ds$x[rows])) + } + + check2(be2, ds, 1, c("x", "y"), 1) + check2(be2, ds, 1, c("x", "y"), 0) + check2(be2, ds, 2:1, c("x", "y"), 1) + check2(be2, ds, 2, c("x", "y"), 0) +}) + +test_that("can train a regression learner", { + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + ) + + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.numeric)) task = as_task_regr(be, target = "y") - learner$train(task) + + learner = lrn("regr.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + rr = resample(task, learner, rsmp("insample")) + expect_true(rr$aggregate(msr("regr.rmse")) < 3) }) -test_that("mlp works with it", { - learner = lrn("classif.mlp") +test_that("can train a binary classification learner", { + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + ) + + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.numeric)) + task = as_task_regr(be, target = "y") + + learner = lrn("regr.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + rr = resample(task, learner, rsmp("insample")) + expect_true(rr$aggregate(msr("regr.rmse")) < 3) }) + +test_that("can train a multiclass classification learner", { + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(matrix(rep(c(0, 1), each = 50), nrow = 100, ncol = 1))$float() + ) + + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = function(x) factor(as.integer(x), levels = c(0, 1), labels = c("yes", "no")))) + task = as_task_classif(be, target = "y") + + learner = lrn("classif.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + rr = resample(task, learner, rsmp("insample")) + expect_true(rr$aggregate(msr("regr.rmse")) < 3) +}) \ No newline at end of file From 9aa26f16dbb8a929d08991e24041b1f50efeecff Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Thu, 17 Apr 2025 08:27:53 +0200 Subject: [PATCH 4/6] ... --- NAMESPACE | 2 ++ R/DataBackendLazyTensors.R | 2 -- R/DataDescriptor.R | 8 +++----- R/utils.R | 1 + man/DataBackendLazyTensors.Rd | 28 +++++++++++++++++++++++----- 5 files changed, 29 insertions(+), 12 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 4228002d1..1ed76e181 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,6 +14,8 @@ S3method(as_lazy_tensor,dataset) S3method(as_lazy_tensor,numeric) S3method(as_lazy_tensor,torch_tensor) S3method(as_lazy_tensors,dataset) +S3method(as_task_classif,dataset) +S3method(as_task_regr,dataset) S3method(as_torch_callback,R6ClassGenerator) S3method(as_torch_callback,TorchCallback) S3method(as_torch_callback,character) diff --git a/R/DataBackendLazyTensors.R b/R/DataBackendLazyTensors.R index d02897323..21697b860 100644 --- a/R/DataBackendLazyTensors.R +++ b/R/DataBackendLazyTensors.R @@ -105,8 +105,6 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", tbl_to_mat = tbl[, cols_to_convert, with = FALSE] tbl_mat = materialize(tbl_to_mat, rbind = TRUE) - if (!length(rows)) browser() - for (nm in cols_to_convert) { converted = private$.converter[[nm]](tbl_mat[[nm]]) tbl[[nm]] = converted diff --git a/R/DataDescriptor.R b/R/DataDescriptor.R index 11410b480..8377bdca4 100644 --- a/R/DataDescriptor.R +++ b/R/DataDescriptor.R @@ -234,12 +234,10 @@ assert_compatible_shapes = function(shapes, dataset) { } }) - if (is.null(dataset$.getbatch)) { - example = map(example, function(x) x$unsqueeze(1)) - } - iwalk(shapes, function(dataset_shape, name) { - if (!is.null(dataset_shape) && !test_equal(shapes[[name]][-1], example[[name]]$shape[-1L])) { + observed_shape = example[[name]]$shape + observed_shape[1] = NA + if (!is.null(dataset_shape) && !test_equal(shapes[[name]], observed_shape)) { expected_shape = example[[name]]$shape expected_shape[1] = NA stopf(paste0("First batch from dataset is incompatible with the provided shape of %s:\n", diff --git a/R/utils.R b/R/utils.R index c6cdbec0d..2680785fc 100644 --- a/R/utils.R +++ b/R/utils.R @@ -304,6 +304,7 @@ infer_shapes = function(shapes_in, param_vals, output_names, fn, rowwise, id) { } get_or_check_dataset_shapes = function(dataset, dataset_shapes) { + dataset_shapes = map(dataset_shapes, as.integer) if (is.null(dataset_shapes)) { if (is.null(dataset$.getbatch)) { stopf("dataset_shapes must be provided if dataset does not have a `.getbatch` method.") diff --git a/man/DataBackendLazyTensors.Rd b/man/DataBackendLazyTensors.Rd index 6ae1817a4..9ac930281 100644 --- a/man/DataBackendLazyTensors.Rd +++ b/man/DataBackendLazyTensors.Rd @@ -2,10 +2,10 @@ % Please edit documentation in R/DataBackendLazyTensors.R \name{DataBackendLazyTensors} \alias{DataBackendLazyTensors} -\title{Data Backend for Lazy Tensors} +\title{Special Backend for Lazy Tensors} \description{ -Special \strong{experimental} data backend that converts \code{\link{lazy_tensor}} columns to their R representation. -However, \code{\link{LearnerTorch}} can directly operate on the lazy tensors. +This backend essentially allows you to use a \code{\link[torch:dataset]{torch::dataset}} directly with +an \code{\link[mlr3:Learner]{mlr3::Learner}}. \itemize{ \item The data cannot contain missing values, as \code{\link{lazy_tensor}}s do not support them. For this reason, calling \verb{$missings()} will always return \code{0} for all columns. @@ -15,9 +15,27 @@ This means, that it might be underreporting the number of distinct values of laz } } \examples{ +\dontshow{if (torch::torch_is_installed()) (if (getRversion() >= "3.4") withAutoprint else force)(\{ # examplesIf} +# used as feature in all backends +x = torch_randn(100, 10) # regression -ds = tensor_dataset(x = torch_randn(100, 10), y = torch_randn(100, 1)) -tbl = as +ds_regr = tensor_dataset(x = x, y = torch_randn(100, 1)) +be_regr = as_data_backend(ds_regr, converter = list(y = as.numeric)) +be_regr$head() + + +# binary classification: underlying target tensor must be float in [0, 1] +ds_binary = tensor_dataset(x = x, y = torch_randint(0, 2, c(100, 1))$float()) +be_binary = as_data_backend(ds_binary, converter = list( + y = function(x) factor(as.integer(x), levels = c(0, 1), labels = c("A", "yes")) +)) +be_binary$head() + +# multi-class classification: underlying target tensor must be integer in [1, K] +ds_multiclass = tensor_dataset(x = x, y = torch_randint(1, 4, size = c(100, 1))) +be_multiclass = as_data_backend(ds_multiclass, converter = list(y = as.numeric)) +be_multiclass$head() +\dontshow{\}) # examplesIf} } \section{Super classes}{ \code{\link[mlr3:DataBackend]{mlr3::DataBackend}} -> \code{\link[mlr3:DataBackendDataTable]{mlr3::DataBackendDataTable}} -> \code{DataBackendLazyTensors} From 06595c78193529eaa418883847c620b6ca40d3b0 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Thu, 17 Apr 2025 11:11:16 +0200 Subject: [PATCH 5/6] fixes --- R/DataBackendLazyTensors.R | 32 +++++++++ R/DataDescriptor.R | 23 ++++--- R/learner_torch_methods.R | 4 ++ R/utils.R | 5 +- tests/testthat/test_DataBackendLazyTensors.R | 68 ++++++++++++++------ 5 files changed, 101 insertions(+), 31 deletions(-) diff --git a/R/DataBackendLazyTensors.R b/R/DataBackendLazyTensors.R index 21697b860..2680634cc 100644 --- a/R/DataBackendLazyTensors.R +++ b/R/DataBackendLazyTensors.R @@ -96,6 +96,12 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", set_names(rep(0L, length(cols)), cols) } ), + active = list( + converter = function(rhs) { + assert_ro_binding(rhs) + private$.converter + } + ), private = list( # call this function only with rows that are not in the cache yet .load_and_cache = function(rows, cols) { @@ -145,4 +151,30 @@ col_info.DataBackendLazyTensors = function(x, ...) { # nolint discrete = setdiff(names(types)[types %chin% c("factor", "ordered")], x$primary_key) levels = insert_named(named_list(names(types)), map(first_row[, discrete, with = FALSE], levels)) data.table(id = names(types), type = unname(types), levels = levels, key = "id") +} + + +# conservative check that avoids that a pseudo-lazy-tensor is preprocessed by some pipeop +# @param be +# the backend +# @param candidates +# the feature and target names +# @param visited +# Union of all colnames already visited +# @return visited +check_lazy_tensors_backend = function(be, candidates, visited = character()) { + if (inherits(be, "DataBackendRbind") || inherits(be, "DataBackendCbind")) { + bs = be$.__enclos_env__$private$.data + # first we check b2, then b1, because b2 possibly overshadows some b1 rows/cols + visited = check_lazy_tensors_backend(bs$b2, candidates, visited) + check_lazy_tensors_backend(bs$b1, candidates, visited) + } else { + if (inherits(be, "DataBackendLazyTensors")) { + if (any(names(be$converter) %in% visited)) { + converter_cols = names(be$converter)[names(be$converter) %in% visited] + stopf("A converter column ('%s') from a DataBackendLazyTensors was presumably preprocessed by some PipeOp. This can cause inefficiencies and is therefore not allowed. If you want to preprocess them, please directly encode them as R types.", paste0(converter_cols, collapse = ", ")) # nolint + } + } + union(visited, intersect(candidates, be$colnames)) + } } \ No newline at end of file diff --git a/R/DataDescriptor.R b/R/DataDescriptor.R index 8377bdca4..6a1d65740 100644 --- a/R/DataDescriptor.R +++ b/R/DataDescriptor.R @@ -217,13 +217,14 @@ infer_shapes_from_getbatch = function(ds) { } assert_compatible_shapes = function(shapes, dataset) { - assert_shapes(shapes, null_ok = TRUE, unknown_batch = TRUE, named = TRUE) + shapes = assert_shapes(shapes, null_ok = TRUE, unknown_batch = TRUE, named = TRUE, coerce = TRUE) # prevent user from e.g. forgetting to wrap the return in a list - example = if (is.null(dataset$.getbatch)) { - dataset$.getitem(1L) - } else { + has_getbatch = !is.null(dataset$.getbatch) + example = if (has_getbatch) { dataset$.getbatch(1L) + } else { + dataset$.getitem(1L) } if (!test_list(example, names = "unique") || !test_permutation(names(example), names(shapes))) { stopf("Dataset must return a list with named elements that are a permutation of the dataset_shapes names.") @@ -236,13 +237,15 @@ assert_compatible_shapes = function(shapes, dataset) { iwalk(shapes, function(dataset_shape, name) { observed_shape = example[[name]]$shape - observed_shape[1] = NA - if (!is.null(dataset_shape) && !test_equal(shapes[[name]], observed_shape)) { - expected_shape = example[[name]]$shape - expected_shape[1] = NA + if (has_getbatch) { + observed_shape[1L] = NA_integer_ + } else { + observed_shape = c(NA_integer_, observed_shape) + } + if (!is.null(dataset_shape) && !test_equal(observed_shape, dataset_shape)) { stopf(paste0("First batch from dataset is incompatible with the provided shape of %s:\n", - "* Provided shape: %s.\n* Expected shape: %s."), name, - shape_to_str(unname(shapes[name])), shape_to_str(list(expected_shape))) + "* Provided shape: %s.\n* Observed shape: %s."), name, + shape_to_str(unname(shapes[name])), shape_to_str(list(observed_shape))) } }) } diff --git a/R/learner_torch_methods.R b/R/learner_torch_methods.R index 79cebaa4a..7259bf587 100644 --- a/R/learner_torch_methods.R +++ b/R/learner_torch_methods.R @@ -18,8 +18,10 @@ learner_torch_predict = function(self, private, super, task, param_vals) { private$.encode_prediction(predict_tensor = predict_tensor, task = task) } + learner_torch_train = function(self, private, super, task, param_vals) { # Here, all param_vals (like seed = "random" or device = "auto") have already been resolved + check_lazy_tensors_backend(task$backend, c(task$feature_names, task$target_names)) dataset_train = private$.dataset(task, param_vals) dataset_train = as_multi_tensor_dataset(dataset_train, param_vals) loader_train = private$.dataloader(dataset_train, param_vals) @@ -356,3 +358,5 @@ as_multi_tensor_dataset = function(dataset, param_vals) { dataset } } + + diff --git a/R/utils.R b/R/utils.R index 2680785fc..bcb17af66 100644 --- a/R/utils.R +++ b/R/utils.R @@ -191,9 +191,9 @@ list_to_batch = function(tensors) { auto_cache_lazy_tensors = function(lts) { if (length(lts) <= 1L) { - FALSE + return(FALSE) } - anyDuplicated(unlist(map_if(lts, function(x) length(x) > 0, function(x) dd(x)$dataset_hash))) + anyDuplicated(unlist(map_if(lts, function(x) length(x) > 0, function(x) dd(x)$dataset_hash))) > 0L } #' Replace the head of a network @@ -304,7 +304,6 @@ infer_shapes = function(shapes_in, param_vals, output_names, fn, rowwise, id) { } get_or_check_dataset_shapes = function(dataset, dataset_shapes) { - dataset_shapes = map(dataset_shapes, as.integer) if (is.null(dataset_shapes)) { if (is.null(dataset$.getbatch)) { stopf("dataset_shapes must be provided if dataset does not have a `.getbatch` method.") diff --git a/tests/testthat/test_DataBackendLazyTensors.R b/tests/testthat/test_DataBackendLazyTensors.R index c248c9b67..d1272ed5a 100644 --- a/tests/testthat/test_DataBackendLazyTensors.R +++ b/tests/testthat/test_DataBackendLazyTensors.R @@ -53,7 +53,7 @@ test_that("main API works", { # head tbl = be$head(n = 3) - expect_data_table(tbl, nrow = 3, ncol = 3) + expect_data_table(tbl, nrows = 3, ncols = 3) expect_class(tbl$x, "lazy_tensor") expect_equal(materialize(tbl$x, rbind = TRUE), torch_tensor(matrix(100:98, nrow = 3, ncol = 1))) expect_class(tbl$y, "numeric") @@ -136,8 +136,12 @@ test_that("caching works", { observed_n = ds$counter - counter_prev expect_equal(observed_n, n) - expect_equal(materialize(tbl$x, rbind = TRUE), ds$x[rows]) - expect_equal(tbl$y, as.integer(ds$y[rows])) + if ("x" %in% cols) { + expect_equal(materialize(tbl$x, rbind = TRUE), ds$x[rows]) + } + if ("y" %in% cols) { + expect_equal(tbl$y, as.integer(ds$y[rows])) + } } check(be, ds, 1, c("x", "y"), 1) # y is no in the cache, so .getitem() is not called on $data() @@ -145,6 +149,8 @@ test_that("caching works", { # but x is not cached, so we still need to call .getitem below check(be, ds, 1, c("x", "y"), 1) + # lazy tensor causes no materialization + check(be, ds, 1, "x", 0) # more than one row also works check(be, ds, 2:1, "y", 1) @@ -172,46 +178,72 @@ test_that("caching works", { }) test_that("can train a regression learner", { + x = torch_randn(100, 1) + y = x + torch_randn(100, 1) ds = tensor_dataset( - x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), - y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + x = x, + y = y ) be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), converter = list(y = as.numeric)) task = as_task_regr(be, target = "y") - learner = lrn("regr.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + learner = lrn("regr.mlp", epochs = 10, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) rr = resample(task, learner, rsmp("insample")) - expect_true(rr$aggregate(msr("regr.rmse")) < 3) + expect_true(rr$aggregate(msr("regr.rmse")) < 1.5) }) test_that("can train a binary classification learner", { ds = tensor_dataset( x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), - y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + y = torch_tensor(rep(0:1, each = 50))$float()$unsqueeze(2L) ) be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), - converter = list(y = as.numeric)) - task = as_task_regr(be, target = "y") + converter = list(y = function(x) factor(as.integer(x), levels = c(1, 0), labels = c("yes", "no")))) + task = as_task_classif(be, target = "y") - learner = lrn("regr.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + learner = lrn("classif.mlp", epochs = 10, batch_size = 100, jit_trace = TRUE, opt.lr = 10, seed = 1) rr = resample(task, learner, rsmp("insample")) - expect_true(rr$aggregate(msr("regr.rmse")) < 3) + expect_true(rr$aggregate(msr("classif.ce")) < 0.1) }) test_that("can train a multiclass classification learner", { ds = tensor_dataset( x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), - y = torch_tensor(matrix(rep(c(0, 1), each = 50), nrow = 100, ncol = 1))$float() + y = torch_tensor(rep(1:4, each = 25)) ) - be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), - converter = list(y = function(x) factor(as.integer(x), levels = c(0, 1), labels = c("yes", "no")))) + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = NA), + converter = list(y = function(x) factor(as.integer(x), levels = 1:4, labels = c("a", "b", "c", "d")))) task = as_task_classif(be, target = "y") - learner = lrn("classif.mlp", epochs = 200, batch_size = 100, jit_trace = TRUE, opt.lr = 1, seed = 1) + learner = lrn("classif.mlp", epochs = 10, batch_size = 100, jit_trace = TRUE, opt.lr = 0.2, seed = 1, + neurons = 100) rr = resample(task, learner, rsmp("insample")) - expect_true(rr$aggregate(msr("regr.rmse")) < 3) -}) \ No newline at end of file + # just ensures that we lear something + expect_true(rr$aggregate(msr("classif.ce")) < 0.6) +}) + +test_that("check_lazy_tensors_backend works", { + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + ) + + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.numeric)) + task_orig = as_task_regr(be, target = "y") + + expect_error(check_lazy_tensors_backend(task_orig$backend, c("x", "y")), + regexp = NA) + + task1 = task_orig$clone(deep = TRUE)$cbind(data.table(y = 1:100)) + expect_error(check_lazy_tensors_backend(task1$backend, c("x", "y")), + regexp = "A converter column ('y')", fixed = TRUE) + + task2 = task_orig$clone(deep = TRUE)$rbind(data.table(x = as_lazy_tensor(1), y = 2, row_id = 999)) + expect_error(check_lazy_tensors_backend(task2$backend, c("x", "y")), + regexp = "A converter column ('y')", fixed = TRUE) +}) From 61dc13b883829e0d23dc4f9f23e127938cd9f5c5 Mon Sep 17 00:00:00 2001 From: Sebastian Fischer Date: Thu, 24 Apr 2025 06:30:41 +0200 Subject: [PATCH 6/6] ... --- R/DataBackendLazyTensors.R | 93 ++++++++++++++--- R/materialize.R | 4 +- TODO.md | 3 +- man/DataBackendLazyTensors.Rd | 3 +- tests/testthat/test_DataBackendLazyTensors.R | 104 +++++++++++++++++-- 5 files changed, 181 insertions(+), 26 deletions(-) diff --git a/R/DataBackendLazyTensors.R b/R/DataBackendLazyTensors.R index 2680634cc..2328a86b4 100644 --- a/R/DataBackendLazyTensors.R +++ b/R/DataBackendLazyTensors.R @@ -36,6 +36,7 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", cloneable = FALSE, inherit = DataBackendDataTable, public = list( + chunk_size = NULL, #' @description #' Create a new instance of this [R6][R6::R6Class] class. #' @param data (`data.table`)\cr @@ -48,10 +49,12 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", #' @param cache (`character()`)\cr #' Names of the columns that should be cached. #' Per default, all columns that are converted are cached. - initialize = function(data, primary_key, converter, cache = names(converter)) { + initialize = function(data, primary_key, converter, cache = names(converter), chunk_size = 100) { private$.converter = assert_list(converter, types = "function", any.missing = FALSE) assert_subset(names(converter), colnames(data)) + assert_subset(cache, names(converter), empty.ok = TRUE) private$.cached_cols = assert_subset(cache, names(converter)) + self$chunk_size = assert_int(chunk_size, lower = 1L) walk(names(private$.converter), function(nm) { if (!inherits(data[[nm]], "lazy_tensor")) { stopf("Column '%s' is not a lazy tensor.", nm) @@ -69,18 +72,25 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", # no caching, no materialization as this is called in the training loop return(super$data(rows, cols)) } - if (all(cols %in% names(private$.data_cache))) { - cache_hit = private$.data_cache[list(rows), cols, on = self$primary_key, with = FALSE] + if (all(intersect(cols, private$.cached_cols) %in% names(private$.data_cache))) { + expensive_cols = intersect(cols, private$.cached_cols) + other_cols = setdiff(cols, expensive_cols) + cache_hit = private$.data_cache[list(rows), expensive_cols, on = self$primary_key, with = FALSE] complete = complete.cases(cache_hit) cache_hit = cache_hit[complete] if (nrow(cache_hit) == length(rows)) { - return(cache_hit) + tbl = cbind(cache_hit, super$data(rows, other_cols)) + setcolorder(tbl, cols) + return(tbl) } - combined = rbindlist(list(cache_hit, private$.load_and_cache(rows[!complete], cols))) + combined = rbindlist(list(cache_hit, private$.load_and_cache(rows[!complete], expensive_cols))) reorder = vector("integer", nrow(combined)) reorder[complete] = seq_len(nrow(cache_hit)) reorder[!complete] = nrow(cache_hit) + seq_len(nrow(combined) - nrow(cache_hit)) - return(combined[reorder]) + + tbl = cbind(combined[reorder], super$data(rows, other_cols)) + setcolorder(tbl, cols) + return(tbl) } private$.load_and_cache(rows, cols) @@ -109,7 +119,17 @@ DataBackendLazyTensors = R6Class("DataBackendLazyTensors", tbl = super$data(rows, cols) cols_to_convert = intersect(names(private$.converter), names(tbl)) tbl_to_mat = tbl[, cols_to_convert, with = FALSE] - tbl_mat = materialize(tbl_to_mat, rbind = TRUE) + # chunk the rows of tbl_to_mat into chunks of size self$chunk_size, apply materialize + n = nrow(tbl_to_mat) + chunks = split(seq_len(n), rep(seq_len(ceiling(n / self$chunk_size)), each = self$chunk_size, length.out = n)) + + tbl_mat = if (n == 0) { + set_names(list(torch_empty(0)), names(tbl_to_mat)) + } else { + set_names(lapply(transpose_list(lapply(chunks, function(chunk) { + materialize(tbl_to_mat[chunk, ], rbind = TRUE) + })), torch_cat, dim = 1L), names(tbl_to_mat)) + } for (nm in cols_to_convert) { converted = private$.converter[[nm]](tbl_mat[[nm]]) @@ -135,13 +155,62 @@ as_data_backend.dataset = function(x, dataset_shapes, ...) { } #' @export -as_task_classif.dataset = function(x, dataset_shapes, target, ...) { - # TODO +as_task_classif.dataset = function(x, target, levels, converter = NULL, dataset_shapes = NULL, chunk_size = 100, cache = names(converter), ...) { + if (length(x) < 2) { + stopf("Dataset must have at least 2 rows.") + } + batch = dataloader(x, batch_size = 2)$.iter()$.next() + if (is.null(converter)) { + if (length(levels) == 2) { + if (batch[[target]]$dtype != torch_float()) { + stopf("Target must be a float tensor, but has dtype %s", batch[[target]]$dtype) + } + if (test_equal(batch[[target]]$shape, c(2L, 1L))) { + converter = set_names(list(crate(function(x) factor(as.integer(x), levels = 0:1, labels = levels), levels)), target) + } else { + stopf("Target must be a float tensor of shape (batch_size, 1), but has shape (batch_size, %s)", + paste(batch[[target]]$shape[-1L], collapse = ", ")) + } + converter = set_names(list(crate(function(x) factor(as.integer(x), levels = 0:1, labels = levels), levels)), target) + } else { + if (batch[[target]]$dtype != torch_int()) { + stopf("Target must be an integer tensor, but has dtype %s", batch[[target]]$dtype) + } + if (test_equal(batch[[target]]$shape, 2L)) { + converter = set_names(list(crate(function(x) factor(as.integer(x), labels = levels), levels)), target) + } else { + stopf("Target must be an integer tensor of shape (batch_size), but has shape (batch_size, %s)", + paste(batch[[target]]$shape[-1L], collapse = ", ")) + } + converter = set_names(list(crate(function(x) factor(as.integer(x), labels = levels), levels)), target) + } + } + be = as_data_backend(x, dataset_shapes, converter = converter, cache = cache, chunk_size = chunk_size) + as_task_classif(be, target = target, ...) } #' @export -as_task_regr.dataset = function(x, dataset_shapes, target, converter, ...) { - # TODO +as_task_regr.dataset = function(x, target, converter = NULL, dataset_shapes = NULL, chunk_size = 100, cache = names(converter), ...) { + if (length(x) < 2) { + stopf("Dataset must have at least 2 rows.") + } + if (is.null(converter)) { + converter = set_names(list(as.numeric), target) + } + batch = dataloader(x, batch_size = 2)$.iter()$.next() + + if (batch[[target]]$dtype != torch_float()) { + stopf("Target must be a float tensor, but has dtype %s", batch[[target]]$dtype) + } + + if (!test_equal(batch[[target]]$shape, c(2L, 1L))) { + stopf("Target must be a float tensor of shape (batch_size, 1), but has shape (batch_size, %s)", + paste(batch[[target]]$shape[-1L], collapse = ", ")) + } + + dataset_shapes = get_or_check_dataset_shapes(x, dataset_shapes) + be = as_data_backend(x, dataset_shapes, converter = converter, cache = cache, chunk_size = chunk_size) + as_task_regr(be, target = target, ...) } #' @export @@ -177,4 +246,4 @@ check_lazy_tensors_backend = function(be, candidates, visited = character()) { } union(visited, intersect(candidates, be$colnames)) } -} \ No newline at end of file +} diff --git a/R/materialize.R b/R/materialize.R index a2294b47a..ee113830d 100644 --- a/R/materialize.R +++ b/R/materialize.R @@ -106,7 +106,7 @@ materialize.lazy_tensor = function(x, device = "cpu", rbind = FALSE, ...) { # no materialize_internal(x = x, device = device, cache = NULL, rbind = rbind) } -get_input = function(ds, ids, varying_shapes, rbind) { +get_input = function(ds, ids, varying_shapes) { if (is.null(ds$.getbatch)) { # .getindex is never NULL but a function that errs if it was not defined x = map(ids, function(id) map(ds$.getitem(id), function(x) x$unsqueeze(1))) if (varying_shapes) { @@ -201,7 +201,7 @@ materialize_internal = function(x, device = "cpu", cache = NULL, rbind) { } if (!do_caching || !input_hit) { - input = get_input(ds, ids, varying_shapes, rbind) + input = get_input(ds, ids, varying_shapes) } if (do_caching && !input_hit) { diff --git a/TODO.md b/TODO.md index ac96fb010..62d953809 100644 --- a/TODO.md +++ b/TODO.md @@ -21,4 +21,5 @@ ``` * Add checks on usage of `DataBackendLazyTensors` in `task_dataset` * Add optimization that truths values don't have to be loaded twice during resampling, i.e. - once for making the predictions and once for retrieving the truth column. \ No newline at end of file + once for making the predictions and once for retrieving the truth column. +* only allow caching converter columns in `DataBackendLazyTensors` (probably just remove the `cache` parameter) \ No newline at end of file diff --git a/man/DataBackendLazyTensors.Rd b/man/DataBackendLazyTensors.Rd index 9ac930281..42880dec7 100644 --- a/man/DataBackendLazyTensors.Rd +++ b/man/DataBackendLazyTensors.Rd @@ -68,7 +68,8 @@ Create a new instance of this \link[R6:R6Class]{R6} class. data, primary_key, converter, - cache = names(converter) + cache = names(converter), + chunk_size = 100 )}\if{html}{\out{}} } diff --git a/tests/testthat/test_DataBackendLazyTensors.R b/tests/testthat/test_DataBackendLazyTensors.R index d1272ed5a..06fb91574 100644 --- a/tests/testthat/test_DataBackendLazyTensors.R +++ b/tests/testthat/test_DataBackendLazyTensors.R @@ -1,7 +1,3 @@ -test_that("correct input checks", { - -}) - test_that("main API works", { # regression target ds = tensor_dataset( @@ -102,11 +98,71 @@ test_that("classif target works", { }) test_that("errors when weird preprocessing", { - # test following example pipeops: - # - target trafo - # - fix factors - # - smote +}) + +test_that("chunking works ", { + ds = dataset( + initialize = function() { + self$x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)) + self$y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1)) + self$counter = 0 + }, + .getbatch = function(i) { + self$counter = self$counter + 1 + list(x = self$x[i, drop = FALSE], y = self$y[i, drop = FALSE]) + }, + .length = function() { + nrow(self$x) + } + )() + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), chunk_size = 3, + converter = list(y = as.numeric)) + + counter_prev = ds$counter + be$data(1:3, c("x", "y")) + expect_equal(ds$counter, counter_prev + 1) + counter_prev = ds$counter + be$data(4:10, c("x", "y")) + expect_equal(ds$counter, counter_prev + 3) +}) + +test_that("can retrieve 0 rows", { + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1)), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1)) + ) + be = as_data_backend(ds, dataset_shapes = list(x = c(NA, 1), y = c(NA, 1)), + converter = list(y = as.numeric)) + res = be$data(integer(0), c("x", "y", "row_id")) + expect_data_table(res, nrows = 0, ncols = 3) + expect_class(res$x, "lazy_tensor") + expect_class(res$y, "numeric") + expect_equal(res$row_id, integer(0)) +}) + +test_that("task converters work", { + # regression target + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(as.matrix(1:100, nrow = 100, ncol = 1))$float() + ) + task = as_task_regr(ds, target = "y", converter = list(y = as.numeric)) + task$data(integer(0)) + expect_equal(task$head(2)$y, 1:2) + expect_equal(task$feature_names, "x") + expect_equal(task$target_names, "y") + expect_task(task) + + + # binary classification + ds = tensor_dataset( + x = torch_tensor(matrix(100:1, nrow = 100, ncol = 1))$float(), + y = torch_tensor(rep(0:1, times = 50))$float()$unsqueeze(2L) + ) + task = as_task_classif(ds, target = "y", levels = c("yes", "no")) + expect_task(task) + expect_equal(task$head()$y, factor(rep(c("yes", "no"), times = 3), levels = c("yes", "no"))) }) test_that("caching works", { @@ -147,8 +203,8 @@ test_that("caching works", { # y is no in the cache, so .getitem() is not called on $data() check(be, ds, 1, "y", 0) - # but x is not cached, so we still need to call .getitem below - check(be, ds, 1, c("x", "y"), 1) + # everything is in the cache + check(be, ds, 1, c("x", "y"), 0) # lazy tensor causes no materialization check(be, ds, 1, "x", 0) @@ -247,3 +303,31 @@ test_that("check_lazy_tensors_backend works", { expect_error(check_lazy_tensors_backend(task2$backend, c("x", "y")), regexp = "A converter column ('y')", fixed = TRUE) }) + + +test_that("...", { + ds = dataset( + initialize = function(x, y) { + self$x = torch_randn(100, 3) + self$y = torch_randn(100, 1) + self$counter = 0 + }, + .getbatch = function(i) { + print("hallo") + self$counter = self$counter + 1L + list(x = self$x[i, drop = FALSE], y = self$y[i, drop = FALSE]) + }, + .length = function() 100 + )() + +task = as_task_regr(ds, target = "y") + +counter = ds$counter +task$head() +print(ds$counter - counter) +counter = ds$counter +task$head() +expec +print(ds$counter - counter) + +})