diff --git a/DESCRIPTION b/DESCRIPTION index 41c136f..baf7e7a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,7 +8,8 @@ Authors@R: c( person("Genevera", "Allen", role = c("ths", "aut"), email = "gallen@rice.edu"), person("Lewis", "Brian W.", role = "cph", comment = "R/util_hcs.R"), person("Daniel", "Englund", role = "aut", email = "dse1@rice.edu"), - person("Yue", "Zhuo", role = "aut", email = "yz154@rice.edu")) + person("Yue", "Zhuo", role = "aut", email = "yz154@rice.edu"), + person("R Core", role = "cph", comment = "R/scale_complex.R")) Description: Fast computation and interactive for the convex clustering and bi-clustering problems. The CARP and CBASS algorithms use an algorithmic regularization scheme to obtain high-quality global diff --git a/R/carp.R b/R/carp.R index 3ee86db..4e4dbaf 100644 --- a/R/carp.R +++ b/R/carp.R @@ -330,7 +330,7 @@ CARP <- function(X, #' and the variant of \code{CARP} used. #' #' @details The \code{as.dendrogram} and \code{as.hclust} methods convert the -#' \code{CBASS} output to an object of class \code{dendrogram} or \code{hclust} +#' \code{CARP} output to an object of class \code{dendrogram} or \code{hclust} #' respectively. #' #' @param x an object of class \code{CARP} as returned by \code{\link{CARP}} diff --git a/R/scale_complex.R b/R/scale_complex.R new file mode 100644 index 0000000..8063352 --- /dev/null +++ b/R/scale_complex.R @@ -0,0 +1,46 @@ +#' @export +#' @author R Core (\code{base} package) +# +# This function is basically just scale.default, but with is.numeric replaced by +# (is.numeric | is.complex) so that it scales complex matrices correctly and returns +# the appropriate scale factors as attributes. +scale.complex <- function(x, center = TRUE, scale = TRUE){ + is.nc <- function(x) is.numeric(x) | is.complex(x) + + x <- as.matrix(x) + nc <- ncol(x) + if (is.logical(center)) { + if (center) { + center <- colMeans(x, na.rm=TRUE) + x <- sweep(x, 2L, center, check.margin=FALSE) + } + } + else { + if(!is.nc(center)) center <- as.complex(center) + if (length(center) == nc) + x <- sweep(x, 2L, center, check.margin=FALSE) + else + stop("length of 'center' must equal the number of columns of 'x'") + } + if (is.logical(scale)) { + if (scale) { + f <- function(v) { + v <- v[!is.na(v)] + # Use squared modulus in SD calc here rather than just squaring + sqrt(sum(abs(v)^2) / max(1, length(v) - 1L)) + } + scale <- apply(x, 2L, f) + x <- sweep(x, 2L, scale, "/", check.margin=FALSE) + } + } + else { + if(!is.nc(scale)) scale <- as.complex(scale) + if (length(scale) == nc) + x <- sweep(x, 2L, scale, "/", check.margin=FALSE) + else + stop("length of 'scale' must equal the number of columns of 'x'") + } + if(is.nc(center)) attr(x, "scaled:center") <- center + if(is.nc(scale )) attr(x, "scaled:scale" ) <- scale + x +} diff --git a/R/solvers.R b/R/solvers.R index 16911d9..4bd0d83 100644 --- a/R/solvers.R +++ b/R/solvers.R @@ -649,3 +649,286 @@ print.ConvexBiClustering <- function(x, ...) { invisible(x) } + +#' Compute Time Series (Phase Aligned Complex) Convex Clustering Solution Path on a User-Specified Grid +#' +#' \code{ts_convex_clustering} calculates the phase-aligned complex convex clustering solution path +#' at a user-specified grid of lambda values (or just a single value). It is, +#' in general, difficult to know a useful set of lambda values \emph{a priori}, +#' so this function is more useful for timing comparisons and methodological +#' research than applied work. 
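[Editorial sketch, not part of the patch: what the complex-aware scaling added in R/scale_complex.R above computes, assuming the roxygen @export registers scale.complex as an S3 method so that scale() dispatches on complex matrices.]

library(clustRviz)
set.seed(100)
Z  <- matrix(complex(real = rnorm(20), imaginary = rnorm(20)), nrow = 5)
Zs <- scale(Z)   # dispatches to scale.complex() for complex input
colMeans(Zs)     # approximately 0 + 0i in every column after centering
# Columns are scaled by sqrt(sum(Mod(v)^2) / (n - 1)) of the centered values,
# so this is approximately 1 for every column:
apply(Zs, 2, function(v) sqrt(sum(Mod(v)^2) / (length(v) - 1)))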
+#' +#' Compared to the \code{\link{TROUT}} function, the returned object +#' is much more "bare-bones," containing only the estimated \eqn{U} matrices, +#' and no information used for dendrogram or path visualizations. +#' +#' @param X The data matrix (\eqn{X \in C^{n \times p}}{X}): rows correspond to +#' the observations (to be clustered) and columns to the variables (which +#' will not be clustered). +#' @param lambda_grid A user-supplied set of \eqn{\lambda}{lambda} values at which +#' to solve the convex clustering problem. These must be strictly +#' positive values and will be automatically sorted internally. +#' @param X.center A logical: Should \code{X} be centered columnwise? +#' @param X.scale A logical: Should \code{X} be scaled columnwise? +#' @param norm Which norm to use in the fusion penalty? Currently only \code{1} +#' and \code{2} (default) are supported. +#' @param ... Unused arguements. An error will be thrown if any unrecognized +#' arguments as given. All arguments other than \code{X} must be given +#' by name. +#' @param weights One of the following: \itemize{ +#' \item A function which, when called with argument \code{X}, +#' returns an b-by-n matrix of fusion weights. +#' \item A matrix of size n-by-n containing fusion weights +#' } +#' @param status Should a status message be printed to the console? +#' @return An object of class \code{convex_clustering} containing the following elements (among others): +#' \itemize{ +#' \item \code{X}: the original data matrix +#' \item \code{n}: the number of observations (rows of \code{X}) +#' \item \code{p}: the number of variables (columns of \code{X}) +#' \item \code{X.center}: a logical indicating whether \code{X} was centered +#' column-wise before clustering +#' \item \code{X.scale}: a logical indicating whether \code{X} was scaled +#' column-wise before centering +#' \item \code{weight_type}: a record of the scheme used to create +#' fusion weights +#' \item \code{U}: a tensor (3-array) of clustering solutions +#' } +#' @importFrom utils data +#' @importFrom dplyr %>% mutate group_by ungroup as_tibble n_distinct +#' @importFrom rlang %||% +#' @importFrom stats var +#' @export +ts_convex_clustering <- function(X, + ..., + lambda_grid, + weights = sparse_rbf_kernel_weights(k = "auto", + phi = "auto", + dist.method = "trout", + p = 2), + X.center = TRUE, + X.scale = FALSE, + norm = 2, + status = (interactive() && (clustRviz_logger_level() %in% c("MESSAGE", "WARNING", "ERROR")))) { + + tic <- Sys.time() + + #################### + ## + ## Input validation + ## + #################### + + dots <- list(...) + + if (length(dots) != 0L) { + if (!is.null(names(dots))) { + crv_error("Unknown argument ", sQuote(names(dots)[1L]), " passed to ", sQuote("ts_convex_clustering.")) + } else { + crv_error("Unknown ", sQuote("..."), " arguments passed to ", sQuote("ts_convex_clustering.")) + } + } + + if (!is.matrix(X)) { + crv_warning(sQuote("X"), " should be a matrix, not a " , class(X)[1], + ". Converting with as.matrix().") + X <- as.matrix(X) + } + + if (!is.complex(X)) { + crv_error(sQuote("X"), " must be complex.") + } + + # Missing data mask: M_{ij} = 1 means we see X_{ij}; + # Currently we don't support missing values in X so this better be all ones + M <- 1 - is.na(X) + + ## Check that imputation was successful. 
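[Editorial sketch, not part of the patch: a hypothetical call against the interface documented above, assuming the patched package builds and is loaded; the complex data matrix is synthetic and purely for illustration.]

library(clustRviz)
X_complex <- matrix(complex(real = rnorm(50), imaginary = rnorm(50)), nrow = 10)
fit <- ts_convex_clustering(X_complex,
                            lambda_grid = 10^seq(-2, 2, length.out = 25))
dim(fit$U)                          # n x p x (number of grid values actually solved)
U_last <- fit$U[, , dim(fit$U)[3]]  # estimated centroids at the largest lambda value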
+ if (anyNA(X)) { + crv_error("ts_convex_clustering() does not support missing values in ", sQuote("X.")) + } + + if (!all(is.finite(X))) { + crv_error("All elements of ", sQuote("X"), " must be finite.") + } + + if (!is_logical_scalar(X.center)) { + crv_error(sQuote("X.center"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (!is_logical_scalar(X.scale)) { + crv_error(sQuote("X.scale"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (norm %not.in% c(1, 2)){ + crv_error(sQuote("norm"), " must be either 1 or 2.") + } + + if(missing(lambda_grid)){ + crv_error(sQuote("lambda_grid"), " must be supplied.") + } + + if(!all(is.finite(lambda_grid))){ + crv_error(sQuote("lambda_grid"), " containts non-finite values.") + } + + if(any(lambda_grid < 0)){ + crv_error(sQuote("lambda_grid"), " must be strictly positive.") + } + + if(any(lambda_grid == 0)){ + crv_error(sQuote("lambda_grid"), " must be strictly positive - 0 will be automatically added.") + } + + if(is.unsorted(lambda_grid)){ + crv_warning(sQuote("lambda_grid"), " is unsorted: sorting for maximum performance.") + lambda_grid <- sort(lambda_grid) + } + + l1 <- (norm == 1) + + n <- NROW(X) + p <- NCOL(X) + + X.orig <- X + # Center and scale X + if (X.center | X.scale) { + X <- scale(X, center = X.center, scale = X.scale) + } + + scale_vector <- attr(X, "scaled:scale", exact=TRUE) %||% rep(1, p) + center_vector <- attr(X, "scaled:center", exact=TRUE) %||% rep(0, p) + + crv_message("Pre-computing weights and edge sets") + + # Calculate clustering weights + if (is.function(weights)) { # Usual case, `weights` is a function which calculates the weight matrix + weight_result <- weights(X) + + if (is.matrix(weight_result)) { + weight_matrix <- weight_result + weight_type <- UserFunction() + } else { + weight_matrix <- weight_result$weight_mat + weight_type <- weight_result$type + } + } else if (is.matrix(weights)) { + + if (!is_square(weights)) { + crv_error(sQuote("weights"), " must be a square matrix.") + } + + if (NROW(weights) != NROW(X)) { + crv_error(sQuote("NROW(weights)"), " must be equal to ", sQuote("NROW(X).")) + } + + weight_matrix <- weights + weight_type <- UserMatrix() + } else { + crv_error(sQuote("ts_convex_clustering"), " does not know how to handle ", sQuote("weights"), + " of class ", class(weights)[1], ".") + } + + if (any(weight_matrix < 0) || anyNA(weight_matrix)) { + crv_error("All fusion weights must be positive or zero.") + } + + weight_matrix_ut <- weight_matrix * upper.tri(weight_matrix); + + edge_list <- which(weight_matrix_ut != 0, arr.ind = TRUE) + edge_list <- edge_list[order(edge_list[, 1], edge_list[, 2]), ] + cardE <- NROW(edge_list) + D <- matrix(0, ncol = n, nrow = cardE) + D[cbind(seq_len(cardE), edge_list[,1])] <- 1 + D[cbind(seq_len(cardE), edge_list[,2])] <- -1 + + weight_vec <- weight_mat_to_vec(weight_matrix) + + crv_message("Computing Time-Series Convex Clustering Solutions") + tic_inner <- Sys.time() + + clustering_sol <- TroutClusteringCPP(X = X, + M = M, + D = D, + lambda_grid = lambda_grid, + weights = weight_vec[weight_vec != 0], + rho = .clustRvizOptionsEnv[["rho"]], + thresh = .clustRvizOptionsEnv[["stopping_threshold"]], + max_iter = .clustRvizOptionsEnv[["max_iter"]], + max_inner_iter = .clustRvizOptionsEnv[["max_inner_iter"]], + l1 = l1, + show_progress = status) + + toc_inner <- Sys.time() + + crv_message("Post-processing") + + lambda_grid <- clustering_sol$gamma_path + U_raw <- array(clustering_sol$u_path, + dim = c(n, p, length(lambda_grid)), + 
dimnames = list(rownames(X.orig), + colnames(X.orig), + paste0("Lambda_", seq_along(lambda_grid) - 1))) + + center_tensor <- aperm(array(center_vector, dim = c(p, n, length(lambda_grid))), c(2, 1, 3)) + scale_tensor <- aperm(array(scale_vector, dim = c(p, n, length(lambda_grid))), c(2, 1, 3)) + + U <- U_raw * scale_tensor + center_tensor + + trout_clustering_fit <- list( + X = X.orig, + M = M, + D = D, + U = U, + n = n, + p = p, + norm = norm, + lambda_grid = lambda_grid, + weights = weight_matrix, + weight_type = weight_type, + X.center = X.center, + center_vector = center_vector, + X.scale = X.scale, + scale_vector = scale_vector, + time = Sys.time() - tic, + fit_time = toc_inner - tic_inner + ) + + class(trout_clustering_fit) <- "TimeSeriesConvexClustering" + + return(trout_clustering_fit) +} + +#' Print \code{\link{ts_convex_clustering}} Results +#' +#' Prints a brief descripton of a fitted \code{ts_convex_clustering} object. +#' +#' Reports number of observations and variables of dataset, any preprocessing +#' done by the \code{\link{ts_convex_clustering}} function, and regularization +#' details. +#' +#' @param x an object of class \code{ts_convex_clustering} as returned by +#' \code{\link{ts_convex_clustering}} +#' @param ... Additional unused arguments +#' @export +print.TimeSeriesConvexClustering <- function(x, ...) { + alg_string <- paste("ADMM", if(x$norm == 1) "[L1]" else "[L2]") + + cat("Time Series [Phase-Aligned Convex] Convex Clustering Fit Summary\n") + cat("================================================================\n\n") + cat("Algorithm:", alg_string, "\n") + cat("Grid:", length(x$lambda_grid), "values of lambda. \n") + cat("Fit Time:", sprintf("%2.3f %s", x$fit_time, attr(x$fit_time, "units")), "\n") + cat("Total Time:", sprintf("%2.3f %s", x$time, attr(x$time, "units")), "\n\n") + cat("Number of Observations:", x$n, "\n") + cat("Number of Variables: ", x$p, "\n\n") + + cat("Pre-processing options:\n") + cat(" - Columnwise centering:", x$X.center, "\n") + cat(" - Columnwise scaling: ", x$X.scale, "\n\n") + + cat("Weights:\n") + print(x$weight_type) + + invisible(x) +} diff --git a/R/trout.R b/R/trout.R new file mode 100644 index 0000000..c06adf9 --- /dev/null +++ b/R/trout.R @@ -0,0 +1,382 @@ +#' @rdname TROUT +#' @export +TROUT <- function(X, ...){ + UseMethod("TROUT") +} + +#' @rdname TROUT +TROUT.mts <- function(X, ...){ + .NotYetImplemented() +} + +#' Compute \code{TROUT} (Time Series Convex Clustering) Solution Path +#' +#' \code{TROUT} returns a fast approximation to the Time Series Convex Clustering +#' solution path along with visualizations such as dendrograms and +#' cluster paths. \code{TROUT} solves the Time Series Convex Clustering problem via an efficient +#' Algorithmic Regularization scheme. +#' +#' @param X The data matrix (\eqn{X \in C^{n \times p}}{X}): rows correspond to +#' the observations (to be clustered) and columns to the variables (which +#' will not be clustered). Currently missing data is not supported. +#' @param labels A character vector of length \eqn{n}: observations (row) labels +#' @param X.center A logical: Should \code{X} be centered columnwise? +#' @param X.scale A logical: Should \code{X} be scaled columnwise? +#' @param back_track A logical: Should back-tracking be used to exactly identify fusions? +#' By default, back-tracking is not used. +#' @param exact A logical: Should the exact solution be computed using an iterative algorithm? 
+#' By default, algorithmic regularization is applied and the exact solution +#' is not computed. Setting \code{exact = TRUE} often significantly increases +#' computation time. +#' @param norm Which norm to use in the fusion penalty? Currently only \code{1} +#' and \code{2} (default) are supported. +#' @param t A number greater than 1: the size of the multiplicative update to +#' the cluster fusion regularization parameter (not used by +#' back-tracking variants). Typically on the scale of \code{1.005} to \code{1.1}. +#' @param npcs An integer >= 2. The number of principal components to compute +#' for path visualization. +#' @param dendrogram.scale A character string denoting how the scale of dendrogram +#' regularization proportions should be visualized. +#' Choices are \code{'original'} or \code{'log'}; if not +#' provided, a data-driven heuristic choice is used. +#' @param ... Unused arguements. An error will be thrown if any unrecognized +#' arguments as given. All arguments other than \code{X} must be given +#' by name. +#' @param weights One of the following: \itemize{ +#' \item A function which, when called with argument \code{X}, +#' returns an b-by-n matrix of fusion weights. +#' \item A matrix of size n-by-n containing fusion weights +#' } +#' @param status Should a status message be printed to the console? +#' @return An object of class \code{TROUT} containing the following elements (among others): +#' \itemize{ +#' \item \code{X}: the original data matrix +#' \item \code{n}: the number of observations (rows of \code{X}) +#' \item \code{p}: the number of variables (columns of \code{X}) +#' \item \code{alg.type}: the \code{TROUT} variant used +#' \item \code{X.center}: a logical indicating whether \code{X} was centered +#' column-wise before clustering +#' \item \code{X.scale}: a logical indicating whether \code{X} was scaled +#' column-wise before centering +#' \item \code{weight_type}: a record of the scheme used to create +#' fusion weights +#' } +#' @importFrom utils data +#' @importFrom dplyr %>% mutate group_by ungroup as_tibble n_distinct +#' @importFrom rlang %||% +#' @importFrom stats var +#' @rdname TROUT +#' @export +TROUT.default <- function(X, + ..., + weights = sparse_rbf_kernel_weights(k = "auto", + phi = "auto", + dist.method = "trout", + p = 2), + labels = rownames(X), + X.center = TRUE, + X.scale = FALSE, + back_track = FALSE, + exact = FALSE, + norm = 2, + t = 1.05, + npcs = min(4L, NCOL(X), NROW(X)), + dendrogram.scale = NULL, + status = (interactive() && (clustRviz_logger_level() %in% c("MESSAGE", "WARNING", "ERROR")))) { + + tic <- Sys.time() + + #################### + ## + ## Input validation + ## + #################### + + dots <- list(...) + + if (length(dots) != 0L) { + if (!is.null(names(dots))) { + crv_error("Unknown argument ", sQuote(names(dots)[1L]), " passed to ", sQuote("TROUT.")) + } else { + crv_error("Unknown ", sQuote("..."), " arguments passed to ", sQuote("TROUT.")) + } + } + + if (!is.matrix(X)) { + crv_warning(sQuote("X"), " should be a matrix, not a " , class(X)[1], + ". Converting with as.matrix().") + X <- as.matrix(X) + } + + if (!is.complex(X)) { + crv_error(sQuote("X"), " must be complex.") + } + + # Missing data mask: M_{ij} = 1 means we see X_{ij}; + # Currently we don't support missing values in X so this better be all ones + M <- 1 - is.na(X) + + ## Check that imputation was successful. 
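[Editorial sketch, not part of the patch: a hypothetical TROUT() call against the interface documented above, on synthetic complex data used purely for illustration.]

library(clustRviz)
X_complex <- matrix(complex(real = rnorm(80), imaginary = rnorm(80)), nrow = 16)
rownames(X_complex) <- paste0("series", 1:16)
fit <- TROUT(X_complex, X.center = TRUE, norm = 2, t = 1.05)
print(fit)                  # algorithm variant, timings, and preprocessing summary
dend <- as.dendrogram(fit)  # dendrogram recovered from the fusion path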
+ if (anyNA(X)) { + crv_error("TROUT() does not support missing values in ", sQuote("X.")) + } + + if (!all(is.finite(X))) { + crv_error("All elements of ", sQuote("X"), " must be finite.") + } + + if (!is_logical_scalar(X.center)) { + crv_error(sQuote("X.center"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (!is_logical_scalar(X.scale)) { + crv_error(sQuote("X.scale"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (!is_logical_scalar(back_track)) { + crv_error(sQuote("back_track"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (!is_logical_scalar(exact)) { + crv_error(sQuote("exact"), "must be either ", sQuote("TRUE"), " or ", sQuote("FALSE.")) + } + + if (norm %not.in% c(1, 2)){ + crv_error(sQuote("norm"), " must be either 1 or 2.") + } + + l1 <- (norm == 1) + + if ( (!is_numeric_scalar(t)) || (t <= 1) ) { + crv_error(sQuote("t"), " must be a scalar greater than 1.") + } + + if (!is.null(dendrogram.scale)) { + if (dendrogram.scale %not.in% c("original", "log")) { + crv_error("If not NULL, ", sQuote("dendrogram.scale"), " must be either ", sQuote("original"), " or ", sQuote("log.")) + } + } + + if ( (!is_integer_scalar(npcs)) || (npcs < 2) || (npcs > NCOL(X)) || (npcs > NROW(X)) ){ + crv_error(sQuote("npcs"), " must be an integer scalar between 2 and ", sQuote("min(dim(X)).")) + } + + ## Get row (observation) labels + if (is.null(labels)) { + labels <- paste0("Obs", seq_len(NROW(X))) + } + + if ( length(labels) != NROW(X) ){ + crv_error(sQuote("labels"), " must be of length ", sQuote("NROW(X).")) + } + + X.orig <- X + rownames(X.orig) <- rownames(X) <- labels <- make.unique(as.character(labels), sep="_") + + n <- NROW(X) + p <- NCOL(X) + + # Center and scale X + if (X.center | X.scale) { + X <- scale(X, center = X.center, scale = X.scale) + } + + scale_vector <- attr(X, "scaled:scale", exact=TRUE) %||% rep(1, p) + center_vector <- attr(X, "scaled:center", exact=TRUE) %||% rep(0, p) + + crv_message("Pre-computing weights and edge sets") + + # Calculate clustering weights + if (is.function(weights)) { # Usual case, `weights` is a function which calculates the weight matrix + weight_result <- weights(X) + + if (is.matrix(weight_result)) { + weight_matrix <- weight_result + weight_type <- UserFunction() + } else { + weight_matrix <- weight_result$weight_mat + weight_type <- weight_result$type + } + } else if (is.matrix(weights)) { + + if (!is_square(weights)) { + crv_error(sQuote("weights"), " must be a square matrix.") + } + + if (NROW(weights) != NROW(X)) { + crv_error(sQuote("NROW(weights)"), " must be equal to ", sQuote("NROW(X).")) + } + + weight_matrix <- weights + weight_type <- UserMatrix() + } else { + crv_error(sQuote("TROUT"), " does not know how to handle ", sQuote("weights"), + " of class ", class(weights)[1], ".") + } + + if (any(weight_matrix < 0) || anyNA(weight_matrix)) { + crv_error("All fusion weights must be positive or zero.") + } + + if (!is_connected_adj_mat(weight_matrix != 0)) { + crv_error("Weights do not imply a connected graph. 
Clustering will not succeed.") + } + + weight_matrix_ut <- weight_matrix * upper.tri(weight_matrix); + + edge_list <- which(weight_matrix_ut != 0, arr.ind = TRUE) + edge_list <- edge_list[order(edge_list[, 1], edge_list[, 2]), ] + cardE <- NROW(edge_list) + D <- matrix(0, ncol = n, nrow = cardE) + D[cbind(seq_len(cardE), edge_list[,1])] <- 1 + D[cbind(seq_len(cardE), edge_list[,2])] <- -1 + + weight_vec <- weight_mat_to_vec(weight_matrix) + + crv_message("Computing Time Series Convex Clustering [TROUT] Path") + tic_inner <- Sys.time() + + trout.sol.path <- TROUTcpp(X = X, + M = M, + D = D, + t = t, + epsilon = .clustRvizOptionsEnv[["epsilon"]], + weights = weight_vec[weight_vec != 0], + rho = .clustRvizOptionsEnv[["rho"]], + thresh = .clustRvizOptionsEnv[["stopping_threshold"]], + max_iter = .clustRvizOptionsEnv[["max_iter"]], + max_inner_iter = .clustRvizOptionsEnv[["max_inner_iter"]], + burn_in = .clustRvizOptionsEnv[["burn_in"]], + viz_max_inner_iter = .clustRvizOptionsEnv[["viz_max_inner_iter"]], + viz_initial_step = .clustRvizOptionsEnv[["viz_initial_step"]], + viz_small_step = .clustRvizOptionsEnv[["viz_small_step"]], + keep = .clustRvizOptionsEnv[["keep"]], + l1 = l1, + show_progress = status, + back_track = back_track, + exact = exact) + + toc_inner <- Sys.time() + + ## FIXME - Convert gamma.path to a single column matrix instead of a vector + ## RcppArmadillo returns a arma::vec as a n-by-1 matrix + ## RcppEigen returns an Eigen::VectorXd as a n-length vector + ## Something downstream cares about the difference, so just change + ## the type here for now + trout.sol.path$gamma_path <- matrix(trout.sol.path$gamma_path, ncol=1) + + crv_message("Post-processing") + + post_processing_results <- ConvexClusteringPostProcess(X = X, + edge_matrix = edge_list, + gamma_path = trout.sol.path$gamma_path, + u_path = trout.sol.path$u_path, + v_path = trout.sol.path$v_path, + v_zero_indices = trout.sol.path$v_zero_inds, + labels = labels, + dendrogram_scale = dendrogram.scale, + npcs = npcs, + smooth_U = TRUE) + + trout.fit <- list( + X = X.orig, + M = M, + D = D, + U = post_processing_results$U, + dendrogram = post_processing_results$dendrogram, + rotation_matrix = post_processing_results$rotation_matrix, + cluster_membership = post_processing_results$membership_info, + n = n, + p = p, + weights = weight_matrix, + weight_type = weight_type, + back_track = back_track, + exact = exact, + norm = norm, + t = t, + X.center = X.center, + center_vector = center_vector, + X.scale = X.scale, + scale_vector = scale_vector, + time = Sys.time() - tic, + fit_time = toc_inner - tic_inner + ) + + if (.clustRvizOptionsEnv[["keep_debug_info"]]) { + trout.fit[["debug"]] <- list(path = trout.sol.path, + row = post_processing_results$debug) + } + + class(trout.fit) <- "TROUT" + + return(trout.fit) +} + +#' Print \code{TROUT} Results +#' +#' Prints a brief descripton of a fitted \code{TROUT} object. +#' +#' Reports number of observations and variables of dataset, any preprocessing +#' done by the \code{\link{TROUT}} function, regularization weight information, +#' and the variant of \code{TROUT} used. +#' +#' @details The \code{as.dendrogram} and \code{as.hclust} methods convert the +#' \code{CBASS} output to an object of class \code{dendrogram} or \code{hclust} +#' respectively. +#' +#' @param x an object of class \code{TROUT} as returned by \code{\link{TROUT}} +#' @param object an object of class \code{TROUT} as returned by \code{\link{TROUT}} +#' @param ... 
Additional unused arguments +#' @export +#' @rdname print_trout +print.TROUT <- function(x, ...) { + if(x$exact){ + if(x$back_track){ + alg_string = "ADMM-VIZ [Exact Solver + Back-Tracking Fusion Search]" + } else { + alg_string = paste0("ADMM (t = ", round(x$t, 3), ") [Exact Solver]") + } + } else { + if(x$back_track){ + alg_string = "TROUT-VIZ [Back-Tracking Fusion Search]" + } else { + alg_string = paste0("TROUT (t = ", round(x$t, 3), ")") + } + } + + if(x$norm == 1){ + alg_string <- paste(alg_string, "[L1]") + } + + cat("Time Series Convex Clustering Fit Summary\n") + cat("=========================================\n\n") + cat("Algorithm:", alg_string, "\n") + cat("Fit Time:", sprintf("%2.3f %s", x$fit_time, attr(x$fit_time, "units")), "\n") + cat("Total Time:", sprintf("%2.3f %s", x$time, attr(x$time, "units")), "\n\n") + + cat("Number of Observations:", x$n, "\n") + cat("Number of Variables: ", x$p, "\n\n") + + cat("Pre-processing options:\n") + cat(" - Columnwise centering:", x$X.center, "\n") + cat(" - Columnwise scaling: ", x$X.scale, "\n\n") + + cat("Weights:\n") + print(x$weight_type) + + invisible(x) +} + +#' @export +#' @importFrom stats as.dendrogram +#' @rdname print_trout +as.dendrogram.TROUT <- function(object, ...){ + as.dendrogram(object$dendrogram) +} + +#' @export +#' @importFrom stats as.hclust +#' @rdname print_trout +as.hclust.TROUT <- function(x, ...){ + x$dendrogram +} diff --git a/R/utils.R b/R/utils.R index 7778c57..ccbabd2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -430,3 +430,32 @@ my_palette <- function(n){ brewer.pal(9, "Set1")[seq_len(n)] } + + +#' Distances for Complex Data +#' +#' \code{cdist} implements the standard complex distance. (\code{stats::dist} +#' simply discards imaginary components when applied to complex \code{x}). +#' \code{trout_dist} implements the phase-adjusted (TROUT) distance +#' @export +#' @importFrom stats as.dist +trout_dist <- function(x, diag = FALSE, upper = FALSE){ + m <- trout_dist_impl(x) + rownames(m) <- colnames(m) <- rownames(x) + d <- as.dist(m, diag = diag, upper = upper) + attr(d, "call") <- match.call() + d +} + +#' @rdname trout_dist +#' @export +cdist <- function(X, method = "euclidean", diag = FALSE, upper = FALSE){ + if(method != "euclidean"){ + stop("cdist only supports Euclidean distance presently.") + } + m <- cdist_impl(X) + rownames(m) <- colnames(m) <- rownames(X) + d <- as.dist(m, diag = diag, upper = upper) + attr(d, "call") <- match.call() + d +} diff --git a/R/weights.R b/R/weights.R index 64817e4..a661d29 100644 --- a/R/weights.R +++ b/R/weights.R @@ -176,12 +176,13 @@ dense_rbf_kernel_weights <- function(phi = "auto", "manhattan", "canberra", "binary", - "minkowski"), + "minkowski", + "trout"), p = 2){ tryCatch(dist.method <- match.arg(dist.method), error = function(e){ - crv_error("Unsupported choice of ", sQuote("weight.dist;"), + crv_error("Unsupported choice of ", sQuote("dist.method;"), " see the ", sQuote("method"), " argument of ", sQuote("stats::dist"), " for supported distances.") }) @@ -192,6 +193,12 @@ dense_rbf_kernel_weights <- function(phi = "auto", " argument of ", sQuote("stats::dist"), " for details.") } + if(dist.method == "trout"){ + dist_f <- trout_dist + } else { + dist_f <- function(X) dist(X, method = dist.method, p = p) + } + function(X){ user_phi <- (phi != "auto") @@ -200,7 +207,7 @@ dense_rbf_kernel_weights <- function(phi = "auto", ## necessary... 
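[Editorial sketch, not part of the patch: the phase-aligned distance from R/utils.R above is invariant to a global phase rotation of either series, while the plain complex Euclidean distance cdist() is not. The alignment multiplies one series by z = <u, x> / |<u, x>|, the unit-modulus phase that minimizes ||x - z*u||.]

library(clustRviz)
x <- complex(real = rnorm(10), imaginary = rnorm(10))
X <- rbind(x, exp(1i * pi / 3) * x)  # second series is a phase-rotated copy of the first
trout_dist(X)   # ~0: the rotation is aligned away before measuring distance
cdist(X)        # > 0: ordinary Euclidean distance in C^p sees the rotation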
phi_range <- 10^(seq(-10, 10, length.out = 21)) weight_vars <- vapply(phi_range, - function(phi) var(exp((-1) * phi * (dist(X, method = dist.method, p = p)[TRUE])^2)), + function(phi) var(exp((-1) * phi * (dist_f(X)[TRUE])^2)), numeric(1)) phi <- phi_range[which.max(weight_vars)] @@ -214,7 +221,7 @@ dense_rbf_kernel_weights <- function(phi = "auto", crv_error(sQuote("phi"), " must be positive.") } - dist_mat <- as.matrix(dist(X, method = dist.method, p = p)) + dist_mat <- as.matrix(dist_f(X)) dist_mat <- exp(-1 * phi * dist_mat^2) check_weight_matrix(dist_mat) diff --git a/src/clustRviz.cpp b/src/clustRviz.cpp index 6c7fa5b..1097dbc 100644 --- a/src/clustRviz.cpp +++ b/src/clustRviz.cpp @@ -22,6 +22,10 @@ Rcpp::List CARPcpp(const Eigen::MatrixXd& X, bool back_track = false, bool exact = false){ + if((M != 1).any()){ + ClustRVizLogger::error("Time Series (TROUT) Clustering does not yet support missing data."); + } + ConvexClustering problem(X, M, D, weights, rho, l1, show_progress); if(exact){ @@ -126,6 +130,68 @@ Rcpp::List CBASScpp(const Eigen::MatrixXd& X, } } +// [[Rcpp::export(rng = false)]] +Rcpp::List TROUTcpp(const Eigen::MatrixXcd& X, + const Eigen::ArrayXXd& M, + const Eigen::MatrixXd& D, + const Eigen::VectorXd& weights, + double epsilon, + double t, + double rho = 1, + double thresh = CLUSTRVIZ_DEFAULT_STOP_PRECISION, + int max_iter = 100000, + int max_inner_iter = 2500, + int burn_in = 50, + double back = 0.5, + int keep = 10, + int viz_max_inner_iter = 15, + double viz_initial_step = 1.1, + double viz_small_step = 1.01, + bool l1 = false, + bool show_progress = true, + bool back_track = false, + bool exact = false){ + + TroutClustering problem(X, M, D, weights, rho, l1, show_progress); + + if(exact){ + if(back_track){ + TroutClusteringADMM_VIZ admm_viz(problem, + epsilon, + thresh, + max_iter, + max_inner_iter, + burn_in, + back, + viz_max_inner_iter, + viz_initial_step, + viz_small_step); + + return admm_viz.build_return_object(); + } else { + TroutClusteringADMM admm(problem, epsilon, t, thresh, max_iter, max_inner_iter); + return admm.build_return_object(); + } + } else { + if(back_track){ + TROUT_VIZ trout_viz(problem, + epsilon, + max_iter, + burn_in, + back, + keep, + viz_max_inner_iter, + viz_initial_step, + viz_small_step); + + return trout_viz.build_return_object(); + } + + TROUT trout(problem, epsilon, t, max_iter, burn_in, keep); + return trout.build_return_object(); + } +} + // [[Rcpp::export(rng = false)]] Rcpp::List ConvexClusteringCPP(const Eigen::MatrixXd& X, const Eigen::ArrayXXd& M, @@ -165,3 +231,87 @@ Rcpp::List ConvexBiClusteringCPP(const Eigen::MatrixXd& X, return solver.build_return_object(); } + +// [[Rcpp::export(rng = false)]] +Rcpp::List TroutClusteringCPP(const Eigen::MatrixXcd& X, + const Eigen::ArrayXXd& M, + const Eigen::MatrixXd& D, + const Eigen::VectorXd& weights, + const std::vector lambda_grid, + double rho = 1, + double thresh = CLUSTRVIZ_DEFAULT_STOP_PRECISION, + int max_iter = 100000, + int max_inner_iter = 2500, + bool l1 = false, + bool show_progress = true){ + + if((M != 1).any()){ + ClustRVizLogger::error("Time Series (TROUT) Clustering does not yet support missing data."); + } + + UnivariateTroutClusteringSkeleton > problem(X, M, D, weights, rho, l1, show_progress); + UserGridTroutClusteringADMM solver(problem, lambda_grid, thresh, max_iter, max_inner_iter); + + return solver.build_return_object(); +} + +// [[Rcpp::export(rng = false)]] +SEXP matrix_row_prox(SEXP Xsexp, + double lambda, + const Eigen::VectorXd& weights, + bool l1 = 
true){ + + switch(TYPEOF(Xsexp)){ + case REALSXP: return Rcpp::wrap(MatrixRowProx(Rcpp::as(Xsexp), lambda, weights, l1)); + case CPLXSXP: return Rcpp::wrap(MatrixRowProx >(Rcpp::as(Xsexp), lambda, weights, l1)); + default: Rcpp::stop("Unsupported type of X."); + } + + // Should not trigger but appease compiler... + return R_NilValue; +}; + +// [[Rcpp::export(rng = false)]] +SEXP matrix_col_prox(SEXP Xsexp, + double lambda, + const Eigen::VectorXd& weights, + bool l1 = true){ + + switch(TYPEOF(Xsexp)){ + case REALSXP: return Rcpp::wrap(MatrixColProx(Rcpp::as(Xsexp), lambda, weights, l1)); + case CPLXSXP: return Rcpp::wrap(MatrixColProx >(Rcpp::as(Xsexp), lambda, weights, l1)); + default: Rcpp::stop("Unsupported type of X."); + } + + // Should not trigger but appease compiler... + return R_NilValue; +}; + +// [[Rcpp::export(rng = false)]] +SEXP smooth_u_clustering(SEXP U_oldSEXP, Rcpp::List cluster_info_list){ + switch(TYPEOF(U_oldSEXP)){ + case REALSXP: return Rcpp::wrap(smooth_u_clustering_impl(Rcpp::as(U_oldSEXP), cluster_info_list)); + case CPLXSXP: return Rcpp::wrap(smooth_u_clustering_impl >(Rcpp::as(U_oldSEXP), cluster_info_list)); + default: Rcpp::stop("Unsupported type of X."); + } + + // Should not trigger but appease compiler... + return R_NilValue; +} + +// [[Rcpp::export(rng = false)]] +SEXP tensor_projection(SEXP Xsexp, SEXP Ysexp){ + if(TYPEOF(Xsexp) != TYPEOF(Ysexp)){ + Rcpp::stop("Type of X and Y must match."); + } + + switch(TYPEOF(Xsexp)){ + case REALSXP: return Rcpp::wrap(tensor_projection_impl(Rcpp::as(Xsexp), Rcpp::as(Ysexp))); + case CPLXSXP: return Rcpp::wrap(tensor_projection_impl >(Rcpp::as(Xsexp), Rcpp::as(Ysexp))); + default: Rcpp::stop("Unsupported type of X."); + } + + // Should not trigger but appease compiler... + return R_NilValue; +} + diff --git a/src/clustRviz.h b/src/clustRviz.h index dba5b1e..b011ef1 100644 --- a/src/clustRviz.h +++ b/src/clustRviz.h @@ -1,17 +1,28 @@ #include "clustRviz_base.h" #include "clustRviz_logging.h" +#include "clustRviz_utils.h" #include "clustering_impl.h" #include "biclustering_impl.h" +#include "trout_impl.h" #include "alg_reg_policies.h" #include "optim_policies.h" +typedef ConvexClusteringSkeleton ConvexClustering; +typedef UnivariateTroutClusteringSkeleton > TroutClustering; + typedef AlgorithmicRegularizationFixedStepSizePolicy CARP; typedef AlgorithmicRegularizationBacktrackingPolicy CARP_VIZ; typedef AlgorithmicRegularizationFixedStepSizePolicy CBASS; typedef AlgorithmicRegularizationBacktrackingPolicy CBASS_VIZ; +typedef AlgorithmicRegularizationFixedStepSizePolicy TROUT; +typedef AlgorithmicRegularizationBacktrackingPolicy TROUT_VIZ; + typedef ADMMPolicy ConvexClusteringADMM; typedef ADMMPolicy ConvexBiClusteringADMM; +typedef ADMMPolicy TroutClusteringADMM; typedef BackTrackingADMMPolicy ConvexClusteringADMM_VIZ; typedef BackTrackingADMMPolicy ConvexBiClusteringADMM_VIZ; +typedef BackTrackingADMMPolicy TroutClusteringADMM_VIZ; typedef UserGridADMMPolicy UserGridConvexClusteringADMM; typedef UserGridADMMPolicy UserGridConvexBiClusteringADMM; +typedef UserGridADMMPolicy UserGridTroutClusteringADMM; diff --git a/src/clustRviz_base.h b/src/clustRviz_base.h index f738ae4..4bc7091 100644 --- a/src/clustRviz_base.h +++ b/src/clustRviz_base.h @@ -9,6 +9,10 @@ #define CLUSTRVIZ_STATUS_WIDTH_CHECK 20 // Every 20 status updates * 0.1s => every 2s #define CLUSTRVIZ_DEFAULT_STOP_PRECISION 1e-10 //Stop when cellwise diff between iters < val +// Prototypes - utils.cpp +double soft_thresh(double, double); +std::complex 
soft_thresh(const std::complex, double); + // Helper to determine if STL set contains an element // // In general, this is not efficient because one wants to do something @@ -17,19 +21,83 @@ template bool contains(const std::set& container, T element){ typename std::set::const_iterator it = container.find(element); return it != container.end(); +}; + +template +double scaled_squared_norm(const Eigen::MatrixBase& X){ + return X.squaredNorm() / X.size(); } -// Prototypes - utils.cpp -Eigen::MatrixXd MatrixRowProx(const Eigen::MatrixXd&, - double, - const Eigen::VectorXd&, - bool); +template +Eigen::Matrix MatrixRowProx(const Eigen::Matrix& X, + double lambda, + const Eigen::VectorXd& weights, + bool l1 = true){ + + using MatrixXt = Eigen::Matrix; + using VectorXt = Eigen::Matrix; -Eigen::MatrixXd MatrixColProx(const Eigen::MatrixXd&, - double, - const Eigen::VectorXd&, - bool); + Eigen::Index n = X.rows(); + Eigen::Index p = X.cols(); -double scaled_squared_norm(const Eigen::MatrixXd&); + MatrixXt V(n, p); + + if(l1){ + for(Eigen::Index i = 0; i < n; i++){ + for(Eigen::Index j = 0; j < p; j++){ + V(i, j) = soft_thresh(X(i, j), lambda * weights(i)); + } + } + } else { + for(Eigen::Index i = 0; i < n; i++){ + VectorXt X_i = X.row(i); + double scale_factor = 1 - lambda * weights(i) / X_i.norm(); + + if(scale_factor > 0){ + V.row(i) = X_i * scale_factor; + } else { + V.row(i).setZero(); + } + } + } + + return V; +} + +template +Eigen::Matrix MatrixColProx(const Eigen::Matrix& X, + double lambda, + const Eigen::VectorXd& weights, + bool l1 = true){ + + using MatrixXt = Eigen::Matrix; + using VectorXt = Eigen::Matrix; + + Eigen::Index n = X.rows(); + Eigen::Index p = X.cols(); + + MatrixXt V(n, p); + + if(l1){ + for(Eigen::Index i = 0; i < n; i++){ + for(Eigen::Index j = 0; j < p; j++){ + V(i, j) = soft_thresh(X(i, j), lambda * weights(j)); + } + } + } else { + for(Eigen::Index j = 0; j < p; j++){ + VectorXt X_j = X.col(j); + double scale_factor = 1 - lambda * weights(j) / X_j.norm(); + + if(scale_factor > 0){ + V.col(j) = X_j * scale_factor; + } else { + V.col(j).setZero(); + } + } + } + + return V; +} #endif diff --git a/src/clustRviz_utils.h b/src/clustRviz_utils.h new file mode 100644 index 0000000..b8bbce4 --- /dev/null +++ b/src/clustRviz_utils.h @@ -0,0 +1,141 @@ +#ifndef CLUSTRVIZ_UTILS_H +#define CLUSTRVIZ_UTILS_H 1 +// This header defines template versions of complex utilities +// These are templated on the data type to allow for both real and complex data +// These are not in clustRviz_base.h since some of them depend on ClustRVizLogger... + +#include +#include "clustRviz_logging.h" + +// U-smoothing for convex clustering +// +// Given cluster memberships, replace rows of U which belong to the same cluster +// with their mutual mean.... 
+template +RcppVector smooth_u_clustering_impl(RcppVector U_old, Rcpp::List cluster_info_list){ + + using MatrixXt = Eigen::Matrix; + using VectorXt = Eigen::Matrix; + + // The first argument is really an array but we pass as a NumericVector + // The second argument is a list produced by get_cluster_assignments() + Rcpp::IntegerVector U_dims = U_old.attr("dim"); + if(U_dims.size() != 3){ + ClustRVizLogger::error("U must be a three rank tensor."); + } + int N = U_dims(0); + int P = U_dims(1); + int Q = U_dims(2); + + // Check length of cluster_info + if(cluster_info_list.size() != Q){ + ClustRVizLogger::error("Dimensions of U and cluster_info do not match"); + } + + RcppVector U(N * P * Q); + U.attr("dim") = U_dims; + Rcpp::rownames(U) = Rcpp::rownames(U_old); + Rcpp::colnames(U) = Rcpp::colnames(U_old); + + for(int q = 0; q < Q; q++){ + Rcpp::List cluster_info = cluster_info_list[q]; + int n_clusters = Rcpp::as(cluster_info[2]); + + Rcpp::IntegerVector cluster_ids = cluster_info[0]; + Rcpp::IntegerVector cluster_sizes = cluster_info[1]; + + // There's a lot going on on the RHS here, so let's un-pack (inside outwards) + // First, we get a pointer to the relevant slice of U_old + // This works because the RcppVector is a C++ wrapper around a SEXP which + // is ultimately just a pointer to the relevant memory + // We when cast it to an appropriate C++ pointer type + // For real data, this is a no-op since both R and Eigen use doubles for real data + // For complex data, this matters because we convert from R's homegrown Rcomplex* + // to a std::complex* pointer as eigen expects + // We then use Eigen::Map> to get an Eigen::Matrix backed + // by R's memory in a read only fashion. + // The same construct is used below to load the smoothed data into U + MatrixXt U_old_slice = Eigen::Map(reinterpret_cast(&U_old[N * P * q]), N, P); + MatrixXt U_new(N, P); + + for(int j = 1; j <= n_clusters; j++){ // Cluster IDs are 1-based (per R conventions) + VectorXt vec(P); vec.setZero(); + + // Manually work out new mean + for(int n = 0; n < N; n++){ + if(cluster_ids[n] == j){ + vec += U_old_slice.row(n); + } + } + + vec /= cluster_sizes[j - 1]; // Subtract 1 to adjust to C++ indexing + + // Assign new mean where needed... 
+ for(int n = 0; n < N; n++){ + if(cluster_ids[n] == j){ + U_new.row(n) = vec; + } + } + } + + Eigen::Map(reinterpret_cast(&U[N * P * q]), N, P) = U_new; + } + + return U; +} + +// Tensor projection along the second mode +// +// Given a 3D tensor X in F^{n-by-p-by-q} (observations by features by iterations) +// and a rotation matrix Y in F^{p-by-k} (features by principal components), we +// want to get a projected array in F^{n-by-k-by-q} giving the path of the principal +// components +// +// This is straightforward, but "loopy" so we implement it in Rcpp / RcppEigen for speed +// We use some template magic to support F = R (real) and F = C (complex) data +template +RcppVector tensor_projection_impl(RcppVector X, const Eigen::Matrix& Y){ + + using MatrixXt = Eigen::Matrix; + + // Validate X + Rcpp::IntegerVector X_dims = X.attr("dim"); + if(X_dims.size() != 3){ + ClustRVizLogger::error("X must be a three rank tensor."); + } + int n = X_dims(0); + int p = X_dims(1); + int q = X_dims(2); + + // Validate Y + if(Y.rows() != p){ + ClustRVizLogger::error("The dimensions of X and Y do not match -- ") << p << " != " << Y.rows(); + } + + int k = Y.cols(); + + RcppVector result(n * k * q); + Rcpp::IntegerVector result_dims{n, k, q}; + result.attr("dim") = result_dims; + + for(int i = 0; i < q; i++){ + // There's a lot going on on the RHS here, so let's un-pack (inside outwards) + // First, we get a pointer to the relevant slice of X + // This works because the RcppVector is a C++ wrapper around a SEXP which + // is ultimately just a pointer to the relevant memory + // We when cast it to an appropriate C++ pointer type + // For real data, this is a no-op since both R and Eigen use doubles for real data + // For complex data, this matters because we convert from R's homegrown Rcomplex* + // to a std::complex* pointer as eigen expects + // We then use Eigen::Map> to get an Eigen::Matrix backed + // by R's memory in a read only fashion. 
+ // The same construct is used below to load the smoothed data into result + MatrixXt X_slice = Eigen::Map(reinterpret_cast(&X[n * p * i]), n, p); + MatrixXt X_slice_projected = X_slice * Y; + Eigen::Map(reinterpret_cast(&result[n * k * i]), n, k) = X_slice_projected; + } + + return result; +} + +#endif diff --git a/src/clustering_impl.h b/src/clustering_impl.h index 7fc85bb..1e0fe9c 100644 --- a/src/clustering_impl.h +++ b/src/clustering_impl.h @@ -5,17 +5,21 @@ #include "clustRviz_logging.h" #include "status.h" -class ConvexClustering { +template +class ConvexClusteringSkeleton { + using MatrixXt = Eigen::Matrix; + using VectorXt = Eigen::Matrix; + public: double gamma; // Current regularization level - need to be able to manipulate this externally - ConvexClustering(const Eigen::MatrixXd& X_, - const Eigen::ArrayXXd& M_, - const Eigen::MatrixXd& D_, - const Eigen::VectorXd& weights_, - const double rho_, - const bool l1_, - const bool show_progress_): + ConvexClusteringSkeleton(const Eigen::Matrix& X_, + const Eigen::ArrayXXd& M_, + const Eigen::MatrixXd& D_, + const Eigen::VectorXd& weights_, + const double rho_, + const bool l1_, + const bool show_progress_): X(X_), M(M_), D(D_), @@ -49,7 +53,7 @@ class ConvexClustering { store_values(); // PreCompute chol(I + rho D^TD) for easy inversions in the U update step - Eigen::MatrixXd IDTD = rho * D.transpose() * D + Eigen::MatrixXd::Identity(n, n); + MatrixXt IDTD = rho * D.transpose() * D + MatrixXt::Identity(n, n); u_step_solver.compute(IDTD); }; @@ -68,20 +72,23 @@ class ConvexClustering { return nzeros == num_edges; } - void admm_step(){ - // U-update - Eigen::MatrixXd X_imputed = M * X.array() + (1.0 - M) * U.array(); + inline void u_step(){ + // U-update -- Make this a method so time series clustering can replace + MatrixXt X_imputed = M * X.array() + (1.0 - M) * U.array(); U = u_step_solver.solve(X_imputed + rho * D.transpose() * (V - Z)); - Eigen::MatrixXd DU = D * U; + ClustRVizLogger::debug("U = ") << U; + } + void admm_step(){ + u_step(); // V-update - Eigen::MatrixXd DUZ = DU + Z; + const MatrixXt DUZ = D * U + Z; V = MatrixRowProx(DUZ, gamma / rho, weights, l1); ClustRVizLogger::debug("V = ") << V; // Z-update - Z += DU - V; + Z = DUZ - V; ClustRVizLogger::debug("Z = ") << Z; // Identify cluster fusions (rows of V which have gone to zero) @@ -138,8 +145,8 @@ class ConvexClustering { } // Store values - UPath.col(storage_index) = Eigen::Map(U.data(), n * p); - VPath.col(storage_index) = Eigen::Map(V.data(), p * num_edges); + UPath.col(storage_index) = Eigen::Map(U.data(), n * p); + VPath.col(storage_index) = Eigen::Map(V.data(), p * num_edges); gamma_path(storage_index) = gamma; v_zeros_path.col(storage_index) = v_zeros; @@ -168,9 +175,9 @@ class ConvexClustering { sp.update(nzeros, V.squaredNorm(), iter, gamma); } -private: +protected: // Fixed (non-data-dependent) problem details - const Eigen::MatrixXd& X; // Data matrix (to be clustered) + const MatrixXt& X; // Data matrix (to be clustered) const Eigen::ArrayXXd& M; // Missing data mask const Eigen::MatrixXd& D; // Edge (differencing) matrix const Eigen::VectorXd& weights; // Clustering weights @@ -181,30 +188,30 @@ class ConvexClustering { const int n; // Problem dimensions const int p; const int num_edges; - Eigen::LLT u_step_solver; // Cached factorization for u-update + Eigen::LLT u_step_solver; // Cached factorization for u-update // Progress printer StatusPrinter sp; // Current copies of ADMM variables - Eigen::MatrixXd U; // Primal variable - Eigen::MatrixXd V; // 
Split variable - Eigen::MatrixXd Z; // Dual variable + MatrixXt U; // Primal variable + MatrixXt V; // Split variable + MatrixXt Z; // Dual variable Eigen::ArrayXi v_zeros; // Fusion indicators Eigen::Index nzeros; // Number of fusions // Old versions (used for back-tracking and fusion counting) Eigen::Index nzeros_old; - Eigen::MatrixXd U_old; - Eigen::MatrixXd V_old; - Eigen::MatrixXd Z_old; + MatrixXt U_old; + MatrixXt V_old; + MatrixXt Z_old; Eigen::ArrayXi v_zeros_old; // Internal storage buffers Eigen::Index buffer_size; Eigen::Index storage_index; - Eigen::MatrixXd UPath; - Eigen::MatrixXd VPath; + MatrixXt UPath; + MatrixXt VPath; Eigen::VectorXd gamma_path; Eigen::MatrixXi v_zeros_path; }; diff --git a/src/trout_impl.h b/src/trout_impl.h new file mode 100644 index 0000000..671ca74 --- /dev/null +++ b/src/trout_impl.h @@ -0,0 +1,42 @@ +#ifndef CLUSTRVIZ_TROUT_H +#define CLUSTRVIZ_TROUT_H 1 + +#include "clustRviz_base.h" +#include "clustRviz_logging.h" +#include "status.h" + +template +class UnivariateTroutClusteringSkeleton : public ConvexClusteringSkeleton { + using MatrixXt = Eigen::Matrix; + using VectorXt = Eigen::Matrix; + + // We need to inherit the constructor explicitly for reasons I don't entirely understand + // or else this fails with a confusing error message which essentially says the + // constructor call (with 7 arguments) doesn't match the default constructor + using ConvexClusteringSkeleton::ConvexClusteringSkeleton; + + // TODO - Allow flexible specification of phase and amplitude alignment + // TODO - Allow U-step convergence parameters to be tweaked + inline void u_step(){ + // For now, we do a whole loop, but at some point might relax this to a single step if theory supports... + + // Note that "this->" is required for member access since we're inheriting from a + // templated base class -- https://stackoverflow.com/a/1121016 + Eigen::Index k_inner = 0; + MatrixXt U_old_inner = this->U; + const MatrixXt A = this->rho * this->D.transpose() * (this->V - this->Z); + + do{ + k_inner += 1; + U_old_inner = this->U; + MatrixXt X_aligned = align_phase(this->X, this->U); + + this->U = this->u_step_solver.solve(X_aligned + A); + + } while ((scaled_squared_norm(this->U - U_old_inner) > 1e-6) & (k_inner < 1000)); + + ClustRVizLogger::debug("U = ") << this->U; + } +}; + +#endif diff --git a/src/utils.cpp b/src/utils.cpp index bdc36a9..cc32861 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -24,75 +24,13 @@ double soft_thresh(double x, double lambda){ } } -// Apply a row-wise prox operator (with weights) to a matrix -// [[Rcpp::export(rng = false)]] -Eigen::MatrixXd MatrixRowProx(const Eigen::MatrixXd& X, - double lambda, - const Eigen::VectorXd& weights, - bool l1 = true){ - Eigen::Index n = X.rows(); - Eigen::Index p = X.cols(); - - Eigen::MatrixXd V(n, p); - - if(l1){ - for(Eigen::Index i = 0; i < n; i++){ - for(Eigen::Index j = 0; j < p; j++){ - V(i, j) = soft_thresh(X(i, j), lambda * weights(i)); - } - } - } else { - for(Eigen::Index i = 0; i < n; i++){ - Eigen::VectorXd X_i = X.row(i); - double scale_factor = 1 - lambda * weights(i) / X_i.norm(); - - if(scale_factor > 0){ - V.row(i) = X_i * scale_factor; - } else { - V.row(i).setZero(); - } - } - } - - return V; -} - -// Apply a col-wise prox operator (with weights) to a matrix -// [[Rcpp::export(rng = false)]] -Eigen::MatrixXd MatrixColProx(const Eigen::MatrixXd& X, - double lambda, - const Eigen::VectorXd& weights, - bool l1 = true){ - Eigen::Index n = X.rows(); - Eigen::Index p = X.cols(); - - Eigen::MatrixXd V(n, 
p); - - if(l1){ - for(Eigen::Index i = 0; i < n; i++){ - for(Eigen::Index j = 0; j < p; j++){ - V(i, j) = soft_thresh(X(i, j), lambda * weights(j)); - } - } +std::complex soft_thresh(const std::complex x, double lambda){ + const double r = std::abs(x); + if(r > lambda){ + return (r - lambda) * x / r; } else { - for(Eigen::Index j = 0; j < p; j++){ - Eigen::VectorXd X_j = X.col(j); - double scale_factor = 1 - lambda * weights(j) / X_j.norm(); - - if(scale_factor > 0){ - V.col(j) = X_j * scale_factor; - } else { - V.col(j).setZero(); - } - } + return std::complex(0.0, 0.0); } - - return V; -} - -double scaled_squared_norm(const Eigen::MatrixXd& mat) { - //Squared norm / number of cells gives average squared difference - return mat.squaredNorm() / (mat.rows() * mat.cols()); } // Some basic cheap checks that a weight @@ -129,105 +67,70 @@ void check_weight_matrix(const Eigen::MatrixXd& weight_matrix){ } } -// U-smoothing for convex clustering -// -// Given cluster memberships, replace rows of U which belong to the same cluster -// with their mutual mean.... -// -// [[Rcpp::export(rng = false)]] -Rcpp::NumericVector smooth_u_clustering(Rcpp::NumericVector U_old, Rcpp::List cluster_info_list){ - // The first argument is really an array but we pass as a NumericVector - // The second argument is a list produced by get_cluster_assignments() - Rcpp::IntegerVector U_dims = U_old.attr("dim"); - if(U_dims.size() != 3){ - ClustRVizLogger::error("U must be a three rank tensor."); - } - int N = U_dims(0); - int P = U_dims(1); - int Q = U_dims(2); +// TROUT Alignment +// FIXME - Why doesn't this play nice with overloading? Prototype missing somewhwere? +Eigen::VectorXcd align_phase_v(const Eigen::VectorXcd& u, + const Eigen::VectorXcd& x){ - // Check length of cluster_info - if(cluster_info_list.size() != Q){ - ClustRVizLogger::error("Dimensions of U and cluster_info do not match"); + // TODO - Triple check this analytical result! + std::complex z_hat = u.conjugate().cwiseProduct(x).sum(); + if(std::abs(z_hat) > 0){ + z_hat /= std::abs(z_hat); } - Rcpp::NumericVector U(N * P * Q); - U.attr("dim") = U_dims; - Rcpp::rownames(U) = Rcpp::rownames(U_old); - Rcpp::colnames(U) = Rcpp::colnames(U_old); - - for(int q = 0; q < Q; q++){ - Rcpp::List cluster_info = cluster_info_list[q]; - int n_clusters = Rcpp::as(cluster_info[2]); - - Rcpp::IntegerVector cluster_ids = cluster_info[0]; - Rcpp::IntegerVector cluster_sizes = cluster_info[1]; - - Eigen::MatrixXd U_old_slice = Eigen::Map(&U_old[N * P * q], N, P); - Eigen::MatrixXd U_new(N, P); + return u * z_hat; +} - for(int j = 1; j <= n_clusters; j++){ // Cluster IDs are 1-based (per R conventions) - Eigen::VectorXd vec = Eigen::VectorXd::Zero(P); +// [[Rcpp::export(rng = false)]] +Eigen::MatrixXcd align_phase(const Eigen::MatrixXcd& U, + const Eigen::MatrixXcd& X){ - // Manually work out new mean - for(int n = 0; n < N; n++){ - if(cluster_ids[n] == j){ - vec += U_old_slice.row(n); - } - } + Eigen::Index n = X.rows(); + Eigen::Index p = X.cols(); - vec /= cluster_sizes[j - 1]; // Subtract 1 to adjust to C++ indexing + Eigen::MatrixXcd V(n, p); - // Assign new mean where needed... 
- for(int n = 0; n < N; n++){ - if(cluster_ids[n] == j){ - U_new.row(n) = vec; - } - } - } - - Eigen::Map(&U[N * P * q], N, P) = U_new; + for(Eigen::Index i = 0; i < n; i++){ + V.row(i) = align_phase_v(U.row(i), X.row(i)); } - return U; + return V; } -// Tensor projection along the second mode -// -// Given a 3D tensor X in R^{n-by-p-by-q} (observations by features by iterations) -// and a rotation matrix Y in R^{p-by-k} (features by principal components), we -// want to get a projected array in R^{n-by-k-by-q} giving the path of the principal -// components -// -// This is straightforward, but "loopy" so we implement it in Rcpp / RcppEigen for speed // [[Rcpp::export(rng = false)]] -Rcpp::NumericVector tensor_projection(Rcpp::NumericVector X, const Eigen::MatrixXd& Y){ +Rcpp::NumericMatrix trout_dist_impl(const Eigen::MatrixXcd& X){ + Eigen::Index n = X.rows(); + Rcpp::NumericMatrix distances(n, n); - // Validate X - Rcpp::IntegerVector X_dims = X.attr("dim"); - if(X_dims.size() != 3){ - ClustRVizLogger::error("X must be a three rank tensor."); - } - int n = X_dims(0); - int p = X_dims(1); - int q = X_dims(2); + for(Eigen::Index i = 0; i < n; i++){ + for(Eigen::Index j = 0; j < n; j++){ + // FIXME - this calculates all distances twice - can simplify... - // Validate Y - if(Y.rows() != p){ - ClustRVizLogger::error("The dimensions of X and Y do not match -- ") << p << " != " << Y.rows(); + // IMPORTANT: We have to transpose the result of align_phase_v to a _row vector_ + // or else it doesn't align with X.row() and silently discards all but the first + // element + distances(i, j) = std::sqrt((X.row(i) - align_phase_v(X.row(j), X.row(i)).transpose()).squaredNorm()); + } } - int k = Y.cols(); + return distances; +} + +// TODO - Non-euclidean distances! +// [[Rcpp::export(rng = false)]] +Rcpp::NumericMatrix cdist_impl(const Eigen::MatrixXcd& X){ + Eigen::Index n = X.rows(); + Rcpp::NumericMatrix distances(n, n); - Rcpp::NumericVector result(n * k * q); - Rcpp::IntegerVector result_dims{n, k, q}; - result.attr("dim") = result_dims; + for(Eigen::Index i = 0; i < n; i++){ + for(Eigen::Index j = 0; j < n; j++){ + // FIXME - this calculates all distances twice - can simplify... 
- for(int i = 0; i < q; i++){ - Eigen::MatrixXd X_slice = Eigen::Map(&X[n * p * i], n, p); - Eigen::MatrixXd X_slice_projected = X_slice * Y; - Eigen::Map(&result[n * k * i], n, k) = X_slice_projected; + // IMPORTANT: We have to transpose the result of align_phase_v to a _row vector_ + // or else it doesnt align with X.row() and silently discards all but the first + // element + distances(i, j) = (X.row(i) - X.row(j)).norm(); + } } - - return result; + return distances; } diff --git a/tests/testthat/test_matrix_prox.R b/tests/testthat/test_matrix_prox.R index 0adbd0e..2e796ab 100644 --- a/tests/testthat/test_matrix_prox.R +++ b/tests/testthat/test_matrix_prox.R @@ -7,33 +7,33 @@ test_that("L1 matrix prox works", { X <- matrix(rnorm(n * p), nrow = n, ncol = p) - MatrixRowProx <- clustRviz:::MatrixRowProx + matrix_row_prox <- clustRviz:::matrix_row_prox weights <- rep(1, n) - expect_equal(X, MatrixRowProx(X, lambda = 0, weights = weights, l1 = TRUE)) - expect_equal(abs(X) + 4, MatrixRowProx(abs(X) + 5, lambda = 1, weights = weights, l1 = TRUE)) - expect_equal(-abs(X) - 4, MatrixRowProx(-abs(X) - 5, lambda = 1, weights = weights, l1 = TRUE)) + expect_equal(X, matrix_row_prox(X, lambda = 0, weights = weights, l1 = TRUE)) + expect_equal(abs(X) + 4, matrix_row_prox(abs(X) + 5, lambda = 1, weights = weights, l1 = TRUE)) + expect_equal(-abs(X) - 4, matrix_row_prox(-abs(X) - 5, lambda = 1, weights = weights, l1 = TRUE)) ## Now we check that weights work X <- matrix(1:25, nrow = 25, ncol = 1) weights <- 1:25 expect_equal(matrix(0, nrow = 25, ncol = 1), - MatrixRowProx(X, lambda = 1, weights = weights, l1 = TRUE)) + matrix_row_prox(X, lambda = 1, weights = weights, l1 = TRUE)) X <- matrix(5, nrow = 6, ncol = 1) weights <- seq(0, 5) expect_equal(matrix(5 - weights, nrow = 6, ncol = 1), - MatrixRowProx(X, lambda = 1, weights = weights, l1 = TRUE)) - + matrix_row_prox(X, lambda = 1, weights = weights, l1 = TRUE)) + #Now check matrix_col_prox against row prox - MatrixColProx <- clustRviz:::MatrixColProx - expect_equal(t(MatrixColProx(t(X), lambda = 1, weights = weights, l1 = TRUE)), - MatrixRowProx(X, lambda = 1, weights = weights, l1 = TRUE)) + matrix_col_prox <- clustRviz:::matrix_col_prox + expect_equal(t(matrix_col_prox(t(X), lambda = 1, weights = weights, l1 = TRUE)), + matrix_row_prox(X, lambda = 1, weights = weights, l1 = TRUE)) }) test_that("L2 prox works", { set.seed(125) - MatrixRowProx <- clustRviz:::MatrixRowProx + matrix_row_prox <- clustRviz:::matrix_row_prox num_unique_cols <- clustRviz:::num_unique_cols n <- 25 @@ -41,32 +41,32 @@ test_that("L2 prox works", { X <- matrix(rnorm(n, sd = 3), ncol = 1) weights <- rexp(n) - expect_equal(MatrixRowProx(X, lambda = 1, weights = weights, l1 = TRUE), - MatrixRowProx(X, lambda = 1, weights = weights, l1 = FALSE)) + expect_equal(matrix_row_prox(X, lambda = 1, weights = weights, l1 = TRUE), + matrix_row_prox(X, lambda = 1, weights = weights, l1 = FALSE)) p <- 5 X <- matrix(1, nrow = n, ncol = p) weights <- seq(0, 5, length.out = 25) - expect_equal(1, num_unique_cols(MatrixRowProx(X, lambda = 1, weights = weights, l1 = FALSE))) + expect_equal(1, num_unique_cols(matrix_row_prox(X, lambda = 1, weights = weights, l1 = FALSE))) y <- matrix(c(3, 4), nrow = 1) - expect_equal(MatrixRowProx(y, 1, 1, l1 = FALSE), y * (1 - 1/5)) - expect_equal(MatrixRowProx(y, 1, 3, l1 = FALSE), y * (1 - 3/5)) - expect_equal(MatrixRowProx(y, 2, 1, l1 = FALSE), y * (1 - 2/5)) - expect_equal(MatrixRowProx(y, 2, 3, l1 = FALSE), y * 0) + expect_equal(matrix_row_prox(y, 1, 1, l1 = 
FALSE), y * (1 - 1/5)) + expect_equal(matrix_row_prox(y, 1, 3, l1 = FALSE), y * (1 - 3/5)) + expect_equal(matrix_row_prox(y, 2, 1, l1 = FALSE), y * (1 - 2/5)) + expect_equal(matrix_row_prox(y, 2, 3, l1 = FALSE), y * 0) y <- -1 * y - expect_equal(MatrixRowProx(y, 1, 1, l1 = FALSE), y * (1 - 1/5)) - expect_equal(MatrixRowProx(y, 1, 3, l1 = FALSE), y * (1 - 3/5)) - expect_equal(MatrixRowProx(y, 2, 1, l1 = FALSE), y * (1 - 2/5)) - expect_equal(MatrixRowProx(y, 2, 3, l1 = FALSE), y * 0) + expect_equal(matrix_row_prox(y, 1, 1, l1 = FALSE), y * (1 - 1/5)) + expect_equal(matrix_row_prox(y, 1, 3, l1 = FALSE), y * (1 - 3/5)) + expect_equal(matrix_row_prox(y, 2, 1, l1 = FALSE), y * (1 - 2/5)) + expect_equal(matrix_row_prox(y, 2, 3, l1 = FALSE), y * 0) #Now check matrix_col_prox against row prox - MatrixColProx <- clustRviz:::MatrixColProx - expect_equal(t(MatrixColProx(t(X), lambda = 1, weights = weights, l1 = FALSE)), - MatrixRowProx(X, lambda = 1, weights = weights, l1 = FALSE)) - expect_equal(t(MatrixColProx(t(y), lambda = 1, weights = weights, l1 = TRUE)), - MatrixRowProx(y, lambda = 1, weights = weights, l1 = TRUE)) + matrix_col_prox <- clustRviz:::matrix_col_prox + expect_equal(t(matrix_col_prox(t(X), lambda = 1, weights = weights, l1 = FALSE)), + matrix_row_prox(X, lambda = 1, weights = weights, l1 = FALSE)) + expect_equal(t(matrix_col_prox(t(y), lambda = 1, weights = weights, l1 = TRUE)), + matrix_row_prox(y, lambda = 1, weights = weights, l1 = TRUE)) }) diff --git a/tests/testthat/test_trout_distance.R b/tests/testthat/test_trout_distance.R new file mode 100644 index 0000000..2c5a54d --- /dev/null +++ b/tests/testthat/test_trout_distance.R @@ -0,0 +1,30 @@ +context("Test trout_distance") + +test_that("Trout distance doesn't depend on signs", { + trout_dist <- clustRviz:::trout_dist + + X <- matrix(c(1, 1, -1, -1), ncol = 2) + expect_equal(as.vector(trout_dist(X)), 0) + + X <- matrix(c(1, -1, 1, -1), ncol = 2) + expect_equal(as.vector(trout_dist(X)), 0) + + X <- matrix(c(1 + 1i, -1 -1i, 1 + 1i, -1 -1i), ncol = 2) + expect_equal(as.vector(trout_dist(X)), 0) +}) + +test_that("Trout distance minimizes distance", { + trout_dist <- clustRviz:::trout_dist + set.seed(125) + + X1 <- rnorm(25) + (0 + 1i) * rnorm(25) + X2 <- rnorm(25) + (0 + 1i) * rnorm(25) + X <- rbind(X1, X2) + + theta_grid <- seq(0, 2 * pi, length.out = 501) + d <- Vectorize(function(theta) sum(Mod(X1 - exp((0 + 1i) * theta) * X2)^2)) + min_d <- sqrt(min(d(theta_grid))) + + td <- as.vector(trout_dist(X)) + expect_lte(td, min_d) +})
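[Editorial sketch, not part of the patch: the tests above exercise the real case only; the complex soft-threshold used by the templated prox operators shrinks the modulus and preserves the phase, as this small check illustrates.]

matrix_row_prox <- clustRviz:::matrix_row_prox
z <- matrix(3 + 4i, nrow = 1, ncol = 1)                  # modulus 5
matrix_row_prox(z, lambda = 1, weights = 1, l1 = TRUE)   # (3 + 4i) * (1 - 1/5)
matrix_row_prox(z, lambda = 6, weights = 1, l1 = TRUE)   # 0 + 0i: fully thresholded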