From feb071d3c94da5b5f903c2ea7f2d1b8f1e01f385 Mon Sep 17 00:00:00 2001 From: peekxc Date: Sun, 4 Feb 2018 14:46:05 -0500 Subject: [PATCH 1/2] Initial changes --- DESCRIPTION | 3 + NAMESPACE | 5 +- R/RcppExports.R | 15 +++ R/cover.R | 151 +++++++++++++++++++++ R/mapper.R | 9 +- R/mapper2D.R | 4 +- R/mapperRef.R | 161 +++++++++++++++++++++++ R/mapper_new.R | 86 ++++++++++++ man/Cover-class.Rd | 21 +++ man/MapperRef-class.Rd | 15 +++ man/cluster_cutoff_at_first_empty_bin.Rd | 9 +- man/mapper.Rd | 12 +- man/mapper1D.Rd | 11 +- man/mapper2D.Rd | 9 +- man/mapperEdges.Rd | 17 ++- man/mapperVertices.Rd | 23 ++-- man/mapper_new.Rd | 46 +++++++ src/.gitignore | 3 + src/RcppExports.cpp | 59 +++++++++ src/adjacency.cpp | 59 +++++++++ src/cover.cpp | 47 +++++++ src/dist_utilities.cpp | 24 ++++ tests/testthat.R | 4 + tests/testthat/TDAmapper_comparison.R | 10 ++ 24 files changed, 754 insertions(+), 49 deletions(-) create mode 100644 R/RcppExports.R create mode 100644 R/cover.R create mode 100644 R/mapperRef.R create mode 100644 R/mapper_new.R create mode 100644 man/Cover-class.Rd create mode 100644 man/MapperRef-class.Rd create mode 100644 man/mapper_new.Rd create mode 100644 src/.gitignore create mode 100644 src/RcppExports.cpp create mode 100644 src/adjacency.cpp create mode 100644 src/cover.cpp create mode 100644 src/dist_utilities.cpp create mode 100644 tests/testthat.R create mode 100644 tests/testthat/TDAmapper_comparison.R diff --git a/DESCRIPTION b/DESCRIPTION index 2a6d735..03905c6 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -24,3 +24,6 @@ License: GPL-3 LazyData: true URL: https://github.com/paultpearson/TDAmapper/ BugReports: https://github.com/paultpearson/TDAmapper/issues +LinkingTo: Rcpp +Imports: Rcpp +RoxygenNote: 6.0.1 diff --git a/NAMESPACE b/NAMESPACE index 80413cc..025c854 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,7 +1,10 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(mapper) export(mapper1D) export(mapper2D) export(mapperEdges) export(mapperVertices) +export(mapper_new) +importFrom(Rcpp,sourceCpp) +useDynLib(TDAmapper) diff --git a/R/RcppExports.R b/R/RcppExports.R new file mode 100644 index 0000000..a27489f --- /dev/null +++ b/R/RcppExports.R @@ -0,0 +1,15 @@ +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +adjacencyCpp <- function(ls_pairs, nodes, ls_node_map) { + .Call('_TDAmapper_adjacencyCpp', PACKAGE = 'TDAmapper', ls_pairs, nodes, ls_node_map) +} + +constructLevelSets <- function(filter_values, index_set, interval_length, step_size, filter_min) { + .Call('_TDAmapper_constructLevelSets', PACKAGE = 'TDAmapper', filter_values, index_set, interval_length, step_size, filter_min) +} + +dist_subset <- function(dist, idx) { + .Call('_TDAmapper_dist_subset', PACKAGE = 'TDAmapper', dist, idx) +} + diff --git a/R/cover.R b/R/cover.R new file mode 100644 index 0000000..da92aee --- /dev/null +++ b/R/cover.R @@ -0,0 +1,151 @@ +#' Reference Class (R5) implementation of Mapper for the TDAmapper package in R +#' Cover reference class +#' filter_values := Filter values +#' num_intervals := vector of number of bins to cover the Z with (per dimension) +#' percent_overlap := vector of overlap percentages +#' level_sets := list of level_sets (including min/max bounds) +cover_ref <- setRefClass("Cover", fields = c("filter_values", "num_intervals", "percent_overlap", "index_set", "level_sets", "metric", "type")) + +## Cover initialization +cover_ref$methods(initialize = function(filter_values, k, g, measure = "euclidean", type = "rectangular"){ + if (is.null(dim(filter_values))) { filter_values <<- array(filter_values, dim = c(length(filter_values), 1)) } + else { filter_values <<- filter_values } + metric <<- measure + type <<- type + + ## TODO: move k and g to other instantiations + num_intervals <<- k; percent_overlap <<- g + constructCover() +}) + +cover_ref$methods(summary = function(){ + type_pretty <- paste0(toupper(substr(type, start = 1, stop = 1)), tolower(substr(type, start = 2, stop = nchar(type)))) + sprintf("%s cover: number intervals = %d, overlap = %.2f", type_pretty, num_intervals, percent_overlap) +}) + +cover_ref$methods(show = function(){ + message <- c(sprintf("Open cover for %d objects (d = %d)", nrow(filter_values), ncol(filter_values))) + message <- append(message, sprintf("Parameters: number intervals = %d, overlap = %.2f", num_intervals, percent_overlap)) + writeLines(message) +}) + +## Set overlap/gain threshold +cover_ref$methods(setOverlap = function(g){ + dim_Z <- ncol(filter_values) + if (length(g) != dim_Z && length(g) != 1 || any(g > 0.50)){ stop("The gain 'g' must be a single scalar or a vector of scalars with length equal to the dimensionality of the filter space.") } + if (length(g) == 1 && dim_Z > 1){ g <- rep(g, dim_Z) } ## create a vector + percent_overlap <<- g +}) + +## Set resolution +cover_ref$methods(setResolution = function(k){ + dim_Z <- ncol(filter_values) + if (length(k) != dim_Z && length(k) != 1){ stop("The resolution 'k' must be a single scalar or a vector of scalars with length equal to the dimensionality of the filter space.") } + if (length(k) == 1 && dim_Z > 1){ k <- rep(k, dim_Z) } ## create a vector + num_intervals <<- k +}) + +## Constructs a base cover where the Lebesgue covering dimension = 0 (i.e. a valid cover, but where each set is disjoint) +## TODO: Add support for other types of covering methods +cover_ref$methods(constructBaseCover = function(cover_type = c("rectangular")){ + if ("uninitializedField" %in% class(percent_overlap)){ stop("'percent_overlap' must be set prior to constructing a valid cover.") } + if ("uninitializedField" %in% class(num_intervals)){ stop("'num_intervals' must be set prior to constructing a valid cover.") } + + ## Setup + filter_dim <- ncol(filter_values) ## filter dimensionality + indices <- lapply(k, function(k_l) 1:k_l) ## per-dimension possible indexes + index_set <<- structure(eval(as.call(append(quote(expand.grid), indices))), names = paste0("d", 1:filter_dim)) ## full indexing set + + ## Find which level set each point lies within under the case the given filter space is + ## zero-dimensional w.r.t. its Lebesgue covering dimension + points_in_level_set <- sapply(1:filter_dim, function(i) { + x <- filter_values[, i] + findInterval(x = x, seq(min(x), max(x), length.out = num_intervals[[i]]+1), all.inside = TRUE) + }) + + ## Create the level sets + level_sets <<- structure(points_in_level_set, names = index_set) + + ## Store all the information as a list in the cover field + # cover <- list(type = cover_type, index_set = index_set, points_in_level_set = points_in_level_set) + # print.mapper_cover <- function(){ print("something")} +}) + +cover_ref$methods(constructCover = function(){ + if ("uninitializedField" %in% class(percent_overlap)){ stop("'percent_overlap' must be set prior to constructing a valid cover.") } + if ("uninitializedField" %in% class(num_intervals)){ stop("'num_intervals' must be set prior to constructing a valid cover.") } + + ## Setup unexpanded and full indexing set (the full indexing set is the cartesian product of each unexpanded index set) + filter_dim <- ncol(filter_values) ## filter dimensionality + indices <- lapply(num_intervals, function(k_l) 1:k_l) ## per-dimension possible indexes + index_set <<- structure(eval(as.call(append(quote(expand.grid), indices))), names = paste0("d", 1:filter_dim)) ## full indexing set + + ## Get filter min and max ranges + filter_rng <- apply(filter_values, 2, range) + { filter_min <- filter_rng[1,]; filter_max <- filter_rng[2,] } + + ## Using the current gain, setup the cover. This computes the min and max bounds of each level set. + interval_length <- (filter_max - filter_min)/(num_intervals - percent_overlap * (num_intervals - 1)) ## interval length + step_size <- interval_length * (1 - percent_overlap) ## step size + index_set_str <- apply(index_set, 1, function(alpha) paste0(as.character(alpha), collapse = ",")) ## level set index tuple names + + ## Construct the level sets + # level_sets <<- structure(vector(mode = "list", length = nrow(index_set)), names = index_set_str) ## level sets + # for (i in 1:nrow(index_set)){ + # ls_idx <- index_set[i,] + # ls_bnds <- rbind(filter_min + (ls_idx-1) * step_size, (filter_min + (ls_idx-1) * step_size) + interval_length) + # rownames(ls_bnds) <- c("min", "max") ## For clarity + # level_sets[[i]]$bounds <<- ls_bnds + # + # ## For the given level set, query which filter values lie within it, per dimension + # ls_queries <- sapply(1:filter_dim, function(d_i) filter_values[, d_i] >= ls_bnds[1, d_i] & filter_values[, d_i] <= ls_bnds[2, d_i]) + # level_sets[[i]]$points_in_level_set <<- which(apply(ls_queries, 1, all)) + # } + level_sets <<- constructLevelSets(filter_values, as.matrix(index_set), as.numeric(interval_length), as.numeric(step_size), as.numeric(filter_min)) + + # .GlobalEnv[["test"]] <- list(filter_values = filter_values, index_set = index_set, interval_length = interval_length, step_size = step_size, filter_min = filter_min) + # test <- constructLevelSets(filter_values = filter_values, index_set = index_set, interval_length = interval_length, step_size = step_size, filter_min = filter_min) +}) + +## Which level set (flat indices) are valid to compare against? When the overlap is <= 50%, it's required that the +## level sets are checked only against immediate neighbors of that set (when the index is at maximum one +## index away). If the overlap is > 50%, then check every combination. +cover_ref$methods(valid_pairs = function(){ + all_ls_pairs <- t(combn(1:length(level_sets), 2)) + idx_set_pairs <- as.matrix(index_set[all_ls_pairs[, 1],] - index_set[all_ls_pairs[, 2],]) + if (all(percent_overlap <= 0.50)){ + valid_ls_pairs <- as.vector(abs(apply(idx_set_pairs, 1, max))) <= 1 + return(all_ls_pairs[valid_ls_pairs,]) + } else { + ## TODO: Extend mapper to compute more than the 1-skeleton efficiently + return(all_ls_pairs) + } +}) + +## Function to plot the filter space, including the level set bounds +cover_ref$methods(plotFilterSpace = function(show_ls_bounds = TRUE, ...){ + filter_dim <- ncol(filter_values) + if (filter_dim > 2){ stop("Cannot plot a filter space with more than 3 dimensions") } + if (filter_dim == 1){ + filter_sz <- nrow(m$filter_values) + alpha_scale <- ifelse(filter_sz > 1000, 0.01, 1 - (log(filter_sz)-1)/(log(1000)-1)) + plot(cbind(m$filter_values, rep(0L, length(m$filter_values))), ylim = c(-1, 1), pch = "|", cex = 5, + col = adjustcolor("black", alpha.f = alpha_scale), + ylab = "", xlab = "Filter Values") + for (ls in cover$LS){ + lines(x = c(ls[1, 1], ls[2, 1]), y = c(-0.5, -0.5), ...) + lines(x = c(ls[2, 1], ls[2, 1]), y = c(-0.5, 0.5), ...) + lines(x = c(ls[2, 1], ls[1, 1]), y = c(0.5, 0.5), ...) + lines(x = c(ls[1, 1], ls[1, 1]), y = c(0.5, -0.5), ...) + } + } else if (filter_dim == 2){ + plot(m$filter_values) + for (ls in cover$LS){ + lines(x = c(ls[1, 1], ls[2, 1]), y = c(ls[1, 2], ls[1, 2]), ...) + lines(x = c(ls[2, 1], ls[2, 1]), y = c(ls[1, 2], ls[2, 2]), ...) + lines(x = c(ls[2, 1], ls[1, 1]), y = c(ls[2, 2], ls[2, 2]), ...) + lines(x = c(ls[1, 1], ls[1, 1]), y = c(ls[2, 2], ls[1, 2]), ...) + } + } +}) + diff --git a/R/mapper.R b/R/mapper.R index 6967d4c..19c6cdb 100644 --- a/R/mapper.R +++ b/R/mapper.R @@ -30,8 +30,12 @@ #' g1 <- graph.adjacency(m1$adjacency, mode="undirected") #' plot(g1, layout = layout.auto(g1) ) #' } +#' +#' @useDynLib TDAmapper +#' @importFrom Rcpp sourceCpp +#' #' @export -#' +#' mapper <- function(dist_object, filter_values, num_intervals, percent_overlap, num_bins_when_clustering) { @@ -80,7 +84,6 @@ mapper <- function(dist_object, filter_values, num_intervals, percent_overlap, n filter_max <- as.vector(sapply(filter_values,max)) interval_width <- (filter_max - filter_min) / num_intervals - # initialize variables vertex_index <- 0 level_of_vertex <- c() @@ -172,7 +175,7 @@ mapper <- function(dist_object, filter_values, num_intervals, percent_overlap, n # cut the cluster tree # internal indices refers to 1:num_points_in_this_level # external indices refers to the row number of the original data point - level_cutoff <- cluster_cutoff_at_first_empty_bin(level_heights, level_max_dist, num_bins_when_clustering) + level_cutoff <- cluster_cutoff_at_first_empty_bin(level_heights, level_max_dist, num_bins_when_clustering) level_external_indices <- points_in_this_level[level_hclust$order] level_internal_indices <- as.vector(cutree(list( merge = level_hclust$merge, diff --git a/R/mapper2D.R b/R/mapper2D.R index b86c938..7a6d3f9 100644 --- a/R/mapper2D.R +++ b/R/mapper2D.R @@ -101,13 +101,13 @@ mapper2D <- function( points_in_level[[level]] <- which(points_in_level_logical==TRUE) if (num_points_in_level == 0) { - print('Level set is empty') + # print('Level set is empty') vertices_in_level[[level]] <- -1 # Bertrand Michel's suggestion for empty levels: use flag -1 next } if (num_points_in_level == 1) { - print('Level set has only one point') + # print('Level set has only one point') num_vertices_in_level <- 1 cluster_indices_within_level <- c(1) vertices_in_level[[level]] <- vertex_index + (1:num_vertices_in_level) diff --git a/R/mapperRef.R b/R/mapperRef.R new file mode 100644 index 0000000..6dfb4a1 --- /dev/null +++ b/R/mapperRef.R @@ -0,0 +1,161 @@ +#' Reference Class (R5) implementation of Mapper for the TDAmapper package in R +#' Composes a set of classes to perform Mapper efficiently. +#' Author: Matt Piekenbrock + +## Create the reference class, along with required field types +mapper_ref <- setRefClass("MapperRef", fields = list(X="ANY", cover="ANY", clustering_algorithm="ANY", G="ANY", config="list")) + +## The cover stores the filter values +mapper_ref$methods(setCover = function(fv, k, g, measure = "euclidean"){ + if (is.null(dim(fv)) && is.numeric(fv)){ fv <- matrix(fv, nrow = length(fv), ncol = 1) } + if (is.null(dim(fv)) || (!"matrix" %in% class(fv))) { stop("Filter values must be numeric and in matrix form") } + cover <<- cover_ref$new(filter_values = fv, k, g, measure = measure) +}) + +## Changes the clustering algorithm used by mapper. +## Must accept a 'dist' object and return a static or 'flat' clustering result +mapper_ref$methods(setClusteringAlgorithm = function(cl = c("single", "ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid")){ + + ## Default to single linkage clustering with heuristic histogram-like cut rule + if (missing(cl) || cl == "single"){ + clustering_algorithm <<- function(dist_x, num_bins_when_clustering = 10){ + hcl <- hclust(dist_x, method = "single") + cut_height <- cluster_cutoff_at_first_empty_bin(heights = hcl$height, diam = max(dist_x), num_bins_when_clustering = num_bins_when_clustering) + as.vector(cutree(hcl, h = cut_height)) + } + } + ## If other linkage criterion is provided, use that + else if (class(cl) == "character"){ + hclust_opts <- c("single", "ward.D", "ward.D2", "complete", "average", "mcquitty", "median", "centroid") + if (!cl %in% hclust_opts){ stop(sprint("Unknown linkage method passed. Please use one of (%s). See ?hclust for details.", paste0(hclust_opts, collapse = ", "))) } + clustering_algorithm <<- function(dist_x, num_bins_when_clustering = 10){ + hcl <- hclust(dist_x, method = cl) + cut_height <- cluster_cutoff_at_first_empty_bin(heights = hcl$height, diam = max(dist_x), num_bins_when_clustering = num_bins_when_clustering) + as.vector(cutree(hcl, h = cut_height)) + } + } + ## Allow the user to use their own clustering algorithm if they desire. Must accept a dist object and return a flat clustering result. + else if (class(f) == "function"){ + clustering_algorithm <<- f + } else { + stop("'cl' must be either a linkage criterion (character) or a clustering algorithm (function)") + } +}) + +## The cover stores the filter values +mapper_ref$methods(computeNodes = function(...){ + if ("uninitializedField" %in% class(cover)){ stop("Cover must be constructed prior to computing nodes.") } + + ## Compute the interpoint distances for each subset + # LS_dist <- lapply(m$cover$level_sets, function(ls){ dist(X[ls$points_in_level_set,], method = "euclidean") }) + LS_dist <- lapply(cover$level_sets, function(ls){ + if ("dist" %in% class(X)){ + dist_subset(dist = X, idx = ls$points_in_level_set) + } else { dist(X[ls$points_in_level_set,], method = "euclidean") } + }) + + ## Initialize the graph as an empty list + G <<- list() + + ## Iterate through the 'dist' objects stored for each level set, perform the clustering + cl_res <- lapply(LS_dist, function(ls_dist) { + if (length(ls_dist) > 0) clustering_algorithm(ls_dist, ...) + }) + + ## Precompute useful variables to know + n_nodes <- sum(sapply(cl_res, function(cl) length(unique(cl)))) + node_idx <- unlist(mapply(function(cl, ls_i) if (length(cl) > 0) paste0(ls_i, ".", unique(cl)), cl_res, 1:length(cl_res))) + n_lvlsets <- length(cover$level_sets) + + ## Agglomerate the nodes into a list. This matches up the original indexes of the filter values with the + ## the clustering results, such that each node stores the original filter index values as well as the + ## creating a correspondence between the node and it's corresponding level set flat index (lsfi) + ## TODO: Cleanup and either vectorize or send down to C++ + G$nodes <<- vector(mode = "list", length = n_nodes) + n_i <- 1L + for (lsfi in 1:n_lvlsets){ + cl_i <- cl_res[[lsfi]] + ## Extract the node point indices for each cluster + node_pt_idx <- lapply(unique(cl_i), function(cl_idx) cover$level_sets[[lsfi]]$points_in_level_set[which(cl_i == cl_idx)]) + for (node in node_pt_idx){ + attr(node, "level_set") <- lsfi + G$nodes[[n_i]] <<- node + n_i <- n_i + 1L + } + } +}) + +mapper_ref$methods(computeEdges = function(){ + if ("uninitializedField" %in% class(G)){ stop("Graph with nodes must be constructed before computing edges.") } + + ## Retrieve the level set flat indices (LSFI) for each corresponding node + node_lsfi <- sapply(G$nodes, function(node) attr(node, "level_set")) # which level set (by value) each node (by index) is in + + ## Create map from the level set flat index (by index) to the node indices the level set stores + ## Note in this map empty level sets are NULL + ls_node_map <- lapply(seq(length(cover$level_sets)), function(lvl_set_idx) { + node_indices <- which(node_lsfi == lvl_set_idx) + if (length(node_indices) == 0){ return(NULL) } else { return(node_indices) } + }) + + ## Retrieve the valid level set index pairs to compare. In the worst case, with no cover-specific optimization + ## or 1-skeleton assumption, this may just be all pairwise combinations of LSFI's for the full simplicial complex + ls_to_compare <- cover$valid_pairs() + + ## Let Rcpp handle the O(n^2) non-empty intersection checks + G$adjacency <<- adjacencyCpp(ls_pairs = ls_to_compare, nodes = G$nodes, ls_node_map = ls_node_map); +}) + + +mapper_ref$methods(plotNetwork = function(){ + agg_pt_fv <- sapply(G$nodes, function(n_idx){ apply(matrix(cover$filter_values[n_idx,], nrow = length(n_idx)), 1, mean)}) + agg_node_fv <- sapply(agg_pt_fv, mean) + exp_factor <- log(sapply(G$nodes, function(n_idx){ length(n_idx) })) + rbw_pal <- rev(rainbow(100, start = 0, end = 4/6)) + binned_idx <- cut(agg_node_fv, breaks = 100, labels = F) + base_net <- network::network.initialize(nrow(G$adjacency), directed = F) + plot(network::network.adjacency(x = G$adjacency, g = base_net), + vertex.col = rbw_pal[binned_idx], vertex.cex = exp_factor, + label = 1:length(G$nodes), displaylabels = TRUE) +}) + +mapper_ref$methods(computeGain = function(){ + if ("uninitializedField" %in% class(filter_values)){ stop("Filter values must be set prior to constructing a valid cover.") } + +}) + +## S3-like print override +mapper_ref$methods(show = function(){ + if ("dist" %in% class(X)){ n <- attr(X, "Size") } else { n <- nrow(X) } + if (!is.null(config$call)) + cat("\nCall:\n", deparse(config$call), "\n\n", sep = "") + message <- sprintf("Mapper construction for %d objects", n) + if (!"uninitializedField" %in% class(cover)){ message <- append(message, cover$summary()) } + if (!"uninitializedField" %in% class(clustering_algorithm)){ message <- append(message, cover$summary()) } + writeLines(message) +}) + + + +## Exports the internal mapper core structures to a TDAmapper output +mapper_ref$methods(exportTDAmapper = function(){ + level_of_vertex <- sapply(G$nodes, function(ni) attr(ni, "level_set")) + structure(list( adjacency = G$adjacency, + num_vertices = length(G$nodes), + level_of_vertex = level_of_vertex, + points_in_vertex = lapply(G$nodes, as.vector), + points_in_level = unname(lapply(cover$level_sets, function(ls) ls$points_in_level_set)), + vertices_in_level = lapply(1:length(cover$level_sets), function(ls_idx) { + tmp <- which(level_of_vertex == ls_idx) + if (length(tmp) == 0){ return(-1) } + else return(tmp) + })), + class = "TDAmapper") +}) + +mapper_ref$methods(getLevel = function(){ + +}) +mapper_ref$methods(computeGraph = function(){ + +}) \ No newline at end of file diff --git a/R/mapper_new.R b/R/mapper_new.R new file mode 100644 index 0000000..9c9147f --- /dev/null +++ b/R/mapper_new.R @@ -0,0 +1,86 @@ +#' mapper_new function +#' +#' This function uses a filter function f: X -> R on a data set X that has n rows (observations) and k columns (variables). +#' +#' @param X Either a data matrix or an object of type 'dist'. +#' @param filter_values An n x m matrix, n-length vector real numbers, or an m-length list each contain n real numbers. +#' @param num_intervals Either a positive integer or a vector of m positive integers specifying the number of intervals to cover the filter values with. +#' @param percent_overlap Either a number or m-length vector of values between 0 and 100 specifying how much adjacent intervals should overlap. +#' @param num_bins_when_clustering A positive integer that controls whether points in the same level set end up in the same cluster. +#' +#' @return An object of class \code{TDAmapper} which is a list of items named \code{adjacency} (adjacency matrix for the edges), \code{num_vertices} (integer number of vertices), \code{level_of_vertex} (vector with \code{level_of_vertex[i]} = index of the level set for vertex i), \code{points_in_vertex} (list with \code{points_in_vertex[[i]]} = vector of indices of points in vertex i), \code{points_in_level} (list with \code{points_in_level[[i]]} = vector of indices of points in level set i, and \code{vertices_in_level} (list with \code{vertices_in_level[[i]]} = vector of indices of vertices in level set i. +#' +#' @author Paul Pearson, \email{pearsonp@@hope.edu}, Matt Piekenbrock, \email{piekenbrock.5@@wright.edu} +#' @references \url{https://github.com/paultpearson/TDAmapper} +#' +#' @examples +#' m1 <- mapper_new( +#' dist_object = dist(data.frame( x=2*cos(0.5*(1:100)), y=sin(1:100) )), +#' filter_values = 2*cos(0.5*(1:100)), +#' num_intervals = 10, +#' percent_overlap = 50, +#' num_bins_when_clustering = 10) +#' \dontrun{ +#' #install.packages("igraph") +#' library(igraph) +#' g1 <- graph.adjacency(m1$adjacency, mode="undirected") +#' plot(g1, layout = layout.auto(g1) ) +#' } +#' @export + +mapper_new <- function(X, filter_values, num_intervals, percent_overlap, num_bins_when_clustering, + return_reference = FALSE, ...) { + + ### Check to see if 'dist_object' was passed in using old API call. + extra <- list(...) + args <- c("dist_object", "distance_matrix") + m <- pmatch(names(extra), args) + if(any(is.na(m))) { stop("Unknown parameter: ", paste(names(extra)[is.na(m)], collapse = ", ")) } + names(extra) <- args[m] + + if(!is.null(extra$dist_object)) { + warning("mapper accepts dist objects through the 'X' parameter. Setting X equal to the value of the argument passed via 'dist_object.'") + X <- extra$dist_object + } + if(!is.null(extra$distance_matrix)) { + warning("mapper accepts dist objects through the 'X' parameter. Setting X equal to the value of the argument passed via 'distance_matrix'") + X <- extra$distance_matrix + } + + ## SETUP + # Constructs a MapperRef instance, a mutable R5 class for generating the Mapper construction + m <- mapper_ref$new(X = X) + + ## BEGIN COVER + # The level set flat index (lsfi) is implicitly represented by the position of the level set in the + # level sets list (cover$level_sets). The name of each level set corresponds with its level set multi index (lsmi), + # and thus can be used alternatively as a key into the level sets map. + # (The looping construct and point queries have been optimized in 'setCover') + if (is.list(filter_values)){ filter_values <- do.call(cbind, filter_values) } + m$setCover(fv = filter_values, k = num_intervals, g = percent_overlap/100) + ## END COVER + + ## BEGIN CLUSTER + # Through the reference class, other clustering algorithms can be used. At this high-level API call, make suitable assumptions that + # single-linkage w/ bin-based cutting heuristic is preferred. + m$setClusteringAlgorithm(cl = "single") + ## END CLUSTER + + ## BEGIN VERTEX CONSTRUCTION + # Actually performs the clustering and vertex creation. Note that if a 'dist' object was passed in via X, then it + # will be subset based on the point indices within each level set. If X is a point cloud, then the (euclidean) dist + # object is computed locally within each level set. If the number of intervals is high enough and the data set is + # sufficiently large, this should result in large improvements in efficiency. + m$computeNodes(num_bins_when_clustering = num_bins_when_clustering) + ## END VERTEX CONSTRUCTION + + ## BEGIN SIMPLICIAL COMPLEX + # At this high-level interface call, only the 1-skeleton (graph) is computed within 'G.' + m$computeEdges() + ## END SIMPLICIAL COMPLEX + + ## Convert to 'TDAmapper' object or, if the reference class is wanted, return that + mapperoutput <- if (return_reference) m else m$exportTDAmapper() + return(mapperoutput) +} # end mapper function + diff --git a/man/Cover-class.Rd b/man/Cover-class.Rd new file mode 100644 index 0000000..3f76a33 --- /dev/null +++ b/man/Cover-class.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cover.R +\docType{class} +\name{Cover-class} +\alias{Cover-class} +\alias{cover_ref} +\title{Reference Class (R5) implementation of Mapper for the TDAmapper package in R +Cover reference class +filter_values := Filter values +num_intervals := vector of number of bins to cover the Z with (per dimension) +percent_overlap := vector of overlap percentages +level_sets := list of level_sets (including min/max bounds)} +\description{ +Reference Class (R5) implementation of Mapper for the TDAmapper package in R +Cover reference class +filter_values := Filter values +num_intervals := vector of number of bins to cover the Z with (per dimension) +percent_overlap := vector of overlap percentages +level_sets := list of level_sets (including min/max bounds) +} + diff --git a/man/MapperRef-class.Rd b/man/MapperRef-class.Rd new file mode 100644 index 0000000..4576528 --- /dev/null +++ b/man/MapperRef-class.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mapperRef.R +\docType{class} +\name{MapperRef-class} +\alias{MapperRef-class} +\alias{mapper_ref} +\title{Reference Class (R5) implementation of Mapper for the TDAmapper package in R +Composes a set of classes to perform Mapper efficiently. +Author: Matt Piekenbrock} +\description{ +Reference Class (R5) implementation of Mapper for the TDAmapper package in R +Composes a set of classes to perform Mapper efficiently. +Author: Matt Piekenbrock +} + diff --git a/man/cluster_cutoff_at_first_empty_bin.Rd b/man/cluster_cutoff_at_first_empty_bin.Rd index 202070f..c9bdc1d 100644 --- a/man/cluster_cutoff_at_first_empty_bin.Rd +++ b/man/cluster_cutoff_at_first_empty_bin.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/cluster_cutoff_at_first_empty_bin.R \name{cluster_cutoff_at_first_empty_bin} \alias{cluster_cutoff_at_first_empty_bin} @@ -19,13 +19,12 @@ Numerical value for cutoff point of hierarchical cluster diagram. \description{ This function decides where to cut the hierarchical clustering tree to define clusters within a level set. } -\author{ -Paul Pearson, \email{pearsonp@hope.edu} -} \references{ \url{https://github.com/paultpearson/TDAmapper} } \seealso{ \code{\link{mapper1D}}, \code{\link{mapper2D}} } - +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} diff --git a/man/mapper.Rd b/man/mapper.Rd index d30bc68..f213f7a 100644 --- a/man/mapper.Rd +++ b/man/mapper.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mapper.R \name{mapper} \alias{mapper} @@ -34,14 +34,12 @@ m1 <- mapper( percent_overlap = c(50,50), num_bins_when_clustering = 10) \dontrun{ -#install.packages("igraph") +#install.packages("igraph") library(igraph) g1 <- graph.adjacency(m1$adjacency, mode="undirected") plot(g1, layout = layout.auto(g1) ) } -} -\author{ -Paul Pearson, \email{pearsonp@hope.edu} + } \references{ \url{https://github.com/paultpearson/TDAmapper} @@ -49,5 +47,7 @@ Paul Pearson, \email{pearsonp@hope.edu} \seealso{ \code{\link{mapper1D}}, \code{\link{mapper2D}} } +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} \keyword{mapper} - diff --git a/man/mapper1D.Rd b/man/mapper1D.Rd index f3ad666..05989b8 100644 --- a/man/mapper1D.Rd +++ b/man/mapper1D.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mapper1D.R \name{mapper1D} \alias{mapper1D} @@ -33,20 +33,19 @@ m1 <- mapper1D( percent_overlap = 50, num_bins_when_clustering = 10) \dontrun{ -#install.packages("igraph") +#install.packages("igraph") library(igraph) g1 <- graph.adjacency(m1$adjacency, mode="undirected") plot(g1, layout = layout.auto(g1) ) } } -\author{ -Paul Pearson, \email{pearsonp@hope.edu} -} \references{ \url{https://github.com/paultpearson/TDAmapper} } \seealso{ \code{\link{mapper2D}} } +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} \keyword{mapper1D} - diff --git a/man/mapper2D.Rd b/man/mapper2D.Rd index c2a16e4..c65f224 100644 --- a/man/mapper2D.Rd +++ b/man/mapper2D.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mapper2D.R \name{mapper2D} \alias{mapper2D} @@ -39,14 +39,13 @@ g2 <- graph.adjacency(m2$adjacency, mode="undirected") plot(g2, layout = layout.auto(g2) ) } } -\author{ -Paul Pearson, \email{pearsonp@hope.edu} -} \references{ \url{https://github.com/paultpearson/TDAmapper} } \seealso{ \code{\link{mapper1D}} } +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} \keyword{mapper2D} - diff --git a/man/mapperEdges.Rd b/man/mapperEdges.Rd index 997a991..faf35a8 100644 --- a/man/mapperEdges.Rd +++ b/man/mapperEdges.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mapperEdges.R \name{mapperEdges} \alias{mapperEdges} @@ -14,7 +14,7 @@ A data frame describing the edges in the graph of the mapper output. } \description{ The input to this function is a TDAmapper class object and the output -is a data frame of edges that can be used as input to the networkD3 +is a data frame of edges that can be used as input to the networkD3 plot utility. } \examples{ @@ -27,27 +27,26 @@ m1 <- mapper( num_intervals = 10, percent_overlap = 50, num_bins_when_clustering = 10) - + pt_labels <- 1:length(f) vertices <- mapperVertices(m1, pt_labels) edges <- mapperEdges(m1) # interactive plot -forceNetwork(Nodes = nodes, Links = links, +forceNetwork(Nodes = nodes, Links = links, Source = "Linksource", Target = "Linktarget", Value = "Linkvalue", NodeID = "Nodename", - Group = "Nodegroup", opacity = 0.8, + Group = "Nodegroup", opacity = 0.8, linkDistance = 10, charge = -400) } } -\author{ -Paul Pearson, \email{pearsonp@hope.edu} -} \references{ \url{https://github.com/paultpearson/TDAmapper} } \seealso{ \code{\link{mapperVertices}} } +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} \keyword{mapperEdges} - diff --git a/man/mapperVertices.Rd b/man/mapperVertices.Rd index 23e63ee..043c538 100644 --- a/man/mapperVertices.Rd +++ b/man/mapperVertices.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/mapperVertices.R \name{mapperVertices} \alias{mapperVertices} @@ -7,17 +7,17 @@ mapperVertices(m, pt_labels) } \arguments{ -\item{m}{An object of class TDAmapper that is the output of the mapper +\item{m}{An object of class TDAmapper that is the output of the mapper function.} } \value{ -A data frame describing the vertices in the graph of the mapper -output and the point labels that will be displayed when the mouse +A data frame describing the vertices in the graph of the mapper +output and the point labels that will be displayed when the mouse hovers over a vertex in the graph. } \description{ The input to this function is a TDAmapper class object and the output -is a data frame of vertices that can be used as input to the networkD3 +is a data frame of vertices that can be used as input to the networkD3 plot utility. } \examples{ @@ -30,27 +30,26 @@ m1 <- mapper( num_intervals = 10, percent_overlap = 50, num_bins_when_clustering = 10) - + pt_labels <- 1:length(f) vertices <- mapperVertices(m1, pt_labels) edges <- mapperEdges(m1) # interactive plot -forceNetwork(Nodes = nodes, Links = links, +forceNetwork(Nodes = nodes, Links = links, Source = "Linksource", Target = "Linktarget", Value = "Linkvalue", NodeID = "Nodename", - Group = "Nodegroup", opacity = 0.8, + Group = "Nodegroup", opacity = 0.8, linkDistance = 10, charge = -400) } } -\author{ -Paul Pearson, \email{pearsonp@hope.edu} -} \references{ \url{https://github.com/paultpearson/TDAmapper} } \seealso{ \code{\link{mapperEdges}} } +\author{ +Paul Pearson, \email{pearsonp@hope.edu} +} \keyword{mapperVertices} - diff --git a/man/mapper_new.Rd b/man/mapper_new.Rd new file mode 100644 index 0000000..8acdf39 --- /dev/null +++ b/man/mapper_new.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mapper_new.R +\name{mapper_new} +\alias{mapper_new} +\title{mapper_new function} +\usage{ +mapper_new(X, filter_values, num_intervals, percent_overlap, + num_bins_when_clustering, ...) +} +\arguments{ +\item{X}{Either a data matrix or an object of type 'dist'.} + +\item{filter_values}{An n x m matrix or n-length vector real numbers.} + +\item{num_intervals}{Either a positive integer or a vector of m positive integers specifying the number of intervals to cover the filter values with.} + +\item{percent_overlap}{Either a number or m-length vector of values between 0 and 100 specifying how much adjacent intervals should overlap.} + +\item{num_bins_when_clustering}{A positive integer that controls whether points in the same level set end up in the same cluster.} +} +\value{ +An object of class \code{TDAmapper} which is a list of items named \code{adjacency} (adjacency matrix for the edges), \code{num_vertices} (integer number of vertices), \code{level_of_vertex} (vector with \code{level_of_vertex[i]} = index of the level set for vertex i), \code{points_in_vertex} (list with \code{points_in_vertex[[i]]} = vector of indices of points in vertex i), \code{points_in_level} (list with \code{points_in_level[[i]]} = vector of indices of points in level set i, and \code{vertices_in_level} (list with \code{vertices_in_level[[i]]} = vector of indices of vertices in level set i. +} +\description{ +This function uses a filter function f: X -> R on a data set X that has n rows (observations) and k columns (variables). +} +\examples{ +m1 <- mapper_new( + dist_object = dist(data.frame( x=2*cos(0.5*(1:100)), y=sin(1:100) )), + filter_values = 2*cos(0.5*(1:100)), + num_intervals = 10, + percent_overlap = 50, + num_bins_when_clustering = 10) +\dontrun{ +#install.packages("igraph") +library(igraph) +g1 <- graph.adjacency(m1$adjacency, mode="undirected") +plot(g1, layout = layout.auto(g1) ) +} +} +\references{ +\url{https://github.com/paultpearson/TDAmapper} +} +\author{ +Paul Pearson, \email{pearsonp@hope.edu}, Matt Piekenbrock, \email{piekenbrock.5@wright.edu} +} diff --git a/src/.gitignore b/src/.gitignore new file mode 100644 index 0000000..22034c4 --- /dev/null +++ b/src/.gitignore @@ -0,0 +1,3 @@ +*.o +*.so +*.dll diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp new file mode 100644 index 0000000..be453f8 --- /dev/null +++ b/src/RcppExports.cpp @@ -0,0 +1,59 @@ +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +// adjacencyCpp +IntegerMatrix adjacencyCpp(const IntegerMatrix& ls_pairs, const List& nodes, const List& ls_node_map); +RcppExport SEXP _TDAmapper_adjacencyCpp(SEXP ls_pairsSEXP, SEXP nodesSEXP, SEXP ls_node_mapSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const IntegerMatrix& >::type ls_pairs(ls_pairsSEXP); + Rcpp::traits::input_parameter< const List& >::type nodes(nodesSEXP); + Rcpp::traits::input_parameter< const List& >::type ls_node_map(ls_node_mapSEXP); + rcpp_result_gen = Rcpp::wrap(adjacencyCpp(ls_pairs, nodes, ls_node_map)); + return rcpp_result_gen; +END_RCPP +} +// constructLevelSets +List constructLevelSets(const NumericMatrix& filter_values, const IntegerMatrix& index_set, const NumericVector& interval_length, const NumericVector& step_size, const NumericVector& filter_min); +RcppExport SEXP _TDAmapper_constructLevelSets(SEXP filter_valuesSEXP, SEXP index_setSEXP, SEXP interval_lengthSEXP, SEXP step_sizeSEXP, SEXP filter_minSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericMatrix& >::type filter_values(filter_valuesSEXP); + Rcpp::traits::input_parameter< const IntegerMatrix& >::type index_set(index_setSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type interval_length(interval_lengthSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type step_size(step_sizeSEXP); + Rcpp::traits::input_parameter< const NumericVector& >::type filter_min(filter_minSEXP); + rcpp_result_gen = Rcpp::wrap(constructLevelSets(filter_values, index_set, interval_length, step_size, filter_min)); + return rcpp_result_gen; +END_RCPP +} +// dist_subset +NumericVector dist_subset(const NumericVector& dist, IntegerVector idx); +RcppExport SEXP _TDAmapper_dist_subset(SEXP distSEXP, SEXP idxSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const NumericVector& >::type dist(distSEXP); + Rcpp::traits::input_parameter< IntegerVector >::type idx(idxSEXP); + rcpp_result_gen = Rcpp::wrap(dist_subset(dist, idx)); + return rcpp_result_gen; +END_RCPP +} + +static const R_CallMethodDef CallEntries[] = { + {"_TDAmapper_adjacencyCpp", (DL_FUNC) &_TDAmapper_adjacencyCpp, 3}, + {"_TDAmapper_constructLevelSets", (DL_FUNC) &_TDAmapper_constructLevelSets, 5}, + {"_TDAmapper_dist_subset", (DL_FUNC) &_TDAmapper_dist_subset, 2}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_TDAmapper(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +} diff --git a/src/adjacency.cpp b/src/adjacency.cpp new file mode 100644 index 0000000..a6d31db --- /dev/null +++ b/src/adjacency.cpp @@ -0,0 +1,59 @@ +#include +using namespace Rcpp; + + +// Fast partial-sort/binary-search check to see if the intersection between two given vectors has non-zero length +// Loosely based off of bugged version found here: https://stackoverflow.com/questions/21359432/a-c-version-of-the-in-operator-in-r +bool any_is_in(const IntegerVector& x, const IntegerVector& y){ + std::vector y_sort(y.size()); + std::partial_sort_copy (y.begin(), y.end(), y_sort.begin(), y_sort.end()); // partial-sorted elements of y copied to y_sort + const int nx = x.size(); + for (int i = 0; i < nx; ++i) { + if (std::binary_search(y_sort.begin(), y_sort.end(), x[i])) { + return(true); // end the search + } + } + return(false); +} + +// adjacencyCpp: Creates an adjacency matrix forming edges between nodes w/ non-empty intersections +// Parameters: +// ls_pairs := Integer Matrix of level index pairs to consider +// nodes := List of nodes (each element of which is a vector containing the point indices contained in the node) +// node_map := List where each index corresponds to the ordered level set flat indices, and each element the indices of the nodes in that level set +// [[Rcpp::export]] +IntegerMatrix adjacencyCpp(const IntegerMatrix& ls_pairs, const List& nodes, const List& ls_node_map) { + int n = nodes.length(); + IntegerMatrix adj_mat(n, n); + for (int i = 0; i < ls_pairs.nrow(); ++i){ + + // Get the current pair of level sets to compare; skip if either are empty + const int ls_1 = ls_pairs(i, 0), ls_2 = ls_pairs(i, 1); + if ( Rf_isNull(ls_node_map.at(ls_1 - 1)) || Rf_isNull(ls_node_map.at(ls_2 - 1))){ + continue; + } + const IntegerVector& nodes1 = ls_node_map.at(ls_1 - 1); + const IntegerVector& nodes2 = ls_node_map.at(ls_2 - 1); + + // Compare the nodes within each level set + for (IntegerVector::const_iterator n1 = nodes1.begin(); n1 != nodes1.end(); ++n1){ + for (IntegerVector::const_iterator n2 = nodes2.begin(); n2 != nodes2.end(); ++n2){ + // Retrieve node indices + const IntegerVector& n1_idx = nodes[*n1 - 1]; + const IntegerVector& n2_idx = nodes[*n2 - 1]; + // Add edge between the two if they share a data point + bool intersect_check = any_is_in(n1_idx, n2_idx); + if (intersect_check){ + adj_mat(*n1 - 1, *n2 - 1) = 1; + adj_mat(*n2 - 1, *n1 - 1) = 1; + } + } + } + } + return adj_mat; +} + +/*** R +# load("test.rdata") +# adjacencyCpp(ls_pairs = test$ls_pairs, nodes = test$nodes, ls_node_map = test$ls_node_map) +*/ diff --git a/src/cover.cpp b/src/cover.cpp new file mode 100644 index 0000000..23dfca6 --- /dev/null +++ b/src/cover.cpp @@ -0,0 +1,47 @@ +#include +using namespace Rcpp; + +// Given a logical vector, returns an integer vector (1-based) of the positions in the vector which are true +IntegerVector which_true( LogicalVector x) { + int nx = x.size(); + std::vector y; + y.reserve(nx); + for(int i = 0; i < nx; i++) { if (x[i]) y.push_back(i+1); } + return wrap(y); +} + +// [[Rcpp::export]] +List constructLevelSets(const NumericMatrix& filter_values, const IntegerMatrix& index_set, const NumericVector& interval_length, const NumericVector& step_size, const NumericVector& filter_min) { + const int n = index_set.nrow(); + const int d = index_set.ncol(); + + NumericMatrix ls_bnds = NumericMatrix(2, d); + List level_sets = List(n); + LogicalVector level_set_test = LogicalVector(filter_values.nrow(), true); + for (int i = 0; i < n; ++i){ + IntegerVector ls_idx = index_set(i, _) - 1; + NumericVector level_set_min = filter_min + (as(ls_idx) * step_size); + ls_bnds(0, _) = level_set_min; + ls_bnds(1, _) = level_set_min + interval_length; + + // Reset to all true prior to doing logical range checks + std::fill(level_set_test.begin(), level_set_test.end(), true); + for (int d_i = 0; d_i < d; ++d_i){ + level_set_test = level_set_test & ((filter_values.column(d_i) >= ls_bnds(0, d_i)) & (filter_values.column(d_i) <= ls_bnds(1, d_i))); + } + // Don't explicitly need to save the bounds, but they may useful later + level_sets[i] = List::create(_["points_in_level_set"] = which_true(level_set_test)); //, _["bounds"] = ls_bnds); + } + return(level_sets); +} + + +/*** R +# load("test.rdata") +# fv <- as.matrix(test$filter_values) +# is <- as.matrix(test$index_set) +# il <- as.numeric(test$interval_length) +# ss <- as.numeric(test$step_size) +# fmin <- as.numeric(test$filter_min) +# constructLevelSets(filter_values = fv, index_set = is, interval_length = il, step_size = ss, filter_min = fmin) +*/ diff --git a/src/dist_utilities.cpp b/src/dist_utilities.cpp new file mode 100644 index 0000000..6a15b1f --- /dev/null +++ b/src/dist_utilities.cpp @@ -0,0 +1,24 @@ +#include +using namespace Rcpp; + +#define INDEX_TF(N,to,from) (N)*(to) - (to)*(to+1)/2 + (from) - (to) - (1) + +// Given a 'dist' object (vector) and an integer vector of indices (1-based), create a new dist object +// represented by the subset of 'dist' indexed by 'idx' +// [[Rcpp::export]] +NumericVector dist_subset(const NumericVector& dist, IntegerVector idx){ + const int n = dist.attr("Size"); + const int cl_n = idx.length(); + NumericVector new_dist = NumericVector((cl_n * (cl_n - 1))/2); + int ii = 0; + for (IntegerVector::iterator i = idx.begin(); i != idx.end(); ++i){ + for (IntegerVector::iterator j = i; j != idx.end(); ++j){ + if (*i == *j) { continue; } + const int ij_idx = INDEX_TF(n, (*i < *j ? *i : *j) - 1, (*i < *j ? *j : *i) - 1); + new_dist[ii++] = dist[ij_idx]; + } + } + new_dist.attr("Size") = cl_n; + new_dist.attr("class") = "dist"; + return(new_dist); +} diff --git a/tests/testthat.R b/tests/testthat.R new file mode 100644 index 0000000..02d06e6 --- /dev/null +++ b/tests/testthat.R @@ -0,0 +1,4 @@ +library("testthat") + +library("TDAmapper") +test_check("TDAmapper") diff --git a/tests/testthat/TDAmapper_comparison.R b/tests/testthat/TDAmapper_comparison.R new file mode 100644 index 0000000..8ff56f4 --- /dev/null +++ b/tests/testthat/TDAmapper_comparison.R @@ -0,0 +1,10 @@ +m <- TDAmapper:::mapper_ref$new(X = dist(X)) +m$setCover(fv = f[,1], k = 5L, g = 0.50) +m$setClusteringAlgorithm(cl = "single") +m$computeNodes(num_bins_when_clustering = 5L) +m$computeEdges() +m$plotNetwork() + +m_orig <- mapper1D(dist(X), f[,1], 5, 50, 5) +m_new <- m$exportTDAmapper() +all.equal(m_orig, m_new) \ No newline at end of file From 472aaea529ba34da40da119813d79e2627af9ece Mon Sep 17 00:00:00 2001 From: peekxc Date: Sun, 4 Feb 2018 14:48:35 -0500 Subject: [PATCH 2/2] Pull request notes --- mapper_ex.Rmd | 238 ++++++++++++++++++++++++++++++ mapper_ex.html | 383 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 621 insertions(+) create mode 100644 mapper_ex.Rmd create mode 100644 mapper_ex.html diff --git a/mapper_ex.Rmd b/mapper_ex.Rmd new file mode 100644 index 0000000..8bf9dfa --- /dev/null +++ b/mapper_ex.Rmd @@ -0,0 +1,238 @@ +--- +title: 'TDAmapper: Additions' +author: "Matt Piekenbrock" +output: + html_document: + df_print: paged +--- + +## Overview of Pull Request + +The [Python Mapper library by Dan Mullner](http://danifold.net/mapper/) is great, and takes a traditional Pythonic approach to implementing Mapper, i.e. the OOP approach. OOP is divided in R, as most of R is focused on functional-style programming, and in many ways is quite different from Python. Nonetheless, OOP provides some very real benefits in terms of extensibility and abstraction: in the context of the Python Mapper library, inheritance and polymorphism allows the library the flexibility of configuring how the Mapper construction is generated, including the types of covers to use, the clustering algorithm to use, etc. Additionally, having a Mapper object that can be updated by reference (via heavily coupled OOP programming strategies) enables increases in performance and efficiency of computation. + +There are [several ways of doing OOP in R](https://stackoverflow.com/questions/9521651/r-and-object-oriented-programming); the community is quite divided, adn their popularity, benefits, and use-cases vary quite a bit. I rewrote the Mapper pipeline with [R Reference Classes (R5)](https://stat.ethz.ch/R-manual/R-devel/library/methods/html/refClass.html) since they're quite easy and elegant to use. + +That being said, R users (myself included) are used to functional-style programming, and I don't think it's a good idea to go away from that, especially considering the simplicity of functional-style programming. So instead, I added the `mapper_new` (temporary) function as an illustration of how the TDAmapper package can benefit from both styles. Basically, the new version of this package offers (still) functional wrapper functions to doing high-level calls to generating the Mapper constructions (which maintain backwards compatibility), but internally they make use of reference classes. This allows more advanced users which may want to do more with Mapper the option of working with, changing, and contributing augmentations to the Mapper pipeline (like myself) without having to do things like copy and pasting exorbitant lines of code. + +## Examples using the reference class + +Below is an example of using Mapper to describe a noisy circle, as from Example 3.2 in the original paper by Singh, Carlsson, et. al. +```{r} +library("TDAmapper") +## X in this case is 2-d noisy points around the perimeter of a circle +n <- 1000 +t <- 2*pi*runif(n); u <- runif(n, min = 2, max = 2.1) + runif(n, min = 2, max = 2.1) +r <- ifelse(u > 1, 2 - u, u) +X <- cbind(r*cos(t), r*sin(t)) + +## The filter function is simply the distance from each point to the left-most point in the x-dimension +min_idx <- which.min(X[, 1]) +f_x <- abs(X[, 1] - X[min_idx, 1]) +``` + +```{r} +## Plot the circle +rbw_pal <- rev(rainbow(100, start = 0, end = 4/6)) +f_x <- abs(X[, 1] - X[min_idx, 1]) +binned_idx <- cut(f_x, breaks = 100, labels = F) +plot(X, pch = 20, asp = 1, col = rbw_pal[binned_idx]) +``` + + +Using the new internal Mapper reference class, the Mapper algorithm can be configured at each step. This abstraction allows more customization i.e. a diffferent clustering algorithm may be used, a different type of cover may be constructed, the full simplicial complex may be computed, etc. + +```{r} +Z <- array(f_x, dim = c(length(f_x), 1L)) ## filter values +m <- TDAmapper:::mapper_ref$new(X = X) ## construct the instance +m$setCover(fv = Z, k = 5L, g = 0.2) ## k == number of intervals, g == overlap; only rectangular cover supported for now +m$setClusteringAlgorithm(cl = "single") # use single-linkage; can also use other linkages, or supply your own clustering function +m$computeNodes(num_bins_when_clustering = 10) # parameters to clustering algorithm are passed to computeNodes +m$computeEdges() # in future will be able to choose between 1-skeleton and full simplicial complex +``` + +Internally, various libraries may be used for nice plotting defaults. Here's an example which mimicks Figure 1 of the original paper using the `network` package. Node sizes are weighted logarithmically in relation to the number points in each node, and the color is chosen based on a linear scale between red and blue based on the average set of filter values for all the points in each node. +```{r} +m$plotNetwork() +``` + +The above code used the point cloud data (X) directly (assumed euclidean distance) in the clustering process. An arbitrary 'dist' object may also be used. + +```{r} +m2 <- TDAmapper:::mapper_ref$new(X = dist(X)) ## construct the instance +m2$setCover(fv = Z, k = 5L, g = 0.2) ## k == number of intervals, g == overlap; only rectangular cover supported for now +m2$setClusteringAlgorithm(cl = "single") # use single-linkage; can also use other linkages, or supply your own clustering function +m2$computeNodes(num_bins_when_clustering = 10) # parameters to clustering algorithm are passed to computeNodes +m2$computeEdges() # in future will be able to choose between 1-skeleton and full simplicial complex +m2$plotNetwork() +``` + +The same structure produced with the mapper1D function: +```{r} +library("igraph") +m_orig <- TDAmapper::mapper1D(distance_matrix = dist(X), + filter_values = Z, + num_intervals = 5L, + percent_overlap = 20, + num_bins_when_clustering = 10) +g1 <- graph.adjacency(m_orig$adjacency, mode="undirected") +plot(g1, layout = layout.auto(g1) ) +``` + +## TDAmapper equivalency +Rather than use the reference class directly, instead I wrapped the above procedure into a functinal wrapper `mapper_new` function to replace the current `mapper` call. It's just named `mapper_new` right now to illustrate the differences. Compared to the `mapper1D` and `mapper2D` functions, the new API produces the same construction. + +Consider the previous 1-D example with the new function, using the exact same named parameter settings. The new function as documented uses 'X' instead of 'distance_matrix' for input, although it is backwards compatible (w/ a warning. +```{r} +m_new <- TDAmapper::mapper_new(distance_matrix = dist(X), + filter_values = Z, + num_intervals = 5L, + percent_overlap = 20, + num_bins_when_clustering = 10) +``` + +Of course, they are equivalent in every aspect. +```{r} +all.equal(m_new, m_orig) +``` + +The same applies to the Mapper 2D function. Here's the 2-D example from the github using an oval. +```{r} +X <- dist(data.frame( x=2*cos(1:100), y=sin(1:100) )) +Z <- as.matrix(data.frame(x=2*cos(1:100), y=sin(1:100))) + +m2_orig <- mapper2D(distance_matrix = X, + filter_values = list( Z[,1], Z[,2] ), + num_intervals = c(5,5), + percent_overlap = 50, + num_bins_when_clustering = 10) +m2_new <- mapper_new(X = X, + filter_values = list( Z[,1], Z[,2] ), + num_intervals = c(5,5), + percent_overlap = 50, + num_bins_when_clustering = 10) +all.equal(m2_orig, m2_new) +``` + +## Benefits: Extensibility and Abstraction + +### Performance Comparison +Consider example 2 from the github, identifying an oval. +```{r} +n <- 1000L + +## Example 2: Identifying oval +X <- data.frame(x=2*cos(1:n), y=sin(1:n)) +Z <- as.matrix(data.frame(x=2*cos(1:n), y=sin(1:n))) +dis_X <- dist(X) + +## Original 2D version +original <- function(){ + mapper2D(distance_matrix = dis_X, + filter_values = list( Z[,1], Z[,2] ), + num_intervals = c(5,5), + percent_overlap = 50, + num_bins_when_clustering = 10) +} + +## New version (w/ same parameters) +new <- function(...){ + mapper_new(X = dis_X, filter_values = Z, num_intervals = c(5,5), percent_overlap = 50, num_bins_when_clustering = 10, ...) +} + +## There are benefits to not computing all pairwise distances +x_mat <- as.matrix(X) +new_wo_dis <- function(...){ + mapper_new(X = x_mat, filter_values = Z, num_intervals = c(5,5), percent_overlap = 50, num_bins_when_clustering = 10, ...) +} + +``` + +The construction of the level sets in the cover and the construction of the edges (including the non-empty intersection check) have been rewritten and optimized in C++. As a result, using the same parameters betwene the two functions, the performance is quite different. +```{r} +print(microbenchmark::microbenchmark(original(), times = 15L)) +print(microbenchmark::microbenchmark(new(), times = 15L)) +print(microbenchmark::microbenchmark(new_wo_dis(), times = 15L)) +``` +The Rcpp/C++ improvements to the computationally difficult portions Mapper (mostly related to the cover) result in roughly an order of magnitude faster performance in this case. The C++ backend also make TDAmapper is also _scalable_. Consider the exact same problem, but with a higher resolution / more level sets. + +```{r} +## Original 2D version +original <- function(){ + mapper2D(distance_matrix = dis_X, + filter_values = list( Z[,1], Z[,2] ), + num_intervals = c(10,10), + percent_overlap = 50, + num_bins_when_clustering = 10) +} + +## New version +new <- function(...){ + mapper_new(X = dis_X, filter_values = Z, num_intervals = c(10,10), percent_overlap = 50, num_bins_when_clustering = 10, ...) +} +``` + +They seem to scale similarly. In this case, the new backend returns the result at roughly 43x time the speed. + +```{r} +print(microbenchmark::microbenchmark(original(), times = 15L)) +print(microbenchmark::microbenchmark(new(), times = 15L)) +``` + +## Differences +### Between Mapper2D and mapper_new +For large enough data size, there's a small discrepancy in the edge formation of the Mapper 2D code. I think it's in the way the looping structure is performed. Consider the following: +```{r} +## Slight discrepancy +t1 <- original(); t2 <- new() +all.equal(original(), new()) ## Everything is same except for adjacency: ""Component adjacency" : Mean absolute difference: 1" +which(t1$adjacency[1, ] == 1) ## 2 11 +which(t2$adjacency[1, ] == 1) ## 2 11 12 +any(t1$points_in_vertex[[2]] %in% t1$points_in_vertex[[11]]) ## TRUE - Theres should be edge between nodes 2 and 11 +any(t1$points_in_vertex[[2]] %in% t1$points_in_vertex[[12]]) ## TRUE - Theres should also be edge between nodes 2 and 12 +``` + +There should be a an edge between nodes 2 and 11, and 2 and 12, as they have non-empty intersections. + +### Between mapper and mapper_new +The $n$-dimensional experimental version of `mapper` produces different results that the new version. It seems that it handles the interval length computation is handled a bit differently in `mapper` than in the Python Mapper library. + +The `mapper_new` function handles 3-space, or $n$-space, in the covering in the same way it handles the 1-d or 2-d versions. The new code was written carefully such that it is actually the same for every dimension; that is, `mapper_new` may be called for any dimension filter space, without the need for multiple loops, dimension-specific conditions, etc. + +Consider the trefoil knot used in Example 5 from the repository. +```{r} +# parametrize a trefoil knot +n <- 100 +t <- 2*pi*(1:n)/n +X <- data.frame(x = sin(t)+2*sin(2*t), + y = cos(t)-2*cos(2*t), + z = -sin(3*t)) +f <- X +m4 <- mapper(dist(X), f[,1], 5, 50, 5) +g4 <- graph.adjacency(m4$adjacency, mode="undirected") +plot(g4, layout = layout.auto(g4) ) +``` + +The `return_reference` parameter allows the user to get access to the internal reference class used to construct the Mapper instance. +```{r} +m4_new <- mapper_new(dist(X), f[,1], 5, 50, 5, return_reference = TRUE) +``` + +You can always get back to the original `TDAmapper` class via exporting +```{r} +m4_new_tdamapper <- m4_new$exportTDAmapper() +``` + +Of course, the plan is to enable multiple different modes of exporting the internal structure information in the future, whether it be via the igraph package, network package, networkD3 package, html widgets, etc. + +Again, the cover is constructed differently in the `mapper` function compared to the `mapper1D`, `mapper2D`, and `mapper_new` functions, so the results are a bit different, although the structure visually is identical: +```{r} +all.equal(m4, m4_new_tdamapper) +``` + +```{r} +layout(matrix(c(0, 1), nrow = 1)) +g4 <- graph.adjacency(m4$adjacency, mode="undirected") +plot(g4, layout = layout.auto(g4) ) +m4_new$plotNetwork() +``` + diff --git a/mapper_ex.html b/mapper_ex.html new file mode 100644 index 0000000..e05d656 --- /dev/null +++ b/mapper_ex.html @@ -0,0 +1,383 @@ + + + + + + + + + + + + + + +TDAmapper: Additions + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + + + + + + + + + + + + + +
+

Overview of Pull Request

+

The Python Mapper library by Dan Mullner is great, and takes a traditional Pythonic approach to implementing Mapper, i.e. the OOP approach. OOP is divided in R, as most of R is focused on functional-style programming, and in many ways is quite different from Python. Nonetheless, OOP provides some very real benefits in terms of extensibility and abstraction: in the context of the Python Mapper library, inheritance and polymorphism allows the library the flexibility of configuring how the Mapper construction is generated, including the types of covers to use, the clustering algorithm to use, etc. Additionally, having a Mapper object that can be updated by reference (via heavily coupled OOP programming strategies) enables increases in performance and efficiency of computation.

+

There are several ways of doing OOP in R; the community is quite divided, adn their popularity, benefits, and use-cases vary quite a bit. I rewrote the Mapper pipeline with R Reference Classes (R5) since they’re quite easy and elegant to use.

+

That being said, R users (myself included) are used to functional-style programming, and I don’t think it’s a good idea to go away from that, especially considering the simplicity of functional-style programming. So instead, I added the mapper_new (temporary) function as an illustration of how the TDAmapper package can benefit from both styles. Basically, the new version of this package offers (still) functional wrapper functions to doing high-level calls to generating the Mapper constructions (which maintain backwards compatibility), but internally they make use of reference classes. This allows more advanced users which may want to do more with Mapper the option of working with, changing, and contributing augmentations to the Mapper pipeline (like myself) without having to do things like copy and pasting exorbitant lines of code.

+
+
+

Examples using the reference class

+

Below is an example of using Mapper to describe a noisy circle, as from Example 3.2 in the original paper by Singh, Carlsson, et. al.

+
library("TDAmapper")
+## X in this case is 2-d noisy points around the perimeter of a circle 
+n <- 1000
+t <- 2*pi*runif(n); u <- runif(n, min = 2, max = 2.1) + runif(n,  min = 2, max = 2.1)
+r <- ifelse(u > 1, 2 - u, u) 
+X <- cbind(r*cos(t), r*sin(t))
+
+## The filter function is simply the distance from each point to the left-most point in the x-dimension
+min_idx <- which.min(X[, 1])
+f_x <- abs(X[, 1] - X[min_idx, 1])
+
## Plot the circle 
+rbw_pal <- rev(rainbow(100, start = 0, end = 4/6))
+f_x <- abs(X[, 1] - X[min_idx, 1])
+binned_idx <- cut(f_x, breaks = 100, labels = F)
+plot(X, pch = 20, asp = 1, col = rbw_pal[binned_idx])
+

+

Using the new internal Mapper reference class, the Mapper algorithm can be configured at each step. This abstraction allows more customization i.e. a diffferent clustering algorithm may be used, a different type of cover may be constructed, the full simplicial complex may be computed, etc.

+
Z <- array(f_x, dim = c(length(f_x), 1L)) ## filter values
+m <- TDAmapper:::mapper_ref$new(X = X) ## construct the instance
+m$setCover(fv = Z, k = 5L, g = 0.2) ## k == number of intervals, g == overlap; only rectangular cover supported for now
+m$setClusteringAlgorithm(cl = "single") # use single-linkage; can also use other linkages, or supply your own clustering function
+m$computeNodes(num_bins_when_clustering = 10) # parameters to clustering algorithm are passed to computeNodes 
+m$computeEdges() # in future will be able to choose between 1-skeleton and full simplicial complex
+

Internally, various libraries may be used for nice plotting defaults. Here’s an example which mimicks Figure 1 of the original paper using the network package. Node sizes are weighted logarithmically in relation to the number points in each node, and the color is chosen based on a linear scale between red and blue based on the average set of filter values for all the points in each node.

+
m$plotNetwork()
+

+

The above code used the point cloud data (X) directly (assumed euclidean distance) in the clustering process. An arbitrary ‘dist’ object may also be used.

+
m2 <- TDAmapper:::mapper_ref$new(X = dist(X)) ## construct the instance
+m2$setCover(fv = Z, k = 5L, g = 0.2) ## k == number of intervals, g == overlap; only rectangular cover supported for now
+m2$setClusteringAlgorithm(cl = "single") # use single-linkage; can also use other linkages, or supply your own clustering function
+m2$computeNodes(num_bins_when_clustering = 10) # parameters to clustering algorithm are passed to computeNodes 
+m2$computeEdges() # in future will be able to choose between 1-skeleton and full simplicial complex
+m2$plotNetwork()
+

+

The same structure produced with the mapper1D function:

+
library("igraph")
+
## 
+## Attaching package: 'igraph'
+
## The following objects are masked from 'package:stats':
+## 
+##     decompose, spectrum
+
## The following object is masked from 'package:base':
+## 
+##     union
+
m_orig <- TDAmapper::mapper1D(distance_matrix = dist(X), 
+                              filter_values = Z, 
+                              num_intervals = 5L,
+                              percent_overlap = 20, 
+                              num_bins_when_clustering = 10)
+g1 <- graph.adjacency(m_orig$adjacency, mode="undirected")
+plot(g1, layout = layout.auto(g1) )
+

+
+
+

TDAmapper equivalency

+

Rather than use the reference class directly, instead I wrapped the above procedure into a functinal wrapper mapper_new function to replace the current mapper call. It’s just named mapper_new right now to illustrate the differences. Compared to the mapper1D and mapper2D functions, the new API produces the same construction.

+

Consider the previous 1-D example with the new function, using the exact same named parameter settings. The new function as documented uses ‘X’ instead of ‘distance_matrix’ for input, although it is backwards compatible (w/ a warning.

+
m_new <- TDAmapper::mapper_new(distance_matrix = dist(X), 
+                                filter_values = Z, 
+                                num_intervals = 5L,
+                                percent_overlap = 20, 
+                                num_bins_when_clustering = 10)
+
## Warning in TDAmapper::mapper_new(distance_matrix = dist(X), filter_values =
+## Z, : mapper accepts dist objects through the 'X' parameter. Setting X equal
+## to the value of the argument passed via 'distance_matrix'
+

Of course, they are equivalent in every aspect.

+
all.equal(m_new, m_orig)
+
## [1] TRUE
+

The same applies to the Mapper 2D function. Here’s the 2-D example from the github using an oval.

+
X <- dist(data.frame( x=2*cos(1:100), y=sin(1:100) ))
+Z <- as.matrix(data.frame(x=2*cos(1:100), y=sin(1:100)))
+
+m2_orig <- mapper2D(distance_matrix = X,
+              filter_values = list( Z[,1], Z[,2] ),
+              num_intervals = c(5,5),
+              percent_overlap = 50,
+              num_bins_when_clustering = 10)
+m2_new <- mapper_new(X = X,
+              filter_values = list( Z[,1], Z[,2] ),
+              num_intervals = c(5,5),
+              percent_overlap = 50,
+              num_bins_when_clustering = 10)
+all.equal(m2_orig, m2_new)
+
## [1] TRUE
+
+
+

Benefits: Extensibility and Abstraction

+
+

Performance Comparison

+

Consider example 2 from the github, identifying an oval.

+
n <- 1000L
+
+## Example 2: Identifying oval 
+X <- data.frame(x=2*cos(1:n), y=sin(1:n))
+Z <- as.matrix(data.frame(x=2*cos(1:n), y=sin(1:n)))
+dis_X <- dist(X)
+
+## Original 2D version 
+original <- function(){
+  mapper2D(distance_matrix = dis_X,
+                filter_values = list( Z[,1], Z[,2] ),
+                num_intervals = c(5,5),
+                percent_overlap = 50,
+                num_bins_when_clustering = 10)
+}
+
+## New version (w/ same parameters)
+new <- function(...){
+  mapper_new(X = dis_X, filter_values = Z, num_intervals = c(5,5), percent_overlap = 50, num_bins_when_clustering = 10, ...)
+}
+
+## There are benefits to not computing all pairwise distances
+x_mat <- as.matrix(X)
+new_wo_dis <- function(...){
+  mapper_new(X = x_mat, filter_values = Z, num_intervals = c(5,5), percent_overlap = 50, num_bins_when_clustering = 10, ...)
+}
+

The construction of the level sets in the cover and the construction of the edges (including the non-empty intersection check) have been rewritten and optimized in C++. As a result, using the same parameters betwene the two functions, the performance is quite different.

+
print(microbenchmark::microbenchmark(original(), times = 15L))
+
## Unit: milliseconds
+##        expr      min       lq     mean   median       uq      max neval
+##  original() 615.4038 653.3471 699.8842 697.4976 731.9561 792.3856    15
+
print(microbenchmark::microbenchmark(new(), times = 15L))
+
## Unit: milliseconds
+##   expr     min       lq    mean   median       uq      max neval
+##  new() 17.9736 18.88725 20.1663 19.62465 21.16282 24.14937    15
+
print(microbenchmark::microbenchmark(new_wo_dis(), times = 15L))
+
## Unit: milliseconds
+##          expr      min       lq     mean   median       uq      max neval
+##  new_wo_dis() 17.06801 17.73719 19.33634 18.97734 20.93539 21.87764    15
+

The Rcpp/C++ improvements to the computationally difficult portions Mapper (mostly related to the cover) result in roughly an order of magnitude faster performance in this case. The C++ backend also make TDAmapper is also scalable. Consider the exact same problem, but with a higher resolution / more level sets.

+
## Original 2D version 
+original <- function(){
+  mapper2D(distance_matrix = dis_X,
+                filter_values = list( Z[,1], Z[,2] ),
+                num_intervals = c(10,10),
+                percent_overlap = 50,
+                num_bins_when_clustering = 10)
+}
+
+## New version 
+new <- function(...){
+  mapper_new(X = dis_X, filter_values = Z, num_intervals = c(10,10), percent_overlap = 50, num_bins_when_clustering = 10, ...)
+}
+

They seem to scale similarly. In this case, the new backend returns the result at roughly 43x time the speed.

+
print(microbenchmark::microbenchmark(original(), times = 15L))
+
## Unit: seconds
+##        expr      min       lq     mean   median       uq      max neval
+##  original() 1.982011 2.044641 2.187592 2.134616 2.314305 2.636705    15
+
print(microbenchmark::microbenchmark(new(), times = 15L))
+
## Unit: milliseconds
+##   expr      min       lq    mean   median       uq      max neval
+##  new() 54.57873 58.22318 59.9627 60.34342 61.26445 68.84196    15
+
+
+
+

Differences

+
+

Between Mapper2D and mapper_new

+

For large enough data size, there’s a small discrepancy in the edge formation of the Mapper 2D code. I think it’s in the way the looping structure is performed. Consider the following:

+
## Slight discrepancy 
+t1 <- original(); t2 <- new()
+all.equal(original(), new()) ## Everything is same except for adjacency: ""Component adjacency" : Mean absolute difference: 1"
+
## [1] "Component \"adjacency\": Mean absolute difference: 1"
+
which(t1$adjacency[1, ] == 1) ## 2 11 
+
## [1]  2 11
+
which(t2$adjacency[1, ] == 1) ## 2 11 12
+
## [1]  2 11 12
+
any(t1$points_in_vertex[[2]] %in% t1$points_in_vertex[[11]]) ## TRUE - Theres should be edge between nodes 2 and 11
+
## [1] TRUE
+
any(t1$points_in_vertex[[2]] %in% t1$points_in_vertex[[12]]) ## TRUE - Theres should also be edge between nodes 2 and 12
+
## [1] TRUE
+

There should be a an edge between nodes 2 and 11, and 2 and 12, as they have non-empty intersections.

+
+
+

Between mapper and mapper_new

+

The \(n\)-dimensional experimental version of mapper produces different results that the new version. It seems that it handles the interval length computation is handled a bit differently in mapper than in the Python Mapper library.

+

The mapper_new function handles 3-space, or \(n\)-space, in the covering in the same way it handles the 1-d or 2-d versions. The new code was written carefully such that it is actually the same for every dimension; that is, mapper_new may be called for any dimension filter space, without the need for multiple loops, dimension-specific conditions, etc.

+

Consider the trefoil knot used in Example 5 from the repository.

+
# parametrize a trefoil knot
+n <- 100
+t <- 2*pi*(1:n)/n
+X <- data.frame(x = sin(t)+2*sin(2*t),
+                y = cos(t)-2*cos(2*t),
+                z = -sin(3*t))
+f <- X
+m4 <- mapper(dist(X), f[,1], 5, 50, 5)
+g4 <- graph.adjacency(m4$adjacency, mode="undirected")
+plot(g4, layout = layout.auto(g4) )
+

+

The return_reference parameter allows the user to get access to the internal reference class used to construct the Mapper instance.

+
m4_new <- mapper_new(dist(X), f[,1], 5, 50, 5, return_reference = TRUE)
+

You can always get back to the original TDAmapper class via exporting

+
m4_new_tdamapper <- m4_new$exportTDAmapper()
+

Of course, the plan is to enable multiple different modes of exporting the internal structure information in the future, whether it be via the igraph package, network package, networkD3 package, html widgets, etc.

+

Again, the cover is constructed differently in the mapper function compared to the mapper1D, mapper2D, and mapper_new functions, so the results are a bit different, although the structure visually is identical:

+
all.equal(m4, m4_new_tdamapper)
+
##  [1] "Names: 2 string mismatches"                                                   
+##  [2] "Component \"points_in_vertex\": Component 1: Numeric: lengths (19, 34) differ"
+##  [3] "Component \"points_in_vertex\": Component 2: Numeric: lengths (30, 36) differ"
+##  [4] "Component \"points_in_vertex\": Component 3: Numeric: lengths (30, 32) differ"
+##  [5] "Component \"points_in_vertex\": Component 4: Numeric: lengths (30, 34) differ"
+##  [6] "Component \"points_in_vertex\": Component 5: Numeric: lengths (19, 34) differ"
+##  [7] "Component 5: Component 1: Numeric: lengths (19, 34) differ"                   
+##  [8] "Component 5: Component 2: Numeric: lengths (30, 36) differ"                   
+##  [9] "Component 5: Component 3: Numeric: lengths (30, 32) differ"                   
+## [10] "Component 5: Component 4: Numeric: lengths (30, 34) differ"                   
+## [11] "Component 5: Component 5: Numeric: lengths (19, 34) differ"
+
layout(matrix(c(0, 1), nrow = 1))
+g4 <- graph.adjacency(m4$adjacency, mode="undirected")
+plot(g4, layout = layout.auto(g4) )
+

+
m4_new$plotNetwork()
+

+
+
+ + + + +
+ + + + + + + +