diff --git a/.gitignore b/.gitignore index 0b5325d..74816fd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,4 @@ -.Rproj.user +*.Rproj .Rhistory .RData *.Ruserdata @@ -10,3 +10,4 @@ vignettes/*.pdf *.aux *.log *.synctex.gz +.Rproj.user diff --git a/R/mechanism-bootstrap.R b/R/mechanism-bootstrap.R index 70f02e0..30bf86d 100644 --- a/R/mechanism-bootstrap.R +++ b/R/mechanism-bootstrap.R @@ -5,25 +5,68 @@ #' @param sensitivity Sensitivity of the function #' @param epsilon Numeric differential privacy parameter #' @param fun Function to evaluate +#' @param inputObject the Bootstrap mechanism object on which the input function will be evaluated #' @return Value of the function applied to one bootstrap sample #' @import stats #' @export +# There are 2 options for handling empty partitions: -bootstrapReplication <- function(x, n, sensitivity, epsilon, fun) { +# 1: skip it entirely, and say the total number of partitions is just the number of partitions that are not empty + +bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) { partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n)) - maxAppearances <- max(partition) - probs <- sapply(1:maxAppearances, dbinom, size=n, prob=(1 / n)) - statPartitions <- vector('list', maxAppearances) - for (i in 1:maxAppearances) { - iVariance <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon) - iStat <- fun(x[partition == i]) - iNoise <- dpNoise(n=length(iStat), scale=sqrt(iVariance), dist='gaussian') - statPartitions[[i]] <- i * iStat + iNoise + # make a sorted vector of the partitions of the data + # because it is not guaranteed that every partition from 1:max.appearances will have a value in it + # so we need to loop through only the partitions that have data + validPartitions <- sort(unique(partition[,1])) + # we do not want the 0 partition, so we remove it from the list + validPartitions <- validPartitions[2:length(validPartitions)] + # print the unique values of the partition, to track which entries may result in NaN + #print(validPartitions) + probs <- sapply(1:length(validPartitions), dbinom, size=n, prob=(1 / n)) + stat.partitions <- vector('list', length(validPartitions)) + for (i in 1:length(validPartitions)) { + currentPartition <- validPartitions[i] + variance.currentPartition <- (currentPartition * probs[i] * (sensitivity^2)) / (2 * epsilon) + stat.currentPartition <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...) + noise.currentPartition <- dpNoise(n=length(stat.currentPartition), scale=sqrt(variance.currentPartition), dist='gaussian') + stat.partitions[[i]] <- currentPartition * stat.currentPartition + noise.currentPartition } - statOut <- do.call(rbind, statPartitions) - return(apply(statOut, 2, sum)) + stat.out <- do.call(rbind, stat.partitions) + # return(apply(stat.out, 2, sum)) + returnedBootstrappedResult <- apply(stat.out, 2, sum) + return(returnedBootstrappedResult) } +# 2: treat it as a partition with a statistic of value 0 and keep it in the calculation, adding noise and adding it to the final calculation + +# bootstrapReplication <- function(x, n, sensitivity, epsilon, fun, inputObject, ...) { +# partition <- rmultinom(n=1, size=n, prob=rep(1 / n, n)) +# # make a sorted vector of the partitions of the data +# # because it is not guaranteed that every partition from 1:max.appearances will have a value in it +# validPartitions <- validPartitions <- sort(unique(partition[,1])) +# # print the unique values of the partition, to track which entries may result in NaN +# print(validPartitions) +# max.appearances <- max(partition) +# probs <- sapply(1:max.appearances, dbinom, size=n, prob=(1 / n)) +# stat.partitions <- vector('list', max.appearances) +# for (i in 1:max.appearances) { +# variance.i <- (i * probs[i] * (sensitivity^2)) / (2 * epsilon) +# if (i %in% validPartitions) { +# currentPartition <- validPartitions[i] +# stat.i <- inputObject$bootStatEval(x[partition == currentPartition], fun, ...) +# noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian') +# stat.partitions[[i]] <- i * stat.i + noise.i +# } else { +# stat.i <- 0 +# noise.i <- dpNoise(n=length(stat.i), scale=sqrt(variance.i), dist='gaussian') +# stat.partitions[[i]] <- i * stat.i + noise.i +# } +# } +# stat.out <- do.call(rbind, stat.partitions) +# return(apply(stat.out, 2, mean)) +# } + #' Bootstrap mechanism #' @@ -39,9 +82,9 @@ mechanismBootstrap <- setRefClass( ) mechanismBootstrap$methods( - bootStatEval = function(xi,...) { + bootStatEval = function(xi, fun,...) { funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self) - inputVals = c(list(x=x), funArgs) + inputVals = c(list(x=xi), funArgs) stat <- do.call(bootFun, inputVals) return(stat) }) @@ -58,11 +101,11 @@ mechanismBootstrap$methods( }) mechanismBootstrap$methods( - evaluate = function(fun, x, sens, postFun) { - x <- censorData(x, .self$varType, .self$rng) + evaluate = function(fun, x, sens, postFun, ...) { + x <- censorData(x, .self$varType, .self$rng, rngFormat=.self$rngFormat) x <- fillMissing(x, .self$varType, .self$imputeRng[0], .self$imputeRng[1]) epsilonPart <- epsilon / .self$nBoot - release <- replicate(.self$nBoot, bootstrapReplication(x, n, sens, epsilonPart, fun=.self$bootStatEval)) + release <- replicate(.self$nBoot, bootstrapReplication(x, .self$n, sens, epsilonPart, fun, .self)) stdError <- .self$bootSE(release, .self$nBoot, sens) out <- list('release' = release, 'stdError' = stdError) out <- postFun(out) diff --git a/R/mechanism-laplace.R b/R/mechanism-laplace.R index 39c91af..9e4a7a9 100644 --- a/R/mechanism-laplace.R +++ b/R/mechanism-laplace.R @@ -17,7 +17,6 @@ mechanismLaplace$methods( #' Differentially private evaluation of input function "fun" with sensitivity "sens" on input data #' "x" using the Laplace mechanism. #' - #' @name Laplace Mechanism #' @references C. Dwork, A. Roth The Algorithmic Foundations of Differential Privacy, Chapter 3.3 The Laplace Mechanism p.30-37. August 2014. #' #' @param fun function of input x to add Laplace noise to. @@ -59,8 +58,8 @@ mechanismLaplace$methods( evaluate = function(fun, x, sens, postFun, ...) { x <- censorData(x, .self$varType, .self$rng, .self$bins, .self$rngFormat) x <- fillMissing(x, .self$varType, imputeRng=.self$rng, categories=.self$imputeBins) - fun.args <- getFuncArgs(fun, inputList=list(...), inputObject=.self) - inputVals = c(list(x=x), fun.args) + funArgs <- getFuncArgs(fun, inputList=list(...), inputObject=.self) + inputVals = c(list(x=x), funArgs) trueVal <- do.call(fun, inputVals) # Concern: are we confident that the environment this is happening in is getting erased. scale <- sens / .self$epsilon release <- trueVal + dpNoise(n=length(trueVal), scale=scale, dist='laplace') diff --git a/R/statistic-mean.R b/R/statistic-mean.R index e2d86d2..96ef895 100644 --- a/R/statistic-mean.R +++ b/R/statistic-mean.R @@ -81,7 +81,13 @@ dpMean$methods( #' the \code{dpMean$release} function. release = function(data, ...) { x <- data[, variable] - .self$result <- export(mechanism)$evaluate(mean, x, .self$sens, .self$postProcess, ...) + if (mechanism=='mechanismLaplace'){ + .self$result <- export(mechanism)$evaluate(mean, x, .self$sens, .self$postProcess, ...) + } + else if (mechanism=='mechanismBootstrap'){ + .self$result <- export(mechanism)$evaluate(bootMean, x, .self$sens, .self$postProcess, .self$n) + } + }) dpMean$methods( diff --git a/man/Laplace-Mechanism.Rd b/man/Laplace-Mechanism.Rd deleted file mode 100644 index 15a2222..0000000 --- a/man/Laplace-Mechanism.Rd +++ /dev/null @@ -1,55 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mechanism-laplace.R -\name{Laplace Mechanism} -\alias{Laplace Mechanism} -\title{Laplace Mechanism} -\arguments{ -\item{fun}{function of input x to add Laplace noise to.} - -\item{x}{input that function fun will be evaluated on.} - -\item{sens}{sensitivity of fun. Sensitivity is defined in above citation.} - -\item{postFun}{post-processing function. Takes differentially private release as input -and returns some form of output in principal based on the differentially private release.} - -\item{...}{any additional (optional) parameters} -} -\value{ -result of post-processing on input function "fun" evaluated on database "x", assuming sensitivity of fun is "sens". -} -\description{ -Differentially private evaluation of input function "fun" with sensitivity "sens" on input data -"x" using the Laplace mechanism. -} -\examples{ -# histogram example - -# the function in `statistic-histogram.R` that creates a histogram from input data -histogram_function <- fun.hist -# the data frame that holds the data we want to analyze, in this case the data is called "PUMS5extract10000" -data <- data(PUMS5extract10000) -# the variable for which we want a histogram -variable <- "age" -# the sensitivity for the histogram, the default sensitivity for histograms is 2 -sens <- 2 -# the post-processing function to use to format the histogram release correctly -post_processing_function <- dpHistogram$postProcess - -laplace_histogram <- mechanismLaplace$evaluate(histogram_function, data[, variable], sens, post_processing_function) - -# mean example - -mean_function <- mean -# the sensitivity for a differntially private mean in calculated as the difference in the data range divided by the number of data points -sens <- diff(rng) / n -# the post-processing function to use to format the mean release correctly -post_processing_function <- dpMean$postProcess -# `data` and `variable` same as above - -laplace_mean <- mechanismLaplace$evaluate(mean_function, data[, variable], sens, post_processing_function) - -} -\references{ -C. Dwork, A. Roth The Algorithmic Foundations of Differential Privacy, Chapter 3.3 The Laplace Mechanism p.30-37. August 2014. -} diff --git a/man/bootstrapReplication.Rd b/man/bootstrapReplication.Rd index 3c83d8b..8ccc312 100644 --- a/man/bootstrapReplication.Rd +++ b/man/bootstrapReplication.Rd @@ -4,7 +4,7 @@ \alias{bootstrapReplication} \title{Bootstrap replication for a function} \usage{ -bootstrapReplication(x, n, sensitivity, epsilon, fun) +bootstrapReplication(x, n, sensitivity, epsilon, fun, inputObject, ...) } \arguments{ \item{x}{Vector} @@ -16,6 +16,8 @@ bootstrapReplication(x, n, sensitivity, epsilon, fun) \item{epsilon}{Numeric differential privacy parameter} \item{fun}{Function to evaluate} + +\item{inputObject}{the Bootstrap mechanism object on which the input function will be evaluated} } \value{ Value of the function applied to one bootstrap sample diff --git a/man/dLap.Rd b/man/dLap.Rd new file mode 100644 index 0000000..fbb8502 --- /dev/null +++ b/man/dLap.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-noise-generation.R +\name{dLap} +\alias{dLap} +\title{Probability density for Laplace distribution} +\usage{ +dLap(x, mu = 0, b = 1) +} +\arguments{ +\item{x}{numeric, value} + +\item{mu}{numeric, center of the distribution} + +\item{b}{numeric, spread} +} +\value{ +Density for elements of x +} +\description{ +Probability density for Laplace distribution +} +\examples{ + +x <- seq(-3, 3, length.out=61) +dLap(x) +} diff --git a/man/pLap.Rd b/man/pLap.Rd new file mode 100644 index 0000000..098c4af --- /dev/null +++ b/man/pLap.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-noise-generation.R +\name{pLap} +\alias{pLap} +\title{Laplace Cumulative Distribution Function} +\usage{ +pLap(x, mu = 0, b = 1) +} +\arguments{ +\item{x}{Numeric, the value(s) at which the user wants to know the CDF height.} + +\item{mu}{Numeric, the center of the LaPlace distribution, defaults to 0.} + +\item{b}{Numeric, the spread of the LaPlace distribution, defaults to 1.} +} +\value{ +Probability the LaPlace draw is less than or equal to \code{x}. +} +\description{ +Determines the probability a draw from a LaPlace distribution is less than + or equal to the specified value. +} +\examples{ + +x <- 0 +pLap(x) +} diff --git a/man/qLap.Rd b/man/qLap.Rd new file mode 100644 index 0000000..2dc6efc --- /dev/null +++ b/man/qLap.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-noise-generation.R +\name{qLap} +\alias{qLap} +\title{Quantile function for Laplace distribution} +\usage{ +qLap(p, mu = 0, b = 1) +} +\arguments{ +\item{p}{Numeric, vector of probabilities} + +\item{mu}{numeric, center of the distribution} + +\item{b}{numeric, spread} +} +\value{ +Quantile function +} +\description{ +Quantile function for Laplace distribution +} +\examples{ +probs <- c(0.05, 0.50, 0.95) +qLap(probs) +} diff --git a/man/rLap.Rd b/man/rLap.Rd new file mode 100644 index 0000000..2370563 --- /dev/null +++ b/man/rLap.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utilities-noise-generation.R +\name{rLap} +\alias{rLap} +\title{Random draw from Laplace distribution} +\usage{ +rLap(mu = 0, b = 1, size = 1) +} +\arguments{ +\item{mu}{numeric, center of the distribution} + +\item{b}{numeric, spread} + +\item{size}{integer, number of draws} +} +\value{ +Random draws from Laplace distribution +} +\description{ +Random draw from Laplace distribution +} +\examples{ + +rLap(size=1000) +} diff --git a/tests/testthat/test-bootstrap.R b/tests/testthat/test-bootstrap.R new file mode 100644 index 0000000..9135097 --- /dev/null +++ b/tests/testthat/test-bootstrap.R @@ -0,0 +1,13 @@ +context("bootstrap") + +test_that('bootstrap did not run, then three NaNs, now produces result that is way too big', { + data(PUMS5extract10000, package = "PSIlence") + + n.boot <- 25 + boot_mean <- dpMean$new(mechanism='mechanismBootstrap', varType='numeric', + variable='income', n=10000, epsilon=0.1, rng=c(0, 750000), + n.boot=n.boot) + boot_mean$release(PUMS5extract10000) + print(boot_mean$result) + print(mean(boot_mean$result$release)) +}) diff --git a/vignettes/dp-mean.Rmd b/vignettes/dp-mean.Rmd index 3a01739..1ec1075 100644 --- a/vignettes/dp-mean.Rmd +++ b/vignettes/dp-mean.Rmd @@ -37,7 +37,7 @@ Syntax ```{r, eval = FALSE} x1 <- c(3, 12, 20, 42, 33, 65, 70, 54, 33, 45) -x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE) +x2 <- c(TRUE, FALSE, FALSE, TRUE, FALSE, TRUE, TRUE, TRUE, FALSE, TRUE) data <- data.frame(x1, x2) dpMeanExample <- dpMean$new(mechanism='mechanismLaplace', varType='numeric',