diff --git a/src/05-training-testing-data-prep-functions.R b/src/05-training-testing-data-prep-functions.R
index d1ff3ce..c887c2e 100644
--- a/src/05-training-testing-data-prep-functions.R
+++ b/src/05-training-testing-data-prep-functions.R
@@ -1,22 +1,22 @@
## -----------------------------------
-## wisdm - data preparation functions
-## ApexRMS, March 2022
+## wisdm - data preparation functions
+## ApexRMS, March 2022
## -----------------------------------
# DATA PREP FUNCTIONS ----------------------------------------------------------
## Split data for Training/Testing ---------------------------------------------
-testTrainSplit <- function(inputData, # dataframe with field data and Covariate Values
- trainProp, # proportion of data to use in model training
- factorVars = NULL, # names of categorical variables (if any)
- ratioPresAbs = NULL){ # ratio of presence to absence points that should be used
-
-
+testTrainSplit <- function(
+ inputData, # dataframe with field data and Covariate Values
+ trainProp, # proportion of data to use in model training
+ factorVars = NULL, # names of categorical variables (if any)
+ ratioPresAbs = NULL # ratio of presence to absence points that should be used
+) {
# Description:
# Given a training proportion, each observation is assigned to the
- # test or training split.The split will be balanced with respect to the response
- # in that the training proportion will be matched as close as possible for each
+ # test or training split. The split will be balanced with respect to the response
+ # in that the training proportion will be matched as close as possible for each
# level of the response.
# An optional parameter, ratioPresAbs, can be used to specify if there is a certain
# ratio of presence to absence points that should be used this ensures the given
@@ -24,158 +24,243 @@ testTrainSplit <- function(inputData, # dataframe with field data an
# all responses greater than or equal to 1 will be used to meet this ratio.
# This option reduces the sample size as some data must be thrown away to meet
# the constraint of having the desired proportion.
- # Background points are ignored by this module (they are read in and written out,
- # but not assigned to either the test or training split).
-
- if(trainProp<=0 | trainProp>1){ stop("Training Proportion must be a number between 0 and 1 excluding 0") }
- if(!is.null(ratioPresAbs)) {
- if(ratioPresAbs<=0) stop("The ratio of presence to absence (ratioPresAbs) must be a \ngreater than 0")}
-
+ # Background points are ignored by this module (they are read in and written out,
+ # but not assigned to either the test or training split).
+
+ if (trainProp <= 0 | trainProp > 1) {
+ stop("Training Proportion must be a number between 0 and 1 excluding 0")
+ }
+ if (!is.null(ratioPresAbs)) {
+ if (ratioPresAbs <= 0) {
+ stop(
+ "The ratio of presence to absence (ratioPresAbs) must be greater than 0"
+ )
+ }
+ }
+
#Read input data and remove any columns to be excluded
dat <- inputData
response <- dat$Response
-
- if(sum(as.numeric(response)==0)==0 && !is.null(ratioPresAbs)) stop("The ratio of presence to absence cannot be set with only presence data")
-
+
+ if (sum(as.numeric(response) == 0) == 0 && !is.null(ratioPresAbs)) {
+ stop(
+ "The ratio of presence to absence cannot be set with only presence data"
+ )
+ }
+
# Ignore background data if present
- bg.dat <- dat[response == nodataValue,]
-
- if(dim(bg.dat)[1] != 0){
- dat <- dat[-c(which(response == nodataValue, arr.ind=TRUE)),]
- response <- response[-c(which(response==nodataValue, arr.ind=TRUE))]
+ bg.dat <- dat[response == nodataValue, ]
+
+ if (dim(bg.dat)[1] != 0) {
+ dat <- dat[-c(which(response == nodataValue, arr.ind = TRUE)), ]
+ response <- response[-c(which(response == nodataValue, arr.ind = TRUE))]
}
## Warnings section ##
-
+
# tagging factors and looking at their levels warning users if their factors have few levels
- if(!is.null(factorVars)){
- for (i in 1:length(factorVars)){
- factor.table <- table(dat[,factorVars[i]])
- if(any(factor.table<10)){ updateRunLog(paste0("\nSome levels for the categorical predictor ",factorVars[i]," do not have at least 10 observations. ",
- "Consider removing or reclassifying this predictor before continuing.\n",
- "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n"))
+ if (!is.null(factorVars)) {
+ for (i in 1:length(factorVars)) {
+ factor.table <- table(dat[, factorVars[i]])
+ if (any(factor.table < 10)) {
+ updateRunLog(paste0(
+ "\nSome levels for the categorical predictor ",
+ factorVars[i],
+ " do not have at least 10 observations. ",
+ "Consider removing or reclassifying this predictor before continuing.\n",
+ "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n"
+ ))
factor.table <- as.data.frame(factor.table)
- colnames(factor.table)<-c("Factor Name","Factor Count")
- updateRunLog(paste("\n",factorVars[i],"\n"))
- updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown"))
+ colnames(factor.table) <- c("Factor Name", "Factor Count")
+ updateRunLog(paste("\n", factorVars[i], "\n"))
+ updateRunLog(pander::pandoc.table.return(
+ factor.table,
+ style = "rmarkdown"
+ ))
}
}
}
- if(length(response)<100){ stop("A test training split is not advisable for less than 100 observations. Consider-cross validation as an alternative.")}
- if(length(response)<200){
- updateRunLog(paste0("\nThere are less than 200 observations. Cross-validation might be preferable to a test/training split./n",
- "Weigh the decision while keeping in mind the number of predictors being considered: ", ncol(dat)-7,"/n"))
+ if (length(response) < 100) {
+ stop(
+ "A test training split is not advisable for less than 100 observations. Consider cross-validation as an alternative."
+ )
+ }
+ if (length(response) < 200) {
+ updateRunLog(paste0(
+ "\nThere are less than 200 observations. Cross-validation might be preferable to a test/training split.\n",
+ "Weigh the decision while keeping in mind the number of predictors being considered: ",
+ ncol(dat) - 7,
+ "\n"
+ ))
}
- if(all(na.omit(response) %in% 0:1) & any(table(response)<10)){
- stop("Use of a test training split is not recommended when the dataset contains less than 10 presence or absence points")
+ if (all(na.omit(response) %in% 0:1) & any(table(response) < 10)) {
+ stop(
+ "Use of a test training split is not recommended when the dataset contains less than 10 presence or absence points"
+ )
}
## End warnings section ##
-
- temp <- if(!is.null(ratioPresAbs)){(sum(response>=1)/sum(response==0) == ratioPresAbs)}
- if(is.null(temp)){ temp <- FALSE }
-
- if(is.null(ratioPresAbs) | temp){
+
+ temp <- if (!is.null(ratioPresAbs)) {
+ (sum(response >= 1) / sum(response == 0) == ratioPresAbs)
+ }
+ if (is.null(temp)) {
+ temp <- FALSE
+ }
+
+ if (is.null(ratioPresAbs) | temp) {
# Randomly sample presence absence or counts as close to the size of the training proportion as possible
-
+
# iterate through each unique response and randomly assign the trainProp to be in the training split
TrainSplit <- numeric()
- for(i in sort(as.numeric(unique(response)))){
- TrainSplit <- c(TrainSplit, sample(which(response == i, arr.ind=TRUE), size = round(sum(response==i)*trainProp)))
+ for (i in sort(as.numeric(unique(response)))) {
+ TrainSplit <- c(
+ TrainSplit,
+ sample(
+ which(response == i, arr.ind = TRUE),
+ size = round(sum(response == i) * trainProp)
+ )
+ )
}
-
+
# take everything not in the training set for the test set
- TestSplit <- which(!(seq(1:length(response))) %in% TrainSplit, arr.ind=TRUE)
-
+ TestSplit <- which(
+ !(seq(1:length(response))) %in% TrainSplit,
+ arr.ind = TRUE
+ )
+
dat$UseInModelEvaluation[TrainSplit] <- F
dat$UseInModelEvaluation[TestSplit] <- T
+ } else {
+ # now considering if there is a desired ratio of presence to absence points
- } else { # now considering if there is a desired ratio of presence to absence points
-
- if(sum(response>=1)/sum(response==0) >= ratioPresAbs){
-
+ if (sum(response >= 1) / sum(response == 0) >= ratioPresAbs) {
# first determine how many presence points to remove
- TotToRmv <- (sum(response>=1)-ratioPresAbs*sum(response==0))
- if(TotToRmv/sum(response>=1) > 0.5){
- updateRunLog("\nWarning: Over 50% of the presence points were removed to meet the desired ratio of presence to absence\n")}
-
+ TotToRmv <- (sum(response >= 1) - ratioPresAbs * sum(response == 0))
+ if (TotToRmv / sum(response >= 1) > 0.5) {
+ updateRunLog(
+ "\nWarning: Over 50% of the presence points were removed to meet the desired ratio of presence to absence\n"
+ )
+ }
+
# determine the number of each count to remove weighted by the response and then remove these
- EachToRmv <- round(TotToRmv * table(response[response!= 0])/sum(response!= 0))
- ByCount <- split(cbind(which(response != 0, arr.ind=TRUE)),f = response[response!=0])
-
+ EachToRmv <- round(
+ TotToRmv * table(response[response != 0]) / sum(response != 0)
+ )
+ ByCount <- split(
+ cbind(which(response != 0, arr.ind = TRUE)),
+ f = response[response != 0]
+ )
+
# sampling function ----
- sam <- function(x,size){
- if(length(x)==1 & size==1){ return(x)
- } else { sample(x=x,size=size) }
+ sam <- function(x, size) {
+ if (length(x) == 1 & size == 1) {
+ return(x)
+ } else {
+ sample(x = x, size = size)
+ }
}
-
- RmvIndx <- as.vector(unlist(mapply(sam,x=ByCount,size=EachToRmv)))
+
+ RmvIndx <- as.vector(unlist(mapply(sam, x = ByCount, size = EachToRmv)))
KeepIndx <- seq(1:length(response))[-c(RmvIndx)]
respKeep <- response[KeepIndx]
names(respKeep) <- KeepIndx
-
+
# now break these into a train an test split while
TrainSplit <- numeric()
-
- for(i in sort(as.numeric(unique(respKeep)))){
- TrainSplit <- c(TrainSplit, sample(names(respKeep[respKeep==i]), size=round(sum(respKeep==i)*trainProp)))
+
+ for (i in sort(as.numeric(unique(respKeep)))) {
+ TrainSplit <- c(
+ TrainSplit,
+ sample(
+ names(respKeep[respKeep == i]),
+ size = round(sum(respKeep == i) * trainProp)
+ )
+ )
}
TrainSplit <- as.numeric(TrainSplit)
# Take everything not in the training set or in the remove list for the test set
- TestSplit <- seq(from=1,to=length(response))[-c(c(TrainSplit,RmvIndx))]
-
+ TestSplit <- seq(from = 1, to = length(response))[
+ -c(c(TrainSplit, RmvIndx))
+ ]
+
dat$UseInModelEvaluation[TrainSplit] <- F
dat$UseInModelEvaluation[TestSplit] <- T
-
}
-
- if(sum(response>=1)/sum(response==0) < ratioPresAbs){
-
+
+ if (sum(response >= 1) / sum(response == 0) < ratioPresAbs) {
# first balance all responses greater than 1
TrainSplit <- numeric()
- for(i in sort(as.numeric(unique(response[response!=0])))){
- TrainSplit <- c(TrainSplit, sample(which(response==i, arr.ind=TRUE), size = round(sum(response==i)*trainProp)))
+ for (i in sort(as.numeric(unique(response[response != 0])))) {
+ TrainSplit <- c(
+ TrainSplit,
+ sample(
+ which(response == i, arr.ind = TRUE),
+ size = round(sum(response == i) * trainProp)
+ )
+ )
+ }
+ if (
+ (sum(response == 0) - sum(response >= 1) / ratioPresAbs) /
+ sum(response == 0) >
+ 0.5
+ ) {
+ updateRunLog(
+ "\nWarning: Over 50% of the absence points were removed to meet the desired ratio of presence to absence\n"
+ )
}
- if((sum(response==0)-sum(response>=1)/ratioPresAbs)/sum(response==0) > 0.5){
- updateRunLog("\nWarning: Over 50% of the absence points were removed to meet the desired ratio of presence to absence\n")}
-
+
# sample the right number of absence points for the train split
- TrainSplit <- c(TrainSplit, sample(which(response==0, arr.ind=TRUE), size=round(sum(response>=1)*(1/ratioPresAbs)*trainProp)))
-
+ TrainSplit <- c(
+ TrainSplit,
+ sample(
+ which(response == 0, arr.ind = TRUE),
+ size = round(sum(response >= 1) * (1 / ratioPresAbs) * trainProp)
+ )
+ )
+
# take everything not in the training set for the test set
- TestSplit <- which(!(seq(1:length(response))) %in% TrainSplit, arr.ind=TRUE)
-
+ TestSplit <- which(
+ !(seq(1:length(response))) %in% TrainSplit,
+ arr.ind = TRUE
+ )
+
# now sample some points to remove so we have the correct proportion
- temp <- sample(which(TestSplit %in% which(response==0, arr.ind=TRUE), arr.ind=TRUE),
- size = round(sum(TestSplit %in% which(response==0, arr.ind=TRUE)) -
- (1-trainProp)*sum(response>=1)*(1/ratioPresAbs)))
+ temp <- sample(
+ which(
+ TestSplit %in% which(response == 0, arr.ind = TRUE),
+ arr.ind = TRUE
+ ),
+ size = round(
+ sum(TestSplit %in% which(response == 0, arr.ind = TRUE)) -
+ (1 - trainProp) * sum(response >= 1) * (1 / ratioPresAbs)
+ )
+ )
TestSplit <- TestSplit[-c(temp)]
-
+
dat$UseInModelEvaluation[TrainSplit] <- F
dat$UseInModelEvaluation[TestSplit] <- T
-
}
-
+
# dat <- dat[c(TrainSplit,TestSplit),]
-
}
-
- if(dim(bg.dat)[1] != 0) {
+
+ if (dim(bg.dat)[1] != 0) {
names(bg.dat) <- names(dat)
dat <- rbind(dat, bg.dat)
- }
-
- return(dat)
+ }
+ return(dat)
}
## Split data for Cross Validation ---------------------------------------------
-crossValidationSplit <- function(inputData, # dataframe with field data and Covariate Values
- factorVars, # names of categorical variables (if any)
- nFolds = 10, # ValidationDataSheet$NumberOfFolds
- stratify = FALSE, # ValidationDataSheet$StratifyFolds,
- spatSplit = FALSE){
-
+crossValidationSplit <- function(
+ inputData, # dataframe with field data and Covariate Values
+ factorVars, # names of categorical variables (if any)
+ nFolds = 10, # ValidationDataSheet$NumberOfFolds
+ stratify = FALSE, # ValidationDataSheet$StratifyFolds,
+ spatSplit = FALSE
+) {
# Description:
# this code takes as input an mds file with the first line being the predictor or
# response name, the second line an indicator of predictors to include and the
@@ -186,58 +271,83 @@ crossValidationSplit <- function(inputData, # dataframe with field data
# will be assigned a fold. Optional stratification by response is also available
# Background points are ignored
# by this module (they are read in, written out, but not assigned to cv folds.
-
+
# cross validation can only be used for model selection
-
- options(warn=1)
- if(nFolds<=1 | nFolds%%1!=0) stop("Number of folds must be an integer greater than 1")
-
+
+ options(warn = 1)
+ if (nFolds <= 1 | nFolds %% 1 != 0) {
+ stop("Number of folds must be an integer greater than 1")
+ }
+
# read input data and remove any columns to be excluded
dat <- inputData
response <- inputData$Response
-
- if(sum(as.numeric(response)==0)==0 && !is.null(stratify)){
- stop("Cross Validation requires absence data.")
+
+ if (sum(as.numeric(response) == 0) == 0 && stratify) {
+ stop("Stratified cross-validation requires absence data.")
}
-
+
# ignore background data (if present)
- bg.dat <- dat[response == nodataValue,]
+ bg.dat <- dat[response == nodataValue, ]
+
+ if (dim(bg.dat)[1] != 0) {
+ dat <- dat[-c(which(response == nodataValue, arr.ind = TRUE)), ]
+ response <- response[-c(which(response == nodataValue, arr.ind = TRUE))]
+ }
- if(dim(bg.dat)[1]!=0){
- dat <- dat[-c(which(response==nodataValue, arr.ind=TRUE)),]
- response <- response[-c(which(response==nodataValue, arr.ind=TRUE))]
+ if (nFolds > nrow(dat)) {
+ stop(paste0(
+ "Number of CV folds (",
+ nFolds,
+ ") exceeds the number of available observations (",
+ nrow(dat),
+ "). Reduce the number of folds before continuing."
+ ))
}
-
-
+
# tagging factors and looking at their levels warning users if their factors have few levels
- if(!is.null(factorVars)){
- for (i in 1:length(factorVars)){
- factor.table <- table(dat[,factorVars[i]])
- if(any(factor.table<10)){
- updateRunLog(paste0("\nSome levels for the categorical predictor ", factorVars[i], " do not have at least 10 observations.",
- " Consider removing or reclassifying this predictor before continuing.\n",
- "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n"))
+ if (!is.null(factorVars)) {
+ for (i in 1:length(factorVars)) {
+ factor.table <- table(dat[, factorVars[i]])
+ if (any(factor.table < nFolds)) {
+ updateRunLog(paste0(
+ "\nWarning: Some levels for the categorical predictor ",
+ factorVars[i],
+ " have fewer observations than the number of CV folds (",
+ nFolds,
+ "). \n",
+ "This increases the likelihood that a CV training fold will not contain all factor levels,",
+ " which may cause prediction failures during cross-validation.\n",
+ "Consider removing or reclassifying rare levels, or reducing the number of CV folds before continuing.\n"
+ ))
factor.table <- as.data.frame(factor.table)
- colnames(factor.table)<-c("Factor Name","Factor Count")
- updateRunLog(paste("\n",factorVars[i],"\n"))
- updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown"))
+ colnames(factor.table) <- c("Factor Level", "Observation Count")
+ updateRunLog(paste("\n", factorVars[i], "\n"))
+ updateRunLog(pander::pandoc.table.return(
+ factor.table,
+ style = "rmarkdown"
+ ))
}
}
}
# this splits the training set
- if(any(dat$UseInModelEvaluation == TRUE)){
- split.mask <- dat$UseInModelEvaluation == FALSE # FALSE = train, TRUE = test
+ if (any(dat$UseInModelEvaluation == TRUE)) {
+ split.mask <- dat$UseInModelEvaluation == FALSE # FALSE = train, TRUE = test
index <- seq(1:nrow(dat))[split.mask]
- } else { split.mask <- index <- seq(1:nrow(dat)) }
-
-
- if(stratify & !spatSplit){ # if stratify = T and spatial split = F
-
- for(i in 1:length(names(table(response)))){
+ } else {
+ split.mask <- index <- seq(1:nrow(dat))
+ }
+
+ if (stratify & !spatSplit) {
+ # if stratify = T and spatial split = F
+
+ for (i in 1:length(names(table(response)))) {
index.i <- index[response[split.mask] == names(table(response))[i]]
index.i <- index.i[order(runif(length(index.i)))]
- dat$ModelSelectionSplit[index.i] <- c(rep(seq(1:nFolds), each = floor(length(index.i)/nFolds)),
- sample(seq(1:nFolds), size = length(index.i) %% nFolds, replace=FALSE))
+ dat$ModelSelectionSplit[index.i] <- c(
+ rep(seq(1:nFolds), each = floor(length(index.i) / nFolds)),
+ sample(seq(1:nFolds), size = length(index.i) %% nFolds, replace = FALSE)
+ )
}
# } else if(spatSplit){ # if spatial split = T
# X<-as.numeric(dat$X[split.mask])
@@ -249,23 +359,29 @@ crossValidationSplit <- function(inputData, # dataframe with field data
# CVsplitY<-apply(outer(Y,Ybreak,"<"),1,sum)
# if(any(CVsplitY==0)) CVsplitY[CVsplitY==0]<-1
# dat$ModelSelectionSplit[index]<-as.numeric(as.factor(paste(CVsplit,CVsplitY)))
-
- } else { # if stratify = F and spatial split = F
-
- index <- index[order(runif(length(index)))]
- dat$ModelSelectionSplit[index] <- c(rep(seq(1:nFolds),each = floor(length(index)/nFolds)),
- sample(seq(1:nFolds), size = length(index) %% nFolds, replace=FALSE))
+ } else {
+ # if stratify = F and spatial split = F
+
+ index <- index[order(runif(length(index)))]
+ dat$ModelSelectionSplit[index] <- c(
+ rep(seq(1:nFolds), each = floor(length(index) / nFolds)),
+ sample(seq(1:nFolds), size = length(index) %% nFolds, replace = FALSE)
+ )
}
-
- if(all(na.omit(response) %in% 0:1) & any(table(dat$Response,dat$ModelSelectionSplit)<2)){
- stop("Some combinations of the response and data split are nearly empty, please use a stratified nonspatial split")
+
+ if (
+ all(na.omit(response) %in% 0:1) &
+ any(table(dat$Response, dat$ModelSelectionSplit) < 2)
+ ) {
+ stop(
+ "Some combinations of the response and data split are nearly empty, please use a stratified nonspatial split"
+ )
}
-
- if(dim(bg.dat)[1] != 0){
+
+ if (dim(bg.dat)[1] != 0) {
names(bg.dat) <- names(dat)
dat <- rbind(dat, bg.dat)
}
-
- return(dat)
-
+
+ return(dat)
}
diff --git a/src/08-fit-model-functions.R b/src/08-fit-model-functions.R
index 5902170..d929547 100644
--- a/src/08-fit-model-functions.R
+++ b/src/08-fit-model-functions.R
@@ -754,12 +754,20 @@ fitModel <- function(
))
}
- modelGAM <- mgcv::gam(
- formula = startModel,
- data = dat,
- family = binomial(link = "logit"),
- weights = wt,
- method = PenaltySelectionMethod
+ modelGAM <- tryCatch(
+ {
+ mgcv::gam(
+ formula = startModel,
+ data = dat,
+ family = binomial(link = "logit"),
+ weights = wt,
+ method = PenaltySelectionMethod
+ )
+ },
+ error = function(err) {
+ message("mgcv::gam() failed: ", err$message)
+ NULL
+ }
)
return(modelGAM)
@@ -1114,6 +1122,14 @@ cv.fct <- function(
fullFit = F
)
+ if (is.null(cv.final.mod)) {
+ stop(paste0(
+ "CV fold ", i, " model fitting failed. ",
+ "Consider removing or reclassifying rare factor levels, reducing the number of CV folds, ",
+ "or reviewing the data for outliers or class imbalance."
+ ))
+ }
+
fitted.values[pred.mask] <- pred.fct(
mod = cv.final.mod,
x = xdat[pred.mask, ],
@@ -1146,7 +1162,34 @@ cv.fct <- function(
u_i <- fitted.values[pred.mask]
weights.subset <- site.weights[pred.mask]
+ # filter NA predictions (e.g. from factor levels absent in this fold's training data)
+ valid_i <- !is.na(u_i)
+ if (any(!valid_i)) {
+ updateRunLog(paste0(
+ "\nWarning: ", sum(!valid_i), " site(s) in CV fold ", i,
+ " could not be predicted and will be excluded from fold evaluation.",
+ " This is likely caused by a factor level absent from this fold's training data.\n"
+ ))
+ y_i <- y_i[valid_i]
+ u_i <- u_i[valid_i]
+ weights.subset <- weights.subset[valid_i]
+ }
+
+ if (all(is.na(u_i))) {
+ stop(paste0(
+ "CV fold ", i, " produced no valid predictions. This is likely caused by a categorical ",
+ "variable with a factor level that is absent from this fold's training data. ",
+ "Consider removing or reclassifying rare factor levels, or reducing the number of CV folds."
+ ))
+ }
+
if (family == "binomial" | family == "bernoulli") {
+ if (length(unique(y_i)) < 2) {
+ stop(paste0(
+ "CV fold ", i, " contains only one response class after excluding unpredictable sites. ",
+ "Consider removing or reclassifying rare factor levels, or reducing the number of CV folds."
+ ))
+ }
subset.resid.deviance[i] <- calc.deviance(
y_i,
u_i,
@@ -1159,17 +1202,20 @@ cv.fct <- function(
} # end of Cross Validation Fold Loop
data$predicted <- fitted.values
+ data <- data[!is.na(data$predicted), ]
u_i <- fitted.values
if (family == "binomial" | family == "bernoulli") {
+ # filter NA predictions from pooled CV metrics
+ valid_pooled <- !is.na(u_i)
cv.resid.deviance <- calc.deviance(
- obs,
- u_i,
- weights = site.weights,
+ obs[valid_pooled],
+ u_i[valid_pooled],
+ weights = site.weights[valid_pooled],
family = "binomial"
)
- cv.test <- roc(obs, u_i)
- cv.calib <- calibration(obs, u_i)
+ cv.test <- roc(obs[valid_pooled], u_i[valid_pooled])
+ cv.calib <- calibration(obs[valid_pooled], u_i[valid_pooled])
}
subset.test.mean <- mean(subset.test)
diff --git a/src/10-ensemble-model.py b/src/10-ensemble-model.py
index 864e759..f3878fe 100644
--- a/src/10-ensemble-model.py
+++ b/src/10-ensemble-model.py
@@ -157,7 +157,7 @@ def norm(dx, min, max):
ps.environment.update_run_log(f' Normalizing raster {i+1}...')
if minVal == maxVal:
ps.environment.update_run_log(
- f' WARNING: Raster {i+1} is constant (all values = {minVal:.4f}); '
+ f'\nWarning: Raster {i+1} is constant (all values = {minVal:.4f}); '
f'this output has no spatial discriminatory power — this may indicate a model or data preparation issue. '
f'This raster should be removed from the ensemble.')
ps.environment.update_run_log(
diff --git a/src/5-prepare-training-testing-data.R b/src/5-prepare-training-testing-data.R
index 1189e37..d99d83c 100644
--- a/src/5-prepare-training-testing-data.R
+++ b/src/5-prepare-training-testing-data.R
@@ -8,7 +8,7 @@
# source dependencies ----------------------------------------------------------
-library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL)
+library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL)
library(terra)
library(tidyr)
library(dplyr)
@@ -20,7 +20,7 @@ source(file.path(packageDir, "05-training-testing-data-prep-functions.R"))
# Set progress bar -------------------------------------------------------------
-steps <- 5
+steps <- 5
updateRunLog('5 - Prepare Training/Testing Data => Begin')
progressBar(type = "begin", totalSteps = steps)
@@ -31,24 +31,36 @@ myProject <- rsyncrosim::project()
myScenario <- scenario()
# temp directory
-ssimTempDir <- ssimEnvironment()$TransferDirectory
+ssimTempDir <- ssimEnvironment()$TransferDirectory
# Read in datasheets
covariatesSheet <- datasheet(myProject, "wisdm_Covariates", optional = T)
fieldDataSheet <- datasheet(myScenario, "wisdm_FieldData", optional = T)
validationDataSheet <- datasheet(myScenario, "wisdm_ValidationOptions")
-siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", optional = T, lookupsAsFactors = F)
+siteDataSheet <- datasheet(
+ myScenario,
+ "wisdm_SiteData",
+ optional = T,
+ lookupsAsFactors = F
+)
# Prep inputs ------------------------------------------------------------------
# identify categorical covariates
-if(sum(covariatesSheet$IsCategorical, na.rm = T)>0){
- factorVars <- covariatesSheet$CovariateName[which(covariatesSheet$IsCategorical == T & covariatesSheet$CovariateName %in% siteDataSheet$CovariatesID)]
- if(length(factorVars)<1){ factorVars <- NULL }
-} else { factorVars <- NULL }
+if (sum(covariatesSheet$IsCategorical, na.rm = T) > 0) {
+ factorVars <- covariatesSheet$CovariateName[which(
+ covariatesSheet$IsCategorical == T &
+ covariatesSheet$CovariateName %in% siteDataSheet$CovariatesID
+ )]
+ if (length(factorVars) < 1) {
+ factorVars <- NULL
+ }
+} else {
+ factorVars <- NULL
+}
-# drop no data (-9999) sites that resulted from spatial aggregation
-fieldDataSheet <- fieldDataSheet[fieldDataSheet$Response != nodataValue,]
+# drop no data (-9999) sites that resulted from spatial aggregation
+fieldDataSheet <- fieldDataSheet[fieldDataSheet$Response != nodataValue, ]
# set response for background sites to zero
bgSiteIds <- fieldDataSheet$SiteID[fieldDataSheet$Response == backgroundValue]
@@ -57,27 +69,37 @@ fieldDataSheet$Response[fieldDataSheet$Response == backgroundValue] <- 0
# preserve field data column names
fieldDataColNames <- names(fieldDataSheet)
-# Set defaults ----------------------------------------------------------------
+# Set defaults ----------------------------------------------------------------
## Validation Sheet
-if(nrow(validationDataSheet)<1){
- validationDataSheet <- safe_rbind(validationDataSheet, data.frame(SplitData = FALSE,
- CrossValidate = FALSE))
+if (nrow(validationDataSheet) < 1) {
+ validationDataSheet <- safe_rbind(
+ validationDataSheet,
+ data.frame(SplitData = FALSE, CrossValidate = FALSE)
+ )
+}
+if (is.na(validationDataSheet$SplitData)) {
+ validationDataSheet$SplitData <- FALSE
}
-if(is.na(validationDataSheet$SplitData)){validationDataSheet$SplitData <- FALSE}
-if(validationDataSheet$SplitData){
- if(is.na(validationDataSheet$ProportionTrainingData)){
+if (validationDataSheet$SplitData) {
+ if (is.na(validationDataSheet$ProportionTrainingData)) {
validationDataSheet$ProportionTrainingData <- 0.5
- updateRunLog("\nTraining proportion not specified. Default value used: 0.5\n")
+ updateRunLog(
+ "\nTraining proportion not specified. Default value used: 0.5\n"
+ )
}
}
-if(is.na(validationDataSheet$CrossValidate)){validationDataSheet$CrossValidate <- FALSE}
-if(validationDataSheet$CrossValidate){
- if(is.na(validationDataSheet$NumberOfFolds)){
+if (is.na(validationDataSheet$CrossValidate)) {
+ validationDataSheet$CrossValidate <- FALSE
+}
+if (validationDataSheet$CrossValidate) {
+ if (is.na(validationDataSheet$NumberOfFolds)) {
updateRunLog("\nNumber of Folds not specified. Default value used: 10\n")
validationDataSheet$NumberOfFolds <- 10
}
- if(is.na(validationDataSheet$StratifyFolds)){validationDataSheet$StratifyFolds <- FALSE}
+ if (is.na(validationDataSheet$StratifyFolds)) {
+ validationDataSheet$StratifyFolds <- FALSE
+ }
}
saveDatasheet(myScenario, validationDataSheet, "wisdm_ValidationOptions")
@@ -87,14 +109,17 @@ progressBar()
siteDataWide <- spread(data = siteDataSheet, key = CovariatesID, value = Value)
inputData <- left_join(fieldDataSheet, siteDataWide) # select(siteDataWide,-PixelID))
-rm(siteDataSheet, fieldDataSheet, siteDataWide); gc()
-
-# Define Train/Test Split (if specified)
-if(validationDataSheet$SplitData){
- inputData <- testTrainSplit(inputData = inputData,
- trainProp = validationDataSheet$ProportionTrainingData,
- # ratioPresAbs = validationDataSheet$RatioPresenceAbsence,
- factorVars = factorVars)
+rm(siteDataSheet, fieldDataSheet, siteDataWide)
+gc()
+
+# Define Train/Test Split (if specified)
+if (validationDataSheet$SplitData) {
+ inputData <- testTrainSplit(
+ inputData = inputData,
+ trainProp = validationDataSheet$ProportionTrainingData,
+ # ratioPresAbs = validationDataSheet$RatioPresenceAbsence, # disabled; not currently exposed as a UI option in wisdm
+ factorVars = factorVars
+ )
} else {
# set all data to be used in model training
inputData$UseInModelEvaluation <- FALSE
@@ -102,48 +127,60 @@ if(validationDataSheet$SplitData){
progressBar()
-# Define Cross Validation folds (if specified)
-if(validationDataSheet$CrossValidate){
-
- inputData <- crossValidationSplit(inputData = inputData,
- factorVars = factorVars,
- nFolds = validationDataSheet$NumberOfFolds,
- stratify = validationDataSheet$StratifyFolds)
+# Define Cross Validation folds (if specified)
+if (validationDataSheet$CrossValidate) {
+ inputData <- crossValidationSplit(
+ inputData = inputData,
+ factorVars = factorVars,
+ nFolds = validationDataSheet$NumberOfFolds,
+ stratify = validationDataSheet$StratifyFolds
+ )
}
progressBar()
# Check categorical variables and update field data sheet (if no validation specified)
-if(validationDataSheet$SplitData == F & validationDataSheet$CrossValidate == F){
- if(!is.null(factorVars)){
- for (i in 1:length(factorVars)){
- factor.table <- table(inputData[,factorVars[i]])
- if(any(factor.table<10)){
- updateRunLog(paste0("\nSome levels for the categorical predictor ",factorVars[i]," do not have at least 10 observations. ",
- "Consider removing or reclassifying this predictor before continuing.\n",
- "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n"))
+if (
+ validationDataSheet$SplitData == F & validationDataSheet$CrossValidate == F
+) {
+ if (!is.null(factorVars)) {
+ for (i in 1:length(factorVars)) {
+ factor.table <- table(inputData[, factorVars[i]])
+ if (any(factor.table < 10)) {
+ updateRunLog(paste0(
+ "\nWarning: Some levels for the categorical predictor ",
+ factorVars[i],
+ " do not have at least 10 observations. ",
+ "Consider removing or reclassifying this predictor before continuing.\n",
+ "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n"
+ ))
factor.table <- as.data.frame(factor.table)
- colnames(factor.table)<-c("Factor Name","Factor Count")
- updateRunLog(paste("\n",factorVars[i],"\n"))
- updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown"))
+ colnames(factor.table) <- c("Factor Name", "Factor Count")
+ updateRunLog(paste("\n", factorVars[i], "\n"))
+ updateRunLog(pander::pandoc.table.return(
+ factor.table,
+ style = "rmarkdown"
+ ))
}
}
}
# set all data to be used in model training
inputData$UseInModelEvaluation <- FALSE
-}
+}
# revert response for background sites
updateFieldData <- dplyr::select(inputData, all_of(fieldDataColNames))
-rm(inputData); gc()
+rm(inputData)
+gc()
-if(length(bgSiteIds) > 0){
- updateFieldData$Response[which(updateFieldData$SiteID %in% bgSiteIds)] <- backgroundValue
+if (length(bgSiteIds) > 0) {
+ updateFieldData$Response[which(
+ updateFieldData$SiteID %in% bgSiteIds
+ )] <- backgroundValue
}
updateFieldData$SiteID <- format(updateFieldData$SiteID, scientific = F)
# save updated field data to scenario
saveDatasheet(myScenario, updateFieldData, "wisdm_FieldData", append = F)
progressBar(type = "end")
-
diff --git a/src/8-fit-gam.R b/src/8-fit-gam.R
index ec20bc6..5e89f04 100644
--- a/src/8-fit-gam.R
+++ b/src/8-fit-gam.R
@@ -256,6 +256,14 @@ progressBar()
finalMod <- fitModel(dat = trainingData, out = out)
+if (is.null(finalMod)) {
+ stop(
+ "Unable to fit GAM with provided data and settings. Review the run log for details. ",
+ "Common issues include covariates with rare factor levels, too many predictors relative ",
+ "to the number of observations, or data quality issues. "
+ )
+}
+
finalMod$trainingData <- trainingData
# add relevant model details to out
diff --git a/src/9-apply-model.R b/src/9-apply-model.R
index f895e21..4fcc4d1 100644
--- a/src/9-apply-model.R
+++ b/src/9-apply-model.R
@@ -720,17 +720,30 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
# ------------------------------------------------------------------
if (outputOptionsSheet$MakeMessMap | outputOptionsSheet$MakeModMap) {
- if (length(factorVars)) {
+ messFactorVars <- intersect(
+ covariatesSheet$CovariateName[covariatesSheet$IsCategorical == TRUE],
+ modVars
+ )
+ if (modType == "maxent") {
+ messFactorVars <- union(messFactorVars, intersect(mod$catVars, modVars))
+ }
+ contVars <- setdiff(modVars, messFactorVars)
+ if (length(contVars) == 0L) {
updateRunLog(
- "\nWarning: MESS and MoD maps cannot be generated for models with categorical variables.\n"
+ "\nWarning: MESS and MoD maps cannot be generated because all model variables are categorical.\n"
)
} else {
- # Prep training data order & precompute structures
- train.dat <- dplyr::select(trainingData, dplyr::all_of(modVars))
- for (k in modVars) {
+ if (length(factorVars) > 0L) {
+ updateRunLog(
+ "\nWarning: MESS and MoD maps will be computed on continuous variables only; categorical variables are excluded.\n"
+ )
+ }
+ # Prep training data order & precompute structures (continuous vars only)
+ train.dat <- dplyr::select(trainingData, dplyr::all_of(contVars))
+ for (k in contVars) {
train.dat[, k] <- sort(train.dat[, k])
}
- prep <- prepare_mess(train.dat, var_names = modVars)
+ prep <- prepare_mess(train.dat, var_names = contVars)
# ----- MESS -----
if (outputOptionsSheet$MakeMessMap) {
@@ -754,7 +767,7 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
tmp_fin <- file.path(ssimTempDir, "mess_finalized.tif")
r_main <- terra::predict(
- object = covs,
+ object = covs[[contVars]],
model = prep,
fun = mess_fun,
filename = tmp_mess,
@@ -763,7 +776,12 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
# Apply restriction mask and update tmp_mess so src_for_final is correct
if (!is.null(restrictRaster)) {
- r_main <- terra::mask(r_main, restrictRaster, maskvalues = 0)
+ restrict_aligned <- terra::resample(
+ restrictRaster,
+ r_main,
+ method = "near"
+ )
+ r_main <- terra::mask(r_main, restrict_aligned, maskvalues = 0)
terra::writeRaster(r_main, tmp_mess, overwrite = TRUE)
}
@@ -842,7 +860,7 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
)
r_main <- terra::predict(
- object = covs,
+ object = covs[[contVars]],
model = prep,
fun = mod_fun,
name_to_id = name_to_id,
@@ -852,7 +870,12 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
# Apply restriction mask and update tmp_mod so src_for_final is correct
if (!is.null(restrictRaster)) {
- r_main <- terra::mask(r_main, restrictRaster, maskvalues = 0)
+ restrict_aligned <- terra::resample(
+ restrictRaster,
+ r_main,
+ method = "near"
+ )
+ r_main <- terra::mask(r_main, restrict_aligned, maskvalues = 0)
terra::writeRaster(r_main, tmp_mod, overwrite = TRUE)
}
@@ -943,13 +966,13 @@ for (i in seq_len(nrow(modelOutputsSheet))) {
paste0(modType, "_bin_map.tif")
)
}
- if (outputOptionsSheet$MakeMessMap && length(factorVars) == 0L) {
+ if (outputOptionsSheet$MakeMessMap && length(contVars) > 0L) {
spatialOutputsSheet$MessRaster[outputRow] <- file.path(
ssimTempDir,
paste0(modType, "_mess_map.tif")
)
}
- if (outputOptionsSheet$MakeModMap && length(factorVars) == 0L) {
+ if (outputOptionsSheet$MakeModMap && length(contVars) > 0L) {
spatialOutputsSheet$ModRaster[outputRow] <- file.path(
ssimTempDir,
paste0(modType, "_mod_map.tif")
diff --git a/src/package.xml b/src/package.xml
index 0bda412..c15a41b 100644
--- a/src/package.xml
+++ b/src/package.xml
@@ -154,7 +154,7 @@
-
+
@@ -163,7 +163,7 @@
-
+
@@ -194,7 +194,7 @@
-
+
@@ -216,7 +216,7 @@
-
+
@@ -226,7 +226,7 @@
-
+
@@ -234,7 +234,7 @@
-
+