diff --git a/src/05-training-testing-data-prep-functions.R b/src/05-training-testing-data-prep-functions.R index d1ff3ce..c887c2e 100644 --- a/src/05-training-testing-data-prep-functions.R +++ b/src/05-training-testing-data-prep-functions.R @@ -1,22 +1,22 @@ ## ----------------------------------- -## wisdm - data preparation functions -## ApexRMS, March 2022 +## wisdm - data preparation functions +## ApexRMS, March 2022 ## ----------------------------------- # DATA PREP FUNCTIONS ---------------------------------------------------------- ## Split data for Training/Testing --------------------------------------------- -testTrainSplit <- function(inputData, # dataframe with field data and Covariate Values - trainProp, # proportion of data to use in model training - factorVars = NULL, # names of categorical variables (if any) - ratioPresAbs = NULL){ # ratio of presence to absence points that should be used - - +testTrainSplit <- function( + inputData, # dataframe with field data and Covariate Values + trainProp, # proportion of data to use in model training + factorVars = NULL, # names of categorical variables (if any) + ratioPresAbs = NULL # ratio of presence to absence points that should be used +) { # Description: # Given a training proportion, each observation is assigned to the - # test or training split.The split will be balanced with respect to the response - # in that the training proportion will be matched as close as possible for each + # test or training split.The split will be balanced with respect to the response + # in that the training proportion will be matched as close as possible for each # level of the response. # An optional parameter, ratioPresAbs, can be used to specify if there is a certain # ratio of presence to absence points that should be used this ensures the given @@ -24,158 +24,243 @@ testTrainSplit <- function(inputData, # dataframe with field data an # all responses greater than or equal to 1 will be used to meet this ratio. # This option reduces the sample size as some data must be thrown away to meet # the constraint of having the desired proportion. - # Background points are ignored by this module (they are read in and written out, - # but not assigned to either the test or training split). - - if(trainProp<=0 | trainProp>1){ stop("Training Proportion must be a number between 0 and 1 excluding 0") } - if(!is.null(ratioPresAbs)) { - if(ratioPresAbs<=0) stop("The ratio of presence to absence (ratioPresAbs) must be a \ngreater than 0")} - + # Background points are ignored by this module (they are read in and written out, + # but not assigned to either the test or training split). + + if (trainProp <= 0 | trainProp > 1) { + stop("Training Proportion must be a number between 0 and 1 excluding 0") + } + if (!is.null(ratioPresAbs)) { + if (ratioPresAbs <= 0) { + stop( + "The ratio of presence to absence (ratioPresAbs) must be a \ngreater than 0" + ) + } + } + #Read input data and remove any columns to be excluded dat <- inputData response <- dat$Response - - if(sum(as.numeric(response)==0)==0 && !is.null(ratioPresAbs)) stop("The ratio of presence to absence cannot be set with only presence data") - + + if (sum(as.numeric(response) == 0) == 0 && !is.null(ratioPresAbs)) { + stop( + "The ratio of presence to absence cannot be set with only presence data" + ) + } + # Ignore background data if present - bg.dat <- dat[response == nodataValue,] - - if(dim(bg.dat)[1] != 0){ - dat <- dat[-c(which(response == nodataValue, arr.ind=TRUE)),] - response <- response[-c(which(response==nodataValue, arr.ind=TRUE))] + bg.dat <- dat[response == nodataValue, ] + + if (dim(bg.dat)[1] != 0) { + dat <- dat[-c(which(response == nodataValue, arr.ind = TRUE)), ] + response <- response[-c(which(response == nodataValue, arr.ind = TRUE))] } ## Warnings section ## - + # tagging factors and looking at their levels warning users if their factors have few levels - if(!is.null(factorVars)){ - for (i in 1:length(factorVars)){ - factor.table <- table(dat[,factorVars[i]]) - if(any(factor.table<10)){ updateRunLog(paste0("\nSome levels for the categorical predictor ",factorVars[i]," do not have at least 10 observations. ", - "Consider removing or reclassifying this predictor before continuing.\n", - "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n")) + if (!is.null(factorVars)) { + for (i in 1:length(factorVars)) { + factor.table <- table(dat[, factorVars[i]]) + if (any(factor.table < 10)) { + updateRunLog(paste0( + "\nSome levels for the categorical predictor ", + factorVars[i], + " do not have at least 10 observations. ", + "Consider removing or reclassifying this predictor before continuing.\n", + "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n" + )) factor.table <- as.data.frame(factor.table) - colnames(factor.table)<-c("Factor Name","Factor Count") - updateRunLog(paste("\n",factorVars[i],"\n")) - updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown")) + colnames(factor.table) <- c("Factor Name", "Factor Count") + updateRunLog(paste("\n", factorVars[i], "\n")) + updateRunLog(pander::pandoc.table.return( + factor.table, + style = "rmarkdown" + )) } } } - if(length(response)<100){ stop("A test training split is not advisable for less than 100 observations. Consider-cross validation as an alternative.")} - if(length(response)<200){ - updateRunLog(paste0("\nThere are less than 200 observations. Cross-validation might be preferable to a test/training split./n", - "Weigh the decision while keeping in mind the number of predictors being considered: ", ncol(dat)-7,"/n")) + if (length(response) < 100) { + stop( + "A test training split is not advisable for less than 100 observations. Consider-cross validation as an alternative." + ) + } + if (length(response) < 200) { + updateRunLog(paste0( + "\nThere are less than 200 observations. Cross-validation might be preferable to a test/training split.\n", + "Weigh the decision while keeping in mind the number of predictors being considered: ", + ncol(dat) - 7, + "\n" + )) } - if(all(na.omit(response) %in% 0:1) & any(table(response)<10)){ - stop("Use of a test training split is not recommended when the dataset contains less than 10 presence or absence points") + if (all(na.omit(response) %in% 0:1) & any(table(response) < 10)) { + stop( + "Use of a test training split is not recommended when the dataset contains less than 10 presence or absence points" + ) } ## End warnings section ## - - temp <- if(!is.null(ratioPresAbs)){(sum(response>=1)/sum(response==0) == ratioPresAbs)} - if(is.null(temp)){ temp <- FALSE } - - if(is.null(ratioPresAbs) | temp){ + + temp <- if (!is.null(ratioPresAbs)) { + (sum(response >= 1) / sum(response == 0) == ratioPresAbs) + } + if (is.null(temp)) { + temp <- FALSE + } + + if (is.null(ratioPresAbs) | temp) { # Randomly sample presence absence or counts as close to the size of the training proportion as possible - + # iterate through each unique response and randomly assign the trainProp to be in the training split TrainSplit <- numeric() - for(i in sort(as.numeric(unique(response)))){ - TrainSplit <- c(TrainSplit, sample(which(response == i, arr.ind=TRUE), size = round(sum(response==i)*trainProp))) + for (i in sort(as.numeric(unique(response)))) { + TrainSplit <- c( + TrainSplit, + sample( + which(response == i, arr.ind = TRUE), + size = round(sum(response == i) * trainProp) + ) + ) } - + # take everything not in the training set for the test set - TestSplit <- which(!(seq(1:length(response))) %in% TrainSplit, arr.ind=TRUE) - + TestSplit <- which( + !(seq(1:length(response))) %in% TrainSplit, + arr.ind = TRUE + ) + dat$UseInModelEvaluation[TrainSplit] <- F dat$UseInModelEvaluation[TestSplit] <- T + } else { + # now considering if there is a desired ratio of presence to absence points - } else { # now considering if there is a desired ratio of presence to absence points - - if(sum(response>=1)/sum(response==0) >= ratioPresAbs){ - + if (sum(response >= 1) / sum(response == 0) >= ratioPresAbs) { # first determine how many presence points to remove - TotToRmv <- (sum(response>=1)-ratioPresAbs*sum(response==0)) - if(TotToRmv/sum(response>=1) > 0.5){ - updateRunLog("\nWarning: Over 50% of the presence points were removed to meet the desired ratio of presence to absence\n")} - + TotToRmv <- (sum(response >= 1) - ratioPresAbs * sum(response == 0)) + if (TotToRmv / sum(response >= 1) > 0.5) { + updateRunLog( + "\nWarning: Over 50% of the presence points were removed to meet the desired ratio of presence to absence\n" + ) + } + # determine the number of each count to remove weighted by the response and then remove these - EachToRmv <- round(TotToRmv * table(response[response!= 0])/sum(response!= 0)) - ByCount <- split(cbind(which(response != 0, arr.ind=TRUE)),f = response[response!=0]) - + EachToRmv <- round( + TotToRmv * table(response[response != 0]) / sum(response != 0) + ) + ByCount <- split( + cbind(which(response != 0, arr.ind = TRUE)), + f = response[response != 0] + ) + # sampling function ---- - sam <- function(x,size){ - if(length(x)==1 & size==1){ return(x) - } else { sample(x=x,size=size) } + sam <- function(x, size) { + if (length(x) == 1 & size == 1) { + return(x) + } else { + sample(x = x, size = size) + } } - - RmvIndx <- as.vector(unlist(mapply(sam,x=ByCount,size=EachToRmv))) + + RmvIndx <- as.vector(unlist(mapply(sam, x = ByCount, size = EachToRmv))) KeepIndx <- seq(1:length(response))[-c(RmvIndx)] respKeep <- response[KeepIndx] names(respKeep) <- KeepIndx - + # now break these into a train an test split while TrainSplit <- numeric() - - for(i in sort(as.numeric(unique(respKeep)))){ - TrainSplit <- c(TrainSplit, sample(names(respKeep[respKeep==i]), size=round(sum(respKeep==i)*trainProp))) + + for (i in sort(as.numeric(unique(respKeep)))) { + TrainSplit <- c( + TrainSplit, + sample( + names(respKeep[respKeep == i]), + size = round(sum(respKeep == i) * trainProp) + ) + ) } TrainSplit <- as.numeric(TrainSplit) # Take everything not in the training set or in the remove list for the test set - TestSplit <- seq(from=1,to=length(response))[-c(c(TrainSplit,RmvIndx))] - + TestSplit <- seq(from = 1, to = length(response))[ + -c(c(TrainSplit, RmvIndx)) + ] + dat$UseInModelEvaluation[TrainSplit] <- F dat$UseInModelEvaluation[TestSplit] <- T - } - - if(sum(response>=1)/sum(response==0) < ratioPresAbs){ - + + if (sum(response >= 1) / sum(response == 0) < ratioPresAbs) { # first balance all responses greater than 1 TrainSplit <- numeric() - for(i in sort(as.numeric(unique(response[response!=0])))){ - TrainSplit <- c(TrainSplit, sample(which(response==i, arr.ind=TRUE), size = round(sum(response==i)*trainProp))) + for (i in sort(as.numeric(unique(response[response != 0])))) { + TrainSplit <- c( + TrainSplit, + sample( + which(response == i, arr.ind = TRUE), + size = round(sum(response == i) * trainProp) + ) + ) + } + if ( + (sum(response == 0) - sum(response >= 1) / ratioPresAbs) / + sum(response == 0) > + 0.5 + ) { + updateRunLog( + "\nWarning: Over 50% of the absence points were removed to meet the desired ratio of presence to absence\n" + ) } - if((sum(response==0)-sum(response>=1)/ratioPresAbs)/sum(response==0) > 0.5){ - updateRunLog("\nWarning: Over 50% of the absence points were removed to meet the desired ratio of presence to absence\n")} - + # sample the right number of absence points for the train split - TrainSplit <- c(TrainSplit, sample(which(response==0, arr.ind=TRUE), size=round(sum(response>=1)*(1/ratioPresAbs)*trainProp))) - + TrainSplit <- c( + TrainSplit, + sample( + which(response == 0, arr.ind = TRUE), + size = round(sum(response >= 1) * (1 / ratioPresAbs) * trainProp) + ) + ) + # take everything not in the training set for the test set - TestSplit <- which(!(seq(1:length(response))) %in% TrainSplit, arr.ind=TRUE) - + TestSplit <- which( + !(seq(1:length(response))) %in% TrainSplit, + arr.ind = TRUE + ) + # now sample some points to remove so we have the correct proportion - temp <- sample(which(TestSplit %in% which(response==0, arr.ind=TRUE), arr.ind=TRUE), - size = round(sum(TestSplit %in% which(response==0, arr.ind=TRUE)) - - (1-trainProp)*sum(response>=1)*(1/ratioPresAbs))) + temp <- sample( + which( + TestSplit %in% which(response == 0, arr.ind = TRUE), + arr.ind = TRUE + ), + size = round( + sum(TestSplit %in% which(response == 0, arr.ind = TRUE)) - + (1 - trainProp) * sum(response >= 1) * (1 / ratioPresAbs) + ) + ) TestSplit <- TestSplit[-c(temp)] - + dat$UseInModelEvaluation[TrainSplit] <- F dat$UseInModelEvaluation[TestSplit] <- T - } - + # dat <- dat[c(TrainSplit,TestSplit),] - } - - if(dim(bg.dat)[1] != 0) { + + if (dim(bg.dat)[1] != 0) { names(bg.dat) <- names(dat) dat <- rbind(dat, bg.dat) - } - - return(dat) + } + return(dat) } ## Split data for Cross Validation --------------------------------------------- -crossValidationSplit <- function(inputData, # dataframe with field data and Covariate Values - factorVars, # names of categorical variables (if any) - nFolds = 10, # ValidationDataSheet$NumberOfFolds - stratify = FALSE, # ValidationDataSheet$StratifyFolds, - spatSplit = FALSE){ - +crossValidationSplit <- function( + inputData, # dataframe with field data and Covariate Values + factorVars, # names of categorical variables (if any) + nFolds = 10, # ValidationDataSheet$NumberOfFolds + stratify = FALSE, # ValidationDataSheet$StratifyFolds, + spatSplit = FALSE +) { # Description: # this code takes as input an mds file with the first line being the predictor or # response name, the second line an indicator of predictors to include and the @@ -186,58 +271,83 @@ crossValidationSplit <- function(inputData, # dataframe with field data # will be assigned a fold. Optional stratification by response is also available # Background points are ignored # by this module (they are read in, written out, but not assigned to cv folds. - + # cross validation can only be used for model selection - - options(warn=1) - if(nFolds<=1 | nFolds%%1!=0) stop("Number of folds must be an integer greater than 1") - + + options(warn = 1) + if (nFolds <= 1 | nFolds %% 1 != 0) { + stop("Number of folds must be an integer greater than 1") + } + # read input data and remove any columns to be excluded dat <- inputData response <- inputData$Response - - if(sum(as.numeric(response)==0)==0 && !is.null(stratify)){ - stop("Cross Validation requires absence data.") + + if (sum(as.numeric(response) == 0) == 0 && stratify) { + stop("Stratified cross-validation requires absence data.") } - + # ignore background data (if present) - bg.dat <- dat[response == nodataValue,] + bg.dat <- dat[response == nodataValue, ] + + if (dim(bg.dat)[1] != 0) { + dat <- dat[-c(which(response == nodataValue, arr.ind = TRUE)), ] + response <- response[-c(which(response == nodataValue, arr.ind = TRUE))] + } - if(dim(bg.dat)[1]!=0){ - dat <- dat[-c(which(response==nodataValue, arr.ind=TRUE)),] - response <- response[-c(which(response==nodataValue, arr.ind=TRUE))] + if (nFolds > nrow(dat)) { + stop(paste0( + "Number of CV folds (", + nFolds, + ") exceeds the number of available observations (", + nrow(dat), + "). Reduce the number of folds before continuing." + )) } - - + # tagging factors and looking at their levels warning users if their factors have few levels - if(!is.null(factorVars)){ - for (i in 1:length(factorVars)){ - factor.table <- table(dat[,factorVars[i]]) - if(any(factor.table<10)){ - updateRunLog(paste0("\nSome levels for the categorical predictor ", factorVars[i], " do not have at least 10 observations.", - " Consider removing or reclassifying this predictor before continuing.\n", - "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n")) + if (!is.null(factorVars)) { + for (i in 1:length(factorVars)) { + factor.table <- table(dat[, factorVars[i]]) + if (any(factor.table < nFolds)) { + updateRunLog(paste0( + "\nWarning: Some levels for the categorical predictor ", + factorVars[i], + " have fewer observations than the number of CV folds (", + nFolds, + "). \n", + "This increases the likelihood that a CV training fold will not contain all factor levels,", + " which may cause prediction failures during cross-validation.\n", + "Consider removing or reclassifying rare levels, or reducing the number of CV folds before continuing.\n" + )) factor.table <- as.data.frame(factor.table) - colnames(factor.table)<-c("Factor Name","Factor Count") - updateRunLog(paste("\n",factorVars[i],"\n")) - updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown")) + colnames(factor.table) <- c("Factor Level", "Observation Count") + updateRunLog(paste("\n", factorVars[i], "\n")) + updateRunLog(pander::pandoc.table.return( + factor.table, + style = "rmarkdown" + )) } } } # this splits the training set - if(any(dat$UseInModelEvaluation == TRUE)){ - split.mask <- dat$UseInModelEvaluation == FALSE # FALSE = train, TRUE = test + if (any(dat$UseInModelEvaluation == TRUE)) { + split.mask <- dat$UseInModelEvaluation == FALSE # FALSE = train, TRUE = test index <- seq(1:nrow(dat))[split.mask] - } else { split.mask <- index <- seq(1:nrow(dat)) } - - - if(stratify & !spatSplit){ # if stratify = T and spatial split = F - - for(i in 1:length(names(table(response)))){ + } else { + split.mask <- index <- seq(1:nrow(dat)) + } + + if (stratify & !spatSplit) { + # if stratify = T and spatial split = F + + for (i in 1:length(names(table(response)))) { index.i <- index[response[split.mask] == names(table(response))[i]] index.i <- index.i[order(runif(length(index.i)))] - dat$ModelSelectionSplit[index.i] <- c(rep(seq(1:nFolds), each = floor(length(index.i)/nFolds)), - sample(seq(1:nFolds), size = length(index.i) %% nFolds, replace=FALSE)) + dat$ModelSelectionSplit[index.i] <- c( + rep(seq(1:nFolds), each = floor(length(index.i) / nFolds)), + sample(seq(1:nFolds), size = length(index.i) %% nFolds, replace = FALSE) + ) } # } else if(spatSplit){ # if spatial split = T # X<-as.numeric(dat$X[split.mask]) @@ -249,23 +359,29 @@ crossValidationSplit <- function(inputData, # dataframe with field data # CVsplitY<-apply(outer(Y,Ybreak,"<"),1,sum) # if(any(CVsplitY==0)) CVsplitY[CVsplitY==0]<-1 # dat$ModelSelectionSplit[index]<-as.numeric(as.factor(paste(CVsplit,CVsplitY))) - - } else { # if stratify = F and spatial split = F - - index <- index[order(runif(length(index)))] - dat$ModelSelectionSplit[index] <- c(rep(seq(1:nFolds),each = floor(length(index)/nFolds)), - sample(seq(1:nFolds), size = length(index) %% nFolds, replace=FALSE)) + } else { + # if stratify = F and spatial split = F + + index <- index[order(runif(length(index)))] + dat$ModelSelectionSplit[index] <- c( + rep(seq(1:nFolds), each = floor(length(index) / nFolds)), + sample(seq(1:nFolds), size = length(index) %% nFolds, replace = FALSE) + ) } - - if(all(na.omit(response) %in% 0:1) & any(table(dat$Response,dat$ModelSelectionSplit)<2)){ - stop("Some combinations of the response and data split are nearly empty, please use a stratified nonspatial split") + + if ( + all(na.omit(response) %in% 0:1) & + any(table(dat$Response, dat$ModelSelectionSplit) < 2) + ) { + stop( + "Some combinations of the response and data split are nearly empty, please use a stratified nonspatial split" + ) } - - if(dim(bg.dat)[1] != 0){ + + if (dim(bg.dat)[1] != 0) { names(bg.dat) <- names(dat) dat <- rbind(dat, bg.dat) } - - return(dat) - + + return(dat) } diff --git a/src/08-fit-model-functions.R b/src/08-fit-model-functions.R index 5902170..d929547 100644 --- a/src/08-fit-model-functions.R +++ b/src/08-fit-model-functions.R @@ -754,12 +754,20 @@ fitModel <- function( )) } - modelGAM <- mgcv::gam( - formula = startModel, - data = dat, - family = binomial(link = "logit"), - weights = wt, - method = PenaltySelectionMethod + modelGAM <- tryCatch( + { + mgcv::gam( + formula = startModel, + data = dat, + family = binomial(link = "logit"), + weights = wt, + method = PenaltySelectionMethod + ) + }, + error = function(err) { + message("mgcv::gam() failed: ", err$message) + NULL + } ) return(modelGAM) @@ -1114,6 +1122,14 @@ cv.fct <- function( fullFit = F ) + if (is.null(cv.final.mod)) { + stop(paste0( + "CV fold ", i, " model fitting failed. ", + "Consider removing or reclassifying rare factor levels, reducing the number of CV folds, ", + "or reviewing the data for outliers or class imbalance." + )) + } + fitted.values[pred.mask] <- pred.fct( mod = cv.final.mod, x = xdat[pred.mask, ], @@ -1146,7 +1162,34 @@ cv.fct <- function( u_i <- fitted.values[pred.mask] weights.subset <- site.weights[pred.mask] + # filter NA predictions (e.g. from factor levels absent in this fold's training data) + valid_i <- !is.na(u_i) + if (any(!valid_i)) { + updateRunLog(paste0( + "\nWarning: ", sum(!valid_i), " site(s) in CV fold ", i, + " could not be predicted and will be excluded from fold evaluation.", + " This is likely caused by a factor level absent from this fold's training data.\n" + )) + y_i <- y_i[valid_i] + u_i <- u_i[valid_i] + weights.subset <- weights.subset[valid_i] + } + + if (all(is.na(u_i))) { + stop(paste0( + "CV fold ", i, " produced no valid predictions. This is likely caused by a categorical ", + "variable with a factor level that is absent from this fold's training data. ", + "Consider removing or reclassifying rare factor levels, or reducing the number of CV folds." + )) + } + if (family == "binomial" | family == "bernoulli") { + if (length(unique(y_i)) < 2) { + stop(paste0( + "CV fold ", i, " contains only one response class after excluding unpredictable sites. ", + "Consider removing or reclassifying rare factor levels, or reducing the number of CV folds." + )) + } subset.resid.deviance[i] <- calc.deviance( y_i, u_i, @@ -1159,17 +1202,20 @@ cv.fct <- function( } # end of Cross Validation Fold Loop data$predicted <- fitted.values + data <- data[!is.na(data$predicted), ] u_i <- fitted.values if (family == "binomial" | family == "bernoulli") { + # filter NA predictions from pooled CV metrics + valid_pooled <- !is.na(u_i) cv.resid.deviance <- calc.deviance( - obs, - u_i, - weights = site.weights, + obs[valid_pooled], + u_i[valid_pooled], + weights = site.weights[valid_pooled], family = "binomial" ) - cv.test <- roc(obs, u_i) - cv.calib <- calibration(obs, u_i) + cv.test <- roc(obs[valid_pooled], u_i[valid_pooled]) + cv.calib <- calibration(obs[valid_pooled], u_i[valid_pooled]) } subset.test.mean <- mean(subset.test) diff --git a/src/10-ensemble-model.py b/src/10-ensemble-model.py index 864e759..f3878fe 100644 --- a/src/10-ensemble-model.py +++ b/src/10-ensemble-model.py @@ -157,7 +157,7 @@ def norm(dx, min, max): ps.environment.update_run_log(f' Normalizing raster {i+1}...') if minVal == maxVal: ps.environment.update_run_log( - f' WARNING: Raster {i+1} is constant (all values = {minVal:.4f}); ' + f'\nWarning: Raster {i+1} is constant (all values = {minVal:.4f}); ' f'this output has no spatial discriminatory power — this may indicate a model or data preparation issue. ' f'This raster should be removed from the ensemble.') ps.environment.update_run_log( diff --git a/src/5-prepare-training-testing-data.R b/src/5-prepare-training-testing-data.R index 1189e37..d99d83c 100644 --- a/src/5-prepare-training-testing-data.R +++ b/src/5-prepare-training-testing-data.R @@ -8,7 +8,7 @@ # source dependencies ---------------------------------------------------------- -library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) +library(rsyncrosim) # install.packages("C:/GitHub/rsyncrosim", type="source", repos=NULL) library(terra) library(tidyr) library(dplyr) @@ -20,7 +20,7 @@ source(file.path(packageDir, "05-training-testing-data-prep-functions.R")) # Set progress bar ------------------------------------------------------------- -steps <- 5 +steps <- 5 updateRunLog('5 - Prepare Training/Testing Data => Begin') progressBar(type = "begin", totalSteps = steps) @@ -31,24 +31,36 @@ myProject <- rsyncrosim::project() myScenario <- scenario() # temp directory -ssimTempDir <- ssimEnvironment()$TransferDirectory +ssimTempDir <- ssimEnvironment()$TransferDirectory # Read in datasheets covariatesSheet <- datasheet(myProject, "wisdm_Covariates", optional = T) fieldDataSheet <- datasheet(myScenario, "wisdm_FieldData", optional = T) validationDataSheet <- datasheet(myScenario, "wisdm_ValidationOptions") -siteDataSheet <- datasheet(myScenario, "wisdm_SiteData", optional = T, lookupsAsFactors = F) +siteDataSheet <- datasheet( + myScenario, + "wisdm_SiteData", + optional = T, + lookupsAsFactors = F +) # Prep inputs ------------------------------------------------------------------ # identify categorical covariates -if(sum(covariatesSheet$IsCategorical, na.rm = T)>0){ - factorVars <- covariatesSheet$CovariateName[which(covariatesSheet$IsCategorical == T & covariatesSheet$CovariateName %in% siteDataSheet$CovariatesID)] - if(length(factorVars)<1){ factorVars <- NULL } -} else { factorVars <- NULL } +if (sum(covariatesSheet$IsCategorical, na.rm = T) > 0) { + factorVars <- covariatesSheet$CovariateName[which( + covariatesSheet$IsCategorical == T & + covariatesSheet$CovariateName %in% siteDataSheet$CovariatesID + )] + if (length(factorVars) < 1) { + factorVars <- NULL + } +} else { + factorVars <- NULL +} -# drop no data (-9999) sites that resulted from spatial aggregation -fieldDataSheet <- fieldDataSheet[fieldDataSheet$Response != nodataValue,] +# drop no data (-9999) sites that resulted from spatial aggregation +fieldDataSheet <- fieldDataSheet[fieldDataSheet$Response != nodataValue, ] # set response for background sites to zero bgSiteIds <- fieldDataSheet$SiteID[fieldDataSheet$Response == backgroundValue] @@ -57,27 +69,37 @@ fieldDataSheet$Response[fieldDataSheet$Response == backgroundValue] <- 0 # preserve field data column names fieldDataColNames <- names(fieldDataSheet) -# Set defaults ---------------------------------------------------------------- +# Set defaults ---------------------------------------------------------------- ## Validation Sheet -if(nrow(validationDataSheet)<1){ - validationDataSheet <- safe_rbind(validationDataSheet, data.frame(SplitData = FALSE, - CrossValidate = FALSE)) +if (nrow(validationDataSheet) < 1) { + validationDataSheet <- safe_rbind( + validationDataSheet, + data.frame(SplitData = FALSE, CrossValidate = FALSE) + ) +} +if (is.na(validationDataSheet$SplitData)) { + validationDataSheet$SplitData <- FALSE } -if(is.na(validationDataSheet$SplitData)){validationDataSheet$SplitData <- FALSE} -if(validationDataSheet$SplitData){ - if(is.na(validationDataSheet$ProportionTrainingData)){ +if (validationDataSheet$SplitData) { + if (is.na(validationDataSheet$ProportionTrainingData)) { validationDataSheet$ProportionTrainingData <- 0.5 - updateRunLog("\nTraining proportion not specified. Default value used: 0.5\n") + updateRunLog( + "\nTraining proportion not specified. Default value used: 0.5\n" + ) } } -if(is.na(validationDataSheet$CrossValidate)){validationDataSheet$CrossValidate <- FALSE} -if(validationDataSheet$CrossValidate){ - if(is.na(validationDataSheet$NumberOfFolds)){ +if (is.na(validationDataSheet$CrossValidate)) { + validationDataSheet$CrossValidate <- FALSE +} +if (validationDataSheet$CrossValidate) { + if (is.na(validationDataSheet$NumberOfFolds)) { updateRunLog("\nNumber of Folds not specified. Default value used: 10\n") validationDataSheet$NumberOfFolds <- 10 } - if(is.na(validationDataSheet$StratifyFolds)){validationDataSheet$StratifyFolds <- FALSE} + if (is.na(validationDataSheet$StratifyFolds)) { + validationDataSheet$StratifyFolds <- FALSE + } } saveDatasheet(myScenario, validationDataSheet, "wisdm_ValidationOptions") @@ -87,14 +109,17 @@ progressBar() siteDataWide <- spread(data = siteDataSheet, key = CovariatesID, value = Value) inputData <- left_join(fieldDataSheet, siteDataWide) # select(siteDataWide,-PixelID)) -rm(siteDataSheet, fieldDataSheet, siteDataWide); gc() - -# Define Train/Test Split (if specified) -if(validationDataSheet$SplitData){ - inputData <- testTrainSplit(inputData = inputData, - trainProp = validationDataSheet$ProportionTrainingData, - # ratioPresAbs = validationDataSheet$RatioPresenceAbsence, - factorVars = factorVars) +rm(siteDataSheet, fieldDataSheet, siteDataWide) +gc() + +# Define Train/Test Split (if specified) +if (validationDataSheet$SplitData) { + inputData <- testTrainSplit( + inputData = inputData, + trainProp = validationDataSheet$ProportionTrainingData, + # ratioPresAbs = validationDataSheet$RatioPresenceAbsence, # disabled; not currently exposed as a UI option in wisdm + factorVars = factorVars + ) } else { # set all data to be used in model training inputData$UseInModelEvaluation <- FALSE @@ -102,48 +127,60 @@ if(validationDataSheet$SplitData){ progressBar() -# Define Cross Validation folds (if specified) -if(validationDataSheet$CrossValidate){ - - inputData <- crossValidationSplit(inputData = inputData, - factorVars = factorVars, - nFolds = validationDataSheet$NumberOfFolds, - stratify = validationDataSheet$StratifyFolds) +# Define Cross Validation folds (if specified) +if (validationDataSheet$CrossValidate) { + inputData <- crossValidationSplit( + inputData = inputData, + factorVars = factorVars, + nFolds = validationDataSheet$NumberOfFolds, + stratify = validationDataSheet$StratifyFolds + ) } progressBar() # Check categorical variables and update field data sheet (if no validation specified) -if(validationDataSheet$SplitData == F & validationDataSheet$CrossValidate == F){ - if(!is.null(factorVars)){ - for (i in 1:length(factorVars)){ - factor.table <- table(inputData[,factorVars[i]]) - if(any(factor.table<10)){ - updateRunLog(paste0("\nSome levels for the categorical predictor ",factorVars[i]," do not have at least 10 observations. ", - "Consider removing or reclassifying this predictor before continuing.\n", - "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n")) +if ( + validationDataSheet$SplitData == F & validationDataSheet$CrossValidate == F +) { + if (!is.null(factorVars)) { + for (i in 1:length(factorVars)) { + factor.table <- table(inputData[, factorVars[i]]) + if (any(factor.table < 10)) { + updateRunLog(paste0( + "\nWarning: Some levels for the categorical predictor ", + factorVars[i], + " do not have at least 10 observations. ", + "Consider removing or reclassifying this predictor before continuing.\n", + "Factors with few observations can cause failure in model fitting when the data is split and cannot be reliably used in training a model.\n" + )) factor.table <- as.data.frame(factor.table) - colnames(factor.table)<-c("Factor Name","Factor Count") - updateRunLog(paste("\n",factorVars[i],"\n")) - updateRunLog(pander::pandoc.table.return(factor.table, style = "rmarkdown")) + colnames(factor.table) <- c("Factor Name", "Factor Count") + updateRunLog(paste("\n", factorVars[i], "\n")) + updateRunLog(pander::pandoc.table.return( + factor.table, + style = "rmarkdown" + )) } } } # set all data to be used in model training inputData$UseInModelEvaluation <- FALSE -} +} # revert response for background sites updateFieldData <- dplyr::select(inputData, all_of(fieldDataColNames)) -rm(inputData); gc() +rm(inputData) +gc() -if(length(bgSiteIds) > 0){ - updateFieldData$Response[which(updateFieldData$SiteID %in% bgSiteIds)] <- backgroundValue +if (length(bgSiteIds) > 0) { + updateFieldData$Response[which( + updateFieldData$SiteID %in% bgSiteIds + )] <- backgroundValue } updateFieldData$SiteID <- format(updateFieldData$SiteID, scientific = F) # save updated field data to scenario saveDatasheet(myScenario, updateFieldData, "wisdm_FieldData", append = F) progressBar(type = "end") - diff --git a/src/8-fit-gam.R b/src/8-fit-gam.R index ec20bc6..5e89f04 100644 --- a/src/8-fit-gam.R +++ b/src/8-fit-gam.R @@ -256,6 +256,14 @@ progressBar() finalMod <- fitModel(dat = trainingData, out = out) +if (is.null(finalMod)) { + stop( + "Unable to fit GAM with provided data and settings. Review the run log for details. ", + "Common issues include covariates with rare factor levels, too many predictors relative ", + "to the number of observations, or data quality issues. " + ) +} + finalMod$trainingData <- trainingData # add relevant model details to out diff --git a/src/9-apply-model.R b/src/9-apply-model.R index f895e21..4fcc4d1 100644 --- a/src/9-apply-model.R +++ b/src/9-apply-model.R @@ -720,17 +720,30 @@ for (i in seq_len(nrow(modelOutputsSheet))) { # ------------------------------------------------------------------ if (outputOptionsSheet$MakeMessMap | outputOptionsSheet$MakeModMap) { - if (length(factorVars)) { + messFactorVars <- intersect( + covariatesSheet$CovariateName[covariatesSheet$IsCategorical == TRUE], + modVars + ) + if (modType == "maxent") { + messFactorVars <- union(messFactorVars, intersect(mod$catVars, modVars)) + } + contVars <- setdiff(modVars, messFactorVars) + if (length(contVars) == 0L) { updateRunLog( - "\nWarning: MESS and MoD maps cannot be generated for models with categorical variables.\n" + "\nWarning: MESS and MoD maps cannot be generated because all model variables are categorical.\n" ) } else { - # Prep training data order & precompute structures - train.dat <- dplyr::select(trainingData, dplyr::all_of(modVars)) - for (k in modVars) { + if (length(factorVars) > 0L) { + updateRunLog( + "\nWarning: MESS and MoD maps will be computed on continuous variables only; categorical variables are excluded.\n" + ) + } + # Prep training data order & precompute structures (continuous vars only) + train.dat <- dplyr::select(trainingData, dplyr::all_of(contVars)) + for (k in contVars) { train.dat[, k] <- sort(train.dat[, k]) } - prep <- prepare_mess(train.dat, var_names = modVars) + prep <- prepare_mess(train.dat, var_names = contVars) # ----- MESS ----- if (outputOptionsSheet$MakeMessMap) { @@ -754,7 +767,7 @@ for (i in seq_len(nrow(modelOutputsSheet))) { tmp_fin <- file.path(ssimTempDir, "mess_finalized.tif") r_main <- terra::predict( - object = covs, + object = covs[[contVars]], model = prep, fun = mess_fun, filename = tmp_mess, @@ -763,7 +776,12 @@ for (i in seq_len(nrow(modelOutputsSheet))) { # Apply restriction mask and update tmp_mess so src_for_final is correct if (!is.null(restrictRaster)) { - r_main <- terra::mask(r_main, restrictRaster, maskvalues = 0) + restrict_aligned <- terra::resample( + restrictRaster, + r_main, + method = "near" + ) + r_main <- terra::mask(r_main, restrict_aligned, maskvalues = 0) terra::writeRaster(r_main, tmp_mess, overwrite = TRUE) } @@ -842,7 +860,7 @@ for (i in seq_len(nrow(modelOutputsSheet))) { ) r_main <- terra::predict( - object = covs, + object = covs[[contVars]], model = prep, fun = mod_fun, name_to_id = name_to_id, @@ -852,7 +870,12 @@ for (i in seq_len(nrow(modelOutputsSheet))) { # Apply restriction mask and update tmp_mod so src_for_final is correct if (!is.null(restrictRaster)) { - r_main <- terra::mask(r_main, restrictRaster, maskvalues = 0) + restrict_aligned <- terra::resample( + restrictRaster, + r_main, + method = "near" + ) + r_main <- terra::mask(r_main, restrict_aligned, maskvalues = 0) terra::writeRaster(r_main, tmp_mod, overwrite = TRUE) } @@ -943,13 +966,13 @@ for (i in seq_len(nrow(modelOutputsSheet))) { paste0(modType, "_bin_map.tif") ) } - if (outputOptionsSheet$MakeMessMap && length(factorVars) == 0L) { + if (outputOptionsSheet$MakeMessMap && length(contVars) > 0L) { spatialOutputsSheet$MessRaster[outputRow] <- file.path( ssimTempDir, paste0(modType, "_mess_map.tif") ) } - if (outputOptionsSheet$MakeModMap && length(factorVars) == 0L) { + if (outputOptionsSheet$MakeModMap && length(contVars) > 0L) { spatialOutputsSheet$ModRaster[outputRow] <- file.path( ssimTempDir, paste0(modType, "_mod_map.tif") diff --git a/src/package.xml b/src/package.xml index 0bda412..c15a41b 100644 --- a/src/package.xml +++ b/src/package.xml @@ -154,7 +154,7 @@ - + @@ -163,7 +163,7 @@ - + @@ -194,7 +194,7 @@ - + @@ -216,7 +216,7 @@ - + @@ -226,7 +226,7 @@ - + @@ -234,7 +234,7 @@ - +