From 3029e6d755bf074848284d5d9a0d3b3ff34f0d6e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 27 Jan 2025 14:19:40 +0100 Subject: [PATCH 01/72] rename R Scripts to R_scripts --- ModGP MASTER.R | 2 +- ModGP-run_exec.R | 2 +- ModGP-run_prep.R | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ModGP MASTER.R b/ModGP MASTER.R index 9fa1b85..c5872cc 100644 --- a/ModGP MASTER.R +++ b/ModGP MASTER.R @@ -28,7 +28,7 @@ message(sprintf("SPECIES = %s", SPECIES)) ## Directories ------------------------------------------------------------ ### Define directories in relation to project directory Dir.Base <- getwd() -Dir.Scripts <- file.path(Dir.Base, "R Scripts") +Dir.Scripts <- file.path(Dir.Base, "R_scripts") source(file.path(Dir.Scripts, "ModGP-commonlines.R")) diff --git a/ModGP-run_exec.R b/ModGP-run_exec.R index 35e8443..eb4ceeb 100644 --- a/ModGP-run_exec.R +++ b/ModGP-run_exec.R @@ -28,7 +28,7 @@ message(sprintf("SPECIES = %s", SPECIES)) ## Directories ------------------------------------------------------------ ### Define directories in relation to project directory Dir.Base <- getwd() -Dir.Scripts <- file.path(Dir.Base, "R Scripts") +Dir.Scripts <- file.path(Dir.Base, "R_scripts") source(file.path(Dir.Scripts, "ModGP-commonlines.R")) diff --git a/ModGP-run_prep.R b/ModGP-run_prep.R index 2c95cd6..a7e2d12 100644 --- a/ModGP-run_prep.R +++ b/ModGP-run_prep.R @@ -28,7 +28,7 @@ message(sprintf("SPECIES = %s", SPECIES)) ## Directories ------------------------------------------------------------ ### Define directories in relation to project directory Dir.Base <- getwd() -Dir.Scripts <- file.path(Dir.Base, "R Scripts") +Dir.Scripts <- file.path(Dir.Base, "R_scripts") source(file.path(Dir.Scripts, "ModGP-commonlines.R")) From 0fa465d965cbb5de7df863820989bcff6b63411b Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 27 Jan 2025 14:44:58 +0100 Subject: [PATCH 02/72] move sourcing from commonlines to other scripts --- ModGP MASTER.R | 4 ++ 
ModGP-run_exec.R | 4 ++ ModGP-run_prep.R | 4 ++ R_scripts/ModGP-commonlines.R | 71 +++++++++++++++++++++++------------ 4 files changed, 59 insertions(+), 24 deletions(-) diff --git a/ModGP MASTER.R b/ModGP MASTER.R index c5872cc..53d53f6 100644 --- a/ModGP MASTER.R +++ b/ModGP MASTER.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) ## API Credentials -------------------------------------------------------- try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) diff --git a/ModGP-run_exec.R b/ModGP-run_exec.R index eb4ceeb..f0873f2 100644 --- a/ModGP-run_exec.R +++ b/ModGP-run_exec.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) # Choose the number of parallel processes RUNNING_ON_LUMI <- TRUE diff --git a/ModGP-run_prep.R b/ModGP-run_prep.R index a7e2d12..561f2fd 100644 --- a/ModGP-run_prep.R +++ b/ModGP-run_prep.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) ## API Credentials 
-------------------------------------------------------- try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 5f20517..f5f2c38 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -8,27 +8,55 @@ install.load.package <- function(x) { install.packages(x, repos='http://cran.us.r-project.org') require(x, character.only = TRUE) } + +# HJ: to do: remove unneeded packages +# EL: trying to comment out some now to test if we run into issues + ### CRAN PACKAGES ---- package_vec <- c( - 'cowplot', # grid plotting - 'ggplot2', # ggplot machinery - 'ggpmisc', # table plotting in ggplot environment - 'ggpubr', # t-test comparison in ggplot - 'gridExtra', # ggplot saving in PDF - 'parallel', # parallel runs - 'pbapply', # parallel runs with estimator bar - 'raster', # spatial data - 'remotes', # remote installation - 'rgbif', # GBIF access - 'rnaturalearth', # shapefiles - 'sdm', # SDM machinery - 'sf', # spatial data - 'sp', # spatial data - 'terra', # spatial data - 'tidyr', # gather() - 'usdm', # vifcor() - 'viridis', # colour palette - 'iterators' + 'automap', # automatic interpolation (for KrigR) + 'cowplot', # grid plotting + 'exactextractr', # HJ: added to solve extraction problems + #'geodata', # HJ: added to get soil data for testing + 'ggplot2', # ggplot machinery + 'ggpmisc', # table plotting in ggplot environment + 'ggpubr', # t-test comparison in ggplot + 'gridExtra', # ggplot saving in PDF + 'ncdf4', # handling NetCDF files + 'parallel', # parallel runs + 'pbapply', # parallel runs with estimator bar + 'raster', # spatial data ----------------------- should be replaced by terra + 'remotes', # remote installation + 'rgbif', # GBIF access + 'rnaturalearth', # shapefiles + 'sdm', # SDM machinery + 'sf', # spatial data + 'sp', # spatial data + 'terra', # spatial data + 'tidyr', # gather() + 'usdm', # vifcor() + 'viridis', # colour palette + 'bit64', + 
'iterators', + + # Capfitogen SelectVar packages + # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones + 'dismo', + 'cluster', + 'ade4', + 'labdsv', + 'mclust', + 'clustvarsel', + #'randomForest', # ---------------- replace with ranger? + 'ranger', + + # Capfitogen ECLmapas packages + # HJ: added here from Capfitogen ECLmapas script. To do: remove unnecessary ones + 'modeltools', + 'flexmix', + 'fpc', + 'vegan', + 'adegenet' #find.clusters ELCmap.R ) sapply(package_vec, install.load.package) @@ -92,8 +120,3 @@ CreateDir <- sapply(Dirs, function(x){ x <- eval(parse(text=x)) if(!dir.exists(x)) dir.create(x)}) rm(Dirs) - -## Sourcing --------------------------------------------------------------- -source(file.path(Dir.Scripts,"SHARED-Data.R")) -source(file.path(Dir.Scripts,"ModGP-SDM.R")) -source(file.path(Dir.Scripts,"ModGP-Outputs.R")) From 9e38ddd2ef23fe55f9f70a993ddadf921244df1f Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 28 Jan 2025 19:25:10 +0100 Subject: [PATCH 03/72] update draft changes, GBIF download test --- .gitignore | 2 +- Data/GBIF/Lathyrus angulatus.json | 86 ++++++++ R_scripts/ModGP-commonlines.R | 23 +- R_scripts/SHARED-Data.R | 199 +++++++++++++++--- R_scripts/SHARED-Data_CAPFITOGEN.R | 4 +- .../testing_capfitogen_deletethislater.R | 14 +- capfitogen_master.R | 161 ++++++-------- 7 files changed, 345 insertions(+), 144 deletions(-) create mode 100644 Data/GBIF/Lathyrus angulatus.json diff --git a/.gitignore b/.gitignore index 5cbef63..f7c8f07 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,7 @@ rsconnect/ *.rds !Data/*/GlobalAreaCRS.RData **/HWSD2.zip -/Data/Environment/soil +/Data/Environment/ Logo/* diff --git a/Data/GBIF/Lathyrus angulatus.json b/Data/GBIF/Lathyrus angulatus.json new file mode 100644 index 0000000..0595327 --- /dev/null +++ b/Data/GBIF/Lathyrus angulatus.json @@ -0,0 +1,86 @@ +{ + "@context": [ + ["https://w3id.org/ro/crate/1.1/context"] + ], + "@graph": [ + { + "@type": 
["CreativeWork"], + "@id": ["ro-crate-metadata.json"], + "conformsTo": { + "@id": ["https://w3id.org/ro/crate/1.1"] + }, + "about": { + "@id": ["./"] + } + }, + { + "@id": ["./"], + "hasPart": [ + { + "@id": ["Lathyrus angulatus.RData"] + } + ], + "about": [ + { + "@id": ["https://www.gbif.org/species/5356429"] + } + ], + "@type": [ + ["Dataset"] + ], + "creator": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "author": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "license": { + "@id": ["https://creativecommons.org/licenses/by/4.0/"] + }, + "studySubject": [ + ["http://eurovoc.europa.eu/632"] + ], + "datePublished": ["2025-01-27 19:07:06"], + "name": ["Cleaned GBIF occurrence records for species Lathyrus angulatus"], + "encodingFormat": ["application/ld+json"], + "mainEntity": ["Dataset"], + "keywords": [ + ["GBIF"], + ["Occurrence"], + ["Biodiversity"], + ["Observation"], + ["Capfitogen"] + ], + "description": ["Capfitogen input data for Lathyrus angulatus"] + }, + { + "@id": ["Lathyrus angulatus.RData"], + "@type": [ + ["File"] + ], + "name": ["Lathyrus angulatus.RData"], + "contentSize": [171823], + "encodingFormat": ["application/RData"] + }, + { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"], + "@type": ["Person", "Organisation"], + "name": ["biodt-cwr", "Erik Kusch"] + }, + { + "@id": ["#action1"], + "@type": ["CreateAction"], + "agent": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "instrument": { + "@id": ["https://github.com/BioDT/uc-CWR"] + }, + "result": [ + { + "@id": ["./"] + } + ] + } + ] +} diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index f5f2c38..a1f0fe6 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -29,6 +29,7 @@ package_vec <- c( 'remotes', # remote installation 'rgbif', # GBIF access 'rnaturalearth', # shapefiles + 'rnaturalearthdata', # needed 
for FUN.Down.BV() 'sdm', # SDM machinery 'sf', # spatial data 'sp', # spatial data @@ -58,26 +59,35 @@ package_vec <- c( 'vegan', 'adegenet' #find.clusters ELCmap.R ) + sapply(package_vec, install.load.package) ### NON-CRAN PACKAGES ---- -if(packageVersion("KrigR") < "0.9.1"){ # KrigR check - devtools::install_github("https://github.com/ErikKusch/KrigR", ref = "Development") +# check if KrigR is missing or outdated +if(packageVersion("KrigR") < "0.9.1" || + "KrigR" %in% rownames(installed.packages()) == FALSE) { + message("installing KrigR from github.com/ErikKusch/KrigR") + devtools::install_github("https://github.com/ErikKusch/KrigR", + ref = "Development") } library(KrigR) -if("mraster" %in% rownames(installed.packages()) == FALSE){ # KrigR check - remotes::install_github("babaknaimi/mraster") +if ("mraster" %in% rownames(installed.packages()) == FALSE) { + # KrigR check + remotes::install_github("babaknaimi/mraster") } library(mraster) -if(!("maxent" %in% unlist(getmethodNames()))){sdm::installAll()} # install methods for sdm package +if (!("maxent" %in% unlist(getmethodNames()))) { + sdm::installAll() +} # install methods for sdm package ## updating package_vec for handling of parallel environments package_vec <- c(package_vec, "KrigR", "mraster") ## Functionality ---------------------------------------------------------- -`%nin%` <- Negate(`%in%`) # a function for negation of %in% function +#' a function for negation of %in% function +`%nin%` <- Negate(`%in%`) #' Progress bar for data loading saveObj <- function(object, file.name){ @@ -114,6 +124,7 @@ Dir.R_scripts <- file.path(Dir.Base, "R_scripts") Dir.Results <- file.path(Dir.Base, "results") Dir.Results.SelectVar <- file.path(Dir.Results, "SelectVar") Dir.Results.ECLMap <- file.path(Dir.Results, "ECLMap") + ### Create directories which aren't present yet Dirs <- grep(ls(), pattern = "Dir.", value = TRUE) CreateDir <- sapply(Dirs, function(x){ diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R 
index 16e4a35..740f13d 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -12,7 +12,7 @@ # queries download from GBIF, handles and cleans data, returns SF MULTIPOINT object and GBIF download metadata FUN.DownGBIF <- function(species = NULL, # species name as character for whose genus data is to be downloaded Dir = getwd(), # where to store the data - Force = FALSE, # whether the download should be forced despite local data already existing + Force = FALSE, # overwrite existing data? Mode = "ModGP", # which specification to run, either for whole GENUS of supplied species (ModGP), or for species directly (Capfitogen) parallel = 1 # an integer, 1 = sequential; always defaults to sequential when Mode == "Capfitogen" ){ @@ -45,7 +45,8 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g RankGBIF <- "species" } GBIF_match <- name_backbone(name = species, - rank = RankGBIF, kingdom = "plante") + rank = RankGBIF, + kingdom = "plante") ## Extracting taxonkey tax_ID <- ifelse(GBIF_match$rank != toupper(RankGBIF), NA, @@ -81,11 +82,13 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g ### Resolving Common Issues ---- message("Resolving Common Data Issues") ## removing bases of record that may not be linked to coordinates properly - occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", "MATERIAL_CITATION"), ] + occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", + "MATERIAL_CITATION"), ] ## removing highly uncertain locations, i.e., anything more than 1km in uncertainty occ_occ <- occ_occ[occ_occ$coordinateUncertaintyInMeters <= 1000, ] ## removing rounded coordinates - occ_occ <- occ_occ[-grep(occ_occ$issue, pattern = "COORDINATE_ROUNDED"), ] + occ_occ <- occ_occ[-grep(occ_occ$issue, + pattern = "COORDINATE_ROUNDED"), ] ## removing empty species rows occ_occ <- occ_occ[occ_occ$species != "" & !is.na(occ_occ$species), ] @@ -98,12 +101,16 @@ FUN.DownGBIF <- 
function(species = NULL, # species name as character for whose g parallel <- parallel::makeCluster(parallel) on.exit(stopCluster(parallel)) print("R Objects loading to cluster") - parallel::clusterExport(parallel, varlist = c( - "package_vec", "install.load.package", - "occ_occ" - ), envir = environment()) + parallel::clusterExport(parallel, + varlist = c( + "package_vec", + "install.load.package", + "occ_occ"), + envir = environment()) print("R Packages loading on cluster") - clusterpacks <- clusterCall(parallel, function() sapply(package_vec, install.load.package)) + clusterpacks <- clusterCall(parallel, + function() sapply(package_vec, + install.load.package)) } ### Making SF for species ---- @@ -115,14 +122,28 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g cl = parallel, FUN = function(x){ spec_df <- occ_occ[occ_occ$species == x, ] - spec_uniloca <- occ_occ[occ_occ$species == x, c("species", "decimalLatitude", "decimalLongitude")] + spec_uniloca <- occ_occ[occ_occ$species == x, + c("species", + "decimalLatitude", + "decimalLongitude")] spec_df <- spec_df[!duplicated(spec_uniloca), - c("gbifID", "datasetKey", "occurrenceID", "species", "scientificName", "speciesKey", - "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters", - "eventDate", "basisOfRecord", "recordNumber", "issue") - ] + c("gbifID", + "datasetKey", + "occurrenceID", + "species", + "scientificName", + "speciesKey", + "decimalLatitude", + "decimalLongitude", + "coordinateUncertaintyInMeters", + "eventDate", + "basisOfRecord", + "recordNumber", + "issue")] spec_df$presence <- 1 - st_as_sf(spec_df, coords = c("decimalLongitude", "decimalLatitude")) + st_as_sf(spec_df, + coords = c("decimalLongitude", + "decimalLatitude")) }) names(specs_ls) <- GBIF_specs @@ -130,8 +151,49 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g if(Mode == "Capfitogen"){ specs_ls <- specs_ls[[1]] ## create capfitogen data frame - 
CapfitogenColumns <- c("INSTCODE", "ACCENUMB", "COLLNUMB", "COLLCODE", "COLLNAME", "COLLINSTADDRESS", "COLLMISSID", "GENUS", "SPECIES", "SPAUTHOR", "SUBTAXA", "SUBTAUTHOR", "CROPNAME", "ACCENAME", "ACQDATE", "ORIGCTY", "NAMECTY", "ADM1", "ADM2", "ADM3", "ADM4", "COLLSITE", "DECLATITUDE", "LATITUDE", "DECLONGITUDE", "LONGITUDE", "COORDUNCERT", "COORDDATUM", "GEOREFMETH", "ELEVATION", "COLLDATE", "BREDCODE", "BREDNAME", "SAMPSTAT", "ANCEST", "COLLSRC", "DONORCODE", "DONORNAME", "DONORNUMB", "OTHERNUMB", "DUPLSITE", "DUPLINSTNAME", "STORAGE", "MLSSTAT", "REMARKS") - CapfitogenData <- data.frame(matrix(data = NA, nrow = nrow(specs_ls), ncol = length(CapfitogenColumns))) + CapfitogenColumns <- c("INSTCODE", + "ACCENUMB", + "COLLNUMB", + "COLLCODE", + "COLLNAME", + "COLLINSTADDRESS", + "COLLMISSID", + "GENUS", + "SPECIES", + "SPAUTHOR", + "SUBTAXA", + "SUBTAUTHOR", + "CROPNAME", + "ACCENAME", + "ACQDATE", + "ORIGCTY", + "NAMECTY", + "ADM1", "ADM2", "ADM3", "ADM4", + "COLLSITE", + "DECLATITUDE", "LATITUDE", + "DECLONGITUDE", "LONGITUDE", + "COORDUNCERT", + "COORDDATUM", + "GEOREFMETH", + "ELEVATION", + "COLLDATE", + "BREDCODE", + "BREDNAME", + "SAMPSTAT", + "ANCEST", + "COLLSRC", + "DONORCODE", + "DONORNAME", + "DONORNUMB", + "OTHERNUMB", + "DUPLSITE", + "DUPLINSTNAME", + "STORAGE", + "MLSSTAT", + "REMARKS") + CapfitogenData <- data.frame(matrix(data = NA, + nrow = nrow(specs_ls), + ncol = length(CapfitogenColumns))) colnames(CapfitogenData) <- CapfitogenColumns ## Create unique rownames for the ACCENUMB CapfitogenData$ACCENUMB <- seq(from = 1, to = nrow(CapfitogenData), by = 1) @@ -184,12 +246,15 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g } # BIOCLIMATIC VARIABLE DOWNLOAD -------------------------------------------- -#' queries and downloads and computes bioclimatic variables at global extent from ERA5-Land, Water availability is based on soil moisture level 1 (0-7cm) and 2 (7-28cm) -FUN.DownBV <- function(T_Start = 1970, # what 
year to begin climatology calculation in - T_End = 2000, # what year to end climatology calculation in - Dir = getwd(), # where to store the data output on disk - Force = FALSE # do not overwrite already present data - ){ +#' queries, downloads, and computes bioclimatic variables +#' at global extent from ERA5-Land. Water availability is based on +#' soil moisture level 1 (0-7cm) and 2 (7-28cm) +FUN.DownBV <- function( + T_Start = 1970, # what year to begin climatology calculation in + T_End = 2000, # what year to end climatology calculation in + Dir = getwd(), # where to store the data output on disk + Force = FALSE # do not overwrite already present data + ){ FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) if(!Force & file.exists(FNAME)){ @@ -200,12 +265,14 @@ FUN.DownBV <- function(T_Start = 1970, # what year to begin climatology calculat } ### Raw soil moisture level data ---- - #' We download raw soil moisture data for layers 1 (0-7cm) and 2 (7-28cm) separately. These are then summed up and used in the bioclimatic variable computation of KrigR + #' Download raw soil moisture data for layers 1 (0-7cm) and + #' 2 (7-28cm) separately. 
+ #' These are summed up and used in KrigR's bioclimatic variable computation if(!file.exists( file.path(Dir, paste(tools::file_path_sans_ext(basename(FNAME)), - "volumetric_soil_water_layer_1", "RAW.nc", - sep = "_")) + "volumetric_soil_water_layer_1", + "RAW.nc", sep = "_")) )){ #### Downloading ---- Qsoil1_ras <- CDownloadS( @@ -279,21 +346,89 @@ FUN.DownBV <- function(T_Start = 1970, # what year to begin climatology calculat JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" + JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- + "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() # tail(file.info(FNAME)$ctime) - JSON_ls$`@graph`[[2]]$name <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", "ECMWF", "Bioclimatic Variables", "Soil Moisture") + JSON_ls$`@graph`[[2]]$name <- + "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", + "ECMWF", + "Bioclimatic Variables", + "Soil Moisture") JSON_ls$`@graph`[[2]]$description <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." 
JSON_ls$`@graph`[[3]]$name <- basename(FNAME) JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://doi.org/10.1088/1748-9326/ac48b3" + JSON_ls$`@graph`[[5]]$instrument$`@id` <- + "https://doi.org/10.1088/1748-9326/ac48b3" - con <- file(file.path(Dir, paste0(tools::file_path_sans_ext(basename(FNAME)), ".json"))) - writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) + con <- file(file.path(Dir, + paste0(tools::file_path_sans_ext(basename(FNAME)), + ".json"))) + writeLines(jsonlite::toJSON(JSON_ls, + pretty = TRUE), + con) close(con) BV_ras } + +## EDAPHIC DATA DOWNLOAD ------------------------------------------------------ +# HJ: new part: very rough draft for downloading edaphic data +# missing all the important parts: proper data sources, functions +# where to set the variables for each category (bioclimatic, edaphic, geophysical)? + +# Edaphic data download + +# HJ: ONLY FOR TESTING PURPOSES: soildata package +# needed: data source and downloading commands for .nc files; where to set the selected variables + +FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk + Force = FALSE, # do not overwrite already present data, + ){ + FNAME <- file.path(Dir, "edaphic.nc") + + # check if file exists and whether to overwrite + if(!Force & file.exists(FNAME)){ + EV_ras <- stack(FNAME) + #names(EV_ras) <- paste0("BIO", 1:19) # replace with edaphic names vector + message("Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") + return(EV_ras) + } + + if(!file.exists( + + )){ + ## downloading: do this with KrigR (CDownloadS() for ECMWF data only) or other way? 
+ + } + evarg <- c(arg1, arg2, arg3) + # evargs <- commandArgs(trailingOnly = TRUE) + #evarg <- c("soc", "silt") + edaph_ras <- soil_world(evarg, depth = 5, path = file.path(Dir.Data.Envir, "Edaphic")) + names(edaph_ras) <- evarg + + ### Masking ---- + # whole world + #Land_sp <- ne_countries(type = "countries", scale = "medium") + + # HJ: for testing/ to match previous Capfitogen tests: only Spain + # HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) + # please switch back to KrigR is wanted/needed + Land_sp <- ne_states("Spain") + edaph_ras <- crop(edaph_ras, terra::ext(Land_sp)) + edaph_ras <- terra::mask(edaph_ras, vect(Land_sp)) + + # BV_ras <- crop(BV_ras, extent(Land_sp)) + # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) + # BV_ras <- mask(BV_ras, BV_mask) + + ### Saving ---- + terra::writeCDF(edaph_ras, filename = FNAME, overwrite = TRUE) + unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) + + edaph_ras + +} diff --git a/R_scripts/SHARED-Data_CAPFITOGEN.R b/R_scripts/SHARED-Data_CAPFITOGEN.R index 6352db4..5055c86 100644 --- a/R_scripts/SHARED-Data_CAPFITOGEN.R +++ b/R_scripts/SHARED-Data_CAPFITOGEN.R @@ -14,7 +14,7 @@ # HJ: first sections of script are taken from ModGP, only small modifications # EL: how can we avoid code duplication from the MoDGP SHARED-Data script? -# need to work on this to find a good solution. +# need to work on this to find a good solution.Because the code here mostly just defines functions but does not run them, I think its OK to merge with the main SHARED-Data.R instead of duplicating. 
# GBIF DOWNLOAD FUNCTION -------------------------------------------------- # queries download from GBIF, handles and cleans data, returns SF MULTIPOINT object and GBIF download metadata @@ -343,7 +343,7 @@ FUN.DownBV <- function(T_Start = 1970, # what year to begin climatology calculat FUN.DownEV <- function(arg1, arg2, arg3){ - FNAME <- file.path(Dir.Data.Envir, "edaph.nc") + FNAME <- file.path(Dir.Data.Envir, "edaphic_data.nc") evarg <- c(arg1, arg2, arg3) # evargs <- commandArgs(trailingOnly = TRUE) diff --git a/R_scripts/testing_capfitogen_deletethislater.R b/R_scripts/testing_capfitogen_deletethislater.R index caa1736..74530ae 100644 --- a/R_scripts/testing_capfitogen_deletethislater.R +++ b/R_scripts/testing_capfitogen_deletethislater.R @@ -12,16 +12,24 @@ GET(url, write_disk(hwsd_zipfile)) unzip(hwsd_zipfile, exdir = paste(hwsd_path, "/soil", sep = "")) # test reading BIL file -testraster <- terra::rast("Data/Environment/soil/HWSD2.bil") -testraster +hwsd_raster <- terra::rast("Data/Environment/soil/HWSD2.bil") +hwsd_raster +print(hwsd_raster) +plot(hwsd_raster) +summary(hwsd_raster) +proj4string(hwsd_raster) +res(hswd_raster) # aggregate to coarser resolution by a factor of 9 # (bin 9x9 neighbouring pixels into one, and assign the bigger pixel the mean) -soil30 <- aggregate(testraster, fact = 9, fun = mean) +soil30 <- aggregate(hwsd_raster, fact = 9, fun = mean) soil30 plot(soil30) # a plot is made, but of what? There is only one layer of values, and it's not obvoius to me what those values are... 
+# test downloading CAPFITOGEN scripts into R_Scripts + + #------------------------------------------------- # Main input file (pasaporte): # LathyrusData-ForCapfitogen_27oct2023.txt (by Carrie) diff --git a/capfitogen_master.R b/capfitogen_master.R index b81c3a4..0c5ea37 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -33,85 +33,9 @@ if (length(args)==0) { } message(sprintf("SPECIES = %s", SPECIES)) -## Packages --------------------------------------------------------------- -# Define function to load and/or install packages -install.load.package <- function(x) { - if (!require(x, character.only = TRUE)) - install.packages(x, repos='http://cran.us.r-project.org') - require(x, character.only = TRUE) -} - -# HJ: to do: remove unneeded packages -# EL: trying to comment out some now to test if we run into issues - # move to commonlines -### CRAN PACKAGES ---- -package_vec <- c( - #'automap', # automatic interpolation (for KrigR) - 'cowplot', # grid plotting - 'exactextractr', # HJ: added to solve extraction problems - #'geodata', # HJ: added to get soil data for testing - 'ggplot2', # ggplot machinery - 'ggpmisc', # table plotting in ggplot environment - 'ggpubr', # t-test comparison in ggplot - 'gridExtra', # ggplot saving in PDF - 'ncdf4', # handling NetCDF files - 'parallel', # parallel runs - 'pbapply', # parallel runs with estimator bar - #'raster', # spatial data ----------------------- should be replaced by terra - 'remotes', # remote installation - 'rgbif', # GBIF access - #'rnaturalearth', # shapefiles - 'sdm', # SDM machinery - 'sf', # spatial data - 'sp', # spatial data - 'terra', # spatial data - 'tidyr', # gather() - 'usdm', # vifcor() - 'viridis', # colour palette - 'bit64', - - # Capfitogen SelectVar packages - # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones - 'dismo', - 'cluster', - 'ade4', - 'labdsv', - 'mclust', - 'clustvarsel', - #'randomForest', # ---------------- replace with ranger? 
- 'ranger', - - # Capfitogen ECLmapas packages - # HJ: added here from Capfitogen ECLmapas script. To do: remove unnecessary ones - 'modeltools', - 'flexmix', - 'fpc', - 'vegan', - 'adegenet' #find.clusters ELCmap.R -) -sapply(package_vec, install.load.package) - -# ### NON-CRAN PACKAGES ---- are these necessary for capfitogen? -# if("KrigR" %in% rownames(installed.packages()) == FALSE){ # KrigR check -# Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS="true") -# remotes::install_github("https://github.com/cran/rgdal") # requires gdal-config! -# remotes::install_github("ErikKusch/KrigR") -# } -# library(KrigR) # is KrigR necessary for capfitogen? -# -# if("mraster" %in% rownames(installed.packages()) == FALSE){ # KrigR check -# remotes::install_github("babaknaimi/mraster") -# } -# library(mraster) -# -# if(!("maxent" %in% unlist(getmethodNames()))){sdm::installAll()} # install methods for sdm package -# -# # updating package_vec for handling of parallel environments -# package_vec <- c(package_vec, "KrigR", "mraster") - ### Define directories in relation to project directory Dir.Base <- getwd() -Dir.Scripts <- file.path(Dir.Base, "R_Scripts") +Dir.Scripts <- file.path(Dir.Base, "R_scripts") ## source packages, directories, simple functions (...) 
source(file.path(Dir.Scripts, "ModGP-commonlines.R")) @@ -136,23 +60,38 @@ if(!exists("numberOfCores")){ # Core check: if number of cores for parallel proc } # end of Core check message(sprintf("numberOfCores = %d", numberOfCores)) -## Sourcing --------------------------------------------------------------- -# source(file.path(Dir.R_scripts, "SHARED-Data.R")) # done in commonlines -# source(file.path(Dir.R_scripts, "ELCMap.R")) #HJ: clustering ready, map part not - # DATA ==================================================================== +## Run SHARED-Data script ------------------------------------------------- +## defines FUN.DownGBIF(), FUN.DownBV() +source(file.path(Dir.R_scripts, "SHARED-Data.R")) + ## GBIF Data -------------------------------------------------------------- message("Retrieving GBIF data") ## species of interest Species_ls <- FUN.DownGBIF( species = SPECIES, # which species to pull data for Dir = Dir.Data.GBIF, # where to store the data output on disk - Force = TRUE, # overwrite (TRUE) already present data or not (FALSE) + Force = FALSE, # overwrite (TRUE) already present data or not (FALSE) Mode = "Capfitogen", # query download for one species parallel = 1 # no speed gain here for parallelising on personal machine ) ## Environmental Data ----------------------------------------------------- +## Bioclomatic data: 19 BioClim variables +message("Retrieving bioclimatic variables") # NB! 
slow +bioclim_data <- FUN.DownBV( + T_Start = 1995, # what year to begin climatology calculation in + T_End = 2015, # what year to end climatology calculation in + Dir = Dir.Data.Envir, # where to store the data output on disk + Force = FALSE # do not overwrite already present data + ) + +## Edaphic data: +edaphic_data <- FUN.DownEV( + Dir = Dir.Data.Envir, + Force = FALSE +) + #' existing data in "Data/Environment/BV-1985-2015.nc" #' and soil data in .bil under /soil downloaded from Harmonized World #' Soil Database version 2.0 @@ -183,12 +122,47 @@ Species_ls <- FUN.DownGBIF( #geophy_ras <- FUN.DownGV( ) # TO DO +## read variables -------------------------------------------------------- +bioclim_ras <- terra::rast(file.path(Dir.Data.Envir, + "BV_1985-2015.nc")) +bioclim_ras <- terra::project(bioclim_ras, + "EPSG:4326") # WGS84; World Geodetic System 1984 +BioClim_names <- c( ## BioClim variable names, see https://www.worldclim.org/data/bioclim.html + "BIO1_Annual_Mean_Temperature", + "BIO2_Mean_Diurnal_Range", + "BIO3_Isothermality", + "BIO4_Temperature_Seasonality", + "BIO5_Max_Temperature_of_Warmest_Month", + "BIO6_Min_Temperature_of_Coldest_Month", + "BIO7_Temperature_Annual_Range", + "BIO8_Mean_Temperature_of_Wettest_Quarter", + "BIO9_Mean_Temperature_of_Driest_Quarter", + "BIO10_Mean_Temperature_of_Warmest_Quarter", + "BIO11_Mean_Temperature_of_Coldest_Quarter", + "BIO12_Annual_Precipitation", + "BIO13_Precipitation_of_Wettest_Month", + "BIO14_Precipitation_of_Driest_Month", + "BIO15_Precipitation_Seasonality", + "BIO16_Precipitation_of_Wettest_Quarter", + "BIO17_Precipitation_of_Driest_Quarter", + "BIO18_Precipitation_of_Warmest_Quarter", + "BIO19_Precipitation_of_Coldest_Quarter") +names(bioclim_ras) <- BioClim_names + + +geophys_ras <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) + +# geophys_ras <- terra::project(geophys_ras, "EPSG:4326") +# names(geophys_ras) <- geophysv +# edaph_ras <- terra::rast(file.path(Dir.Data.Envir, "edaph.nc")) +# 
names(edaph_ras) <- edaphv + + # # CAPFITOGEN pipeline ========================================================= ## Parameters ----------------------------------------------------------------- -#' copied and shortened from CAPFITOGEN's "Parameters_SelecVar.R" script. - +## copied and shortened from CAPFITOGEN's "Parameters_SelecVar.R" script. # ruta <- "C:/CAPFITOGEN3" # replace with other paths extent <- pais <- "World" pasaporte <- file.path(Dir.Data.GBIF, "filename") # species observations - enter GBIF data file, check if column names work @@ -198,15 +172,11 @@ distdup <- 1 # distance threshold in km to remove duplicates from same populatio resol1 <- "Celdas 1x1 km aprox (30 arc-seg)" # resolution, change to 9x9 buffy <- FALSE # buffer zone? # tamp <- 1000 #Only applies when buffy=TRUE -bioclimv <- c("tmean_1","vapr_annual","prec_1") # bioclimatic variables, altered by HJ with existing data +bioclimv <- BioClim_names #c("tmean_1","vapr_annual","prec_1") # bioclimatic variables, altered by HJ with existing data edaphv <- c("s_silt","s_sand","s_soilwater_cap") # edaphic variables (defaults from SOILGRIDS) geophysv <- c("alt","aspect") # geophysical variables -latitud <- FALSE #Only applies if ecogeo=TRUE -#TRUE or FALSE type parameter -##### Note:This parameter indicates whether the latitude variable (Y) that comes from the DECLATITUDE column of the passport table will be used to make the ecogeographic characterization (as a geophysical variable) -longitud <- TRUE #Only applies if ecogeo=TRUE -#TRUE or FALSE type parameter -##### Note1:This parameter indicates whether the longitude variable (X) that comes from the DECLATITUDE column of the passport table will be used to make the ecogeographic characterization (as a geophysical variable) +latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' +longitud <- FALSE percenRF <- 0.66 # percentage of variables that will be selected by Random Forest 
percenCorr <- 0.33 # percentage of variables that will be selected by the analysis of bivariate correlations, which is executed after the selection by Random Forest (for example, if you wanted to select 1/3 of the total of variables by bivariate correlations, percenRF would be 0.33 CorrValue <- 0.5 # correlation threshold value, above (in its positive form) or below (in its negative form) of which it is assumed that there is a correlation between two variables. @@ -215,15 +185,6 @@ nminvar <- 3 # minimum number of variables to select per component. For example, ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in the tables of eigenvectors, eigenvalues and the PCA scores. ecogeopcaxe cannot be greater than the smallest number of variables to be evaluated per component resultados <- Dir.Results # directory to place results -## temp: read variables -------------------------------------------------------- -bioclim_ras <- terra::rast(file.path(Dir.Data.Envir, "bioclim.nc")) -# bioclim_ras <- terra::project(bioclim_ras, "EPSG:4326") -# names(bioclim_ras) <- bioclimv -# geophys_ras <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) -# geophys_ras <- terra::project(geophys_ras, "EPSG:4326") -# names(geophys_ras) <- geophysv -# edaph_ras <- terra::rast(file.path(Dir.Data.Envir, "edaph.nc")) -# names(edaph_ras) <- edaphv ## Variable selection: SelecVar ------------------------------------------------ #' run variable selection (script VarSelection.R for each category of environmental variables): From 2a8335ba139cf9182628dc52e09980756b4ebb63 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 30 Jan 2025 10:42:33 +0100 Subject: [PATCH 04/72] drafting SoilGrids download code --- R_scripts/ModGP-commonlines.R | 1 + R_scripts/SHARED-Data.R | 13 ++---- .../testing_capfitogen_deletethislater.R | 44 +++++++++++++++++-- 3 files changed, 46 insertions(+), 12 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 
a1f0fe6..ac2c9ab 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -39,6 +39,7 @@ package_vec <- c( 'viridis', # colour palette 'bit64', 'iterators', + 'gdalUtilities', # to download from SoilGrids (FUN.DownEV) # Capfitogen SelectVar packages # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 740f13d..e306976 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -376,14 +376,9 @@ FUN.DownBV <- function( } ## EDAPHIC DATA DOWNLOAD ------------------------------------------------------ -# HJ: new part: very rough draft for downloading edaphic data -# missing all the important parts: proper data sources, functions -# where to set the variables for each category (bioclimatic, edaphic, geophysical)? - -# Edaphic data download - -# HJ: ONLY FOR TESTING PURPOSES: soildata package -# needed: data source and downloading commands for .nc files; where to set the selected variables +# HJ: new part, INCOMPLETE! +# EL: data also need to be changed to get the same resolution as the BioClim +# variables downloaded with FUN.DownBV FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk Force = FALSE, # do not overwrite already present data, @@ -401,7 +396,7 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk if(!file.exists( )){ - ## downloading: do this with KrigR (CDownloadS() for ECMWF data only) or other way? + ## downloading } evarg <- c(arg1, arg2, arg3) diff --git a/R_scripts/testing_capfitogen_deletethislater.R b/R_scripts/testing_capfitogen_deletethislater.R index 74530ae..ca82f16 100644 --- a/R_scripts/testing_capfitogen_deletethislater.R +++ b/R_scripts/testing_capfitogen_deletethislater.R @@ -2,7 +2,7 @@ # as notes and tests for scripting. To be deleted when a full capfitogen pipeline # is in place. 
#------------------------------------------------- -#test downloading soil data from harmonized world soil database v2.0 +#test downloading soil data from harmonized world soil database v2.0 (hwsd) hwsd_path = file.path(getwd(), "Data", "Environment") hwsd_zipfile = paste(hwsd_path, "/HWSD2.zip", sep = "") url = "https://s3.eu-west-1.amazonaws.com/data.gaezdev.aws.fao.org/HWSD/HWSD2_RASTER.zip" @@ -14,11 +14,10 @@ unzip(hwsd_zipfile, # test reading BIL file hwsd_raster <- terra::rast("Data/Environment/soil/HWSD2.bil") hwsd_raster -print(hwsd_raster) plot(hwsd_raster) summary(hwsd_raster) -proj4string(hwsd_raster) res(hswd_raster) +names(hwsd_raster[[1]]) # aggregate to coarser resolution by a factor of 9 # (bin 9x9 neighbouring pixels into one, and assign the bigger pixel the mean) soil30 <- aggregate(hwsd_raster, fact = 9, fun = mean) @@ -26,7 +25,46 @@ soil30 plot(soil30) # a plot is made, but of what? There is only one layer of values, and it's not obvoius to me what those values are... +# testing with SoilGrids instead, looks there are more variables there +# see https://www.isric.org/explore/soilgrids/soilgrids-access +library(terra) +library(gdalUtilities) +#projection_string = '+proj=igh +lat_0=0 +lon_0=0 +datum=WGS84 +units=m +no_defs' # proj string for Homolosine projection, https://en.wikipedia.org/wiki/Goode_homolosine_projection +soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" + + #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean + #' NB! Each global map occupies circa 5 GB! It takes a while to download. 
+ #' bdod_0-5cm_mean.vrt # bdod=Bulk density of the fine earth fraction, cg/cm³ + #' cec_0-5cm_mean.vrt # cec = Cation Exchange Capacity of the soil, mmol(c)/kg + #' cfvo_0-5cm_mean.vrt # cfvo = Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + #' silt_0-5cm_mean.vrt # silt = Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + #' clay_0-5cm_mean.vrt # clay = Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + #' sand_0-5cm_mean.vrt # sand = Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + #' nitrogen_0-5cm_mean.vrt # nitrogen = Total nitrogen (N) cg/kg + #' phh2o_0-5cm_mean.vrt # phh2o = Soil pH pHx10 + #' ocd_0-5cm_mean.vrt # ocd = Organic carbon density hg/m³ + #' ocs_0-30cm_mean.vrt # ocs = Organic carbon stocks t/ha + #' soc_0-5cm_mean.vrt # soc = Soil organic carbon content in the fine earth fraction dg/kg + #' + #' in addition, https://files.isric.org/soilgrids/latest/data/wrb/ + #' has maps of soil types, as estimated probability of occurrence per type. + #' MostProbable.vrt has the most probable soil type per gridcell. + #' + # gdal_translate() converts raster data between different formats. 
+soilGrids_data <- gdal_translate(paste0(soilGrids_url, + 'ocs/ocs_0-30cm_mean.vrt'), + paste0(Dir.Data.Envir, + "/crop_roi_igh_r.tif"), + tr = c(2500,2500) # target resolution +) + +crop_roi_igh_r <- rast(paste0(Dir.Data.Envir, + "/crop_roi_igh_r.tif")) + +plot(crop_roi_igh_r) +crop_roi_igh_r +summary(crop_roi_igh_r) # test downloading CAPFITOGEN scripts into R_Scripts From 02ba75f321d387c9c1d7dd9ab4ae17e03472c1f7 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 30 Jan 2025 10:56:39 +0100 Subject: [PATCH 05/72] update gitignore to exclude data --- .gitignore | 4 ++- Data/Environment/README.md | 51 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 Data/Environment/README.md diff --git a/.gitignore b/.gitignore index f7c8f07..f4412a3 100644 --- a/.gitignore +++ b/.gitignore @@ -54,9 +54,11 @@ rsconnect/ *.png *.txt *.rds +*.tif +*.xlsx !Data/*/GlobalAreaCRS.RData **/HWSD2.zip -/Data/Environment/ +/Data/Environment/soil Logo/* diff --git a/Data/Environment/README.md b/Data/Environment/README.md new file mode 100644 index 0000000..8e0db43 --- /dev/null +++ b/Data/Environment/README.md @@ -0,0 +1,51 @@ +SoilGrids Preview +----------------- + +SoilGrids is a system for global digital soil mapping that uses state-of-the-art machine learning methods to map the spatial distribution of soil properties across the globe. SoilGrids prediction models are fitted using over 230 000 soil profile observations from the WoSIS database and a series of environmental covariates. Covariates were selected from a pool of over 400 environmental layers from Earth observation derived products and other environmental information including climate, land cover and terrain morphology. The outputs of SoilGrids are global soil property maps at six standard depth intervals (according to the GlobalSoilMap IUSS working group and its specifications) at a spatial resolution of 250 meters. 
Prediction uncertainty is quantified by the lower and upper limits of a 90% prediction interval. The SoilGrids maps are publicly available under the [CC-BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). + +Maps of the following soil properties are available: pH, soil organic carbon content, bulk density, coarse fragments content, sand content, silt content, clay content, cation exchange capacity (CEC), total nitrogen as well as soil organic carbon density and soil organic carbon stock. + +Main improvements +----------------- + +Relative to the previous, the following improvements can be highlighted: + +- Wider selection of soil observations: more profile observations, increased quality assessment and improved and consistent standardisation across the different point datasets. + +- Quantification of prediction uncertainty at pixel level with the 90% prediction interval using Quantile Random Forest. + +- Improved model calibration and cross-validation procedure to better take into account the uneven spatial distribution of data points across the world. + +- Improved covariates selection and model parameter tuning. + +- Texture fractions modelled and mapped not independently from each other, but as compositional data with the sum of the fractions constrained to 100%. + +How to download +--------------- + +In this upcoming SoilGrids release maps are in the [VRT format](https://gdal.org/drivers/raster/vrt.html). Each map is composed by three elements: + +- a master VRT file; +- an OVR file with overviews for swift visualisation; +- a folder with GeoTIFF tiles. + +All elements have the same name, a triplet separated by underscores: `property_depthInterval_quantile` For instance, to download the the 5%-quantile prediction of coarse fragments in the 5 cm to 15 cm depth interval the user must fetch the files `cfvo_5-15cm_Q05.vrt` and `cfvo_5-15cm_Q05.ovr` plus the `cfvo_5-15cm_Q05` folder. + +Each map occupies circa 5 GB. 
The full collection of maps for a single property, with six +standard depth intervals and four quantiles per depth requires about 120 GB. + +Web Services +------------ + +The SoilGrids maps are also available through OGC web services. These are preferrable for users interested only on mapping or on a restricted area of the globe. Visit the [OGC services catalogue](https://maps.isric.org) page to explore further. + +Questions +--------- + +If questions arise, start by visiting the [SoilGrids FAQ](https://www.isric.org/explore/soilgrids/faq-soilgrids). If it does not answer your question, you may post a question to [GIS.StackExchange](https://gis.stackexchange.com/) under the tag [soilgrids](https://gis.stackexchange.com/tags/soilgrids). ISRIC staff are subscribed to this tag and will be automatically notified of any new question. + + +Contact +------- + +Soilgrids team: soilgrids@isric.org From 95ac20e1a9019f902383e4497bd977ea43656dfd Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 30 Jan 2025 11:10:28 +0100 Subject: [PATCH 06/72] update data info --- Data/Environment/README.md | 51 ++++++-------------------------------- 1 file changed, 7 insertions(+), 44 deletions(-) diff --git a/Data/Environment/README.md b/Data/Environment/README.md index 8e0db43..9fbba7b 100644 --- a/Data/Environment/README.md +++ b/Data/Environment/README.md @@ -1,51 +1,14 @@ -SoilGrids Preview ------------------ +# Environmental data -SoilGrids is a system for global digital soil mapping that uses state-of-the-art machine learning methods to map the spatial distribution of soil properties across the globe. SoilGrids prediction models are fitted using over 230 000 soil profile observations from the WoSIS database and a series of environmental covariates. Covariates were selected from a pool of over 400 environmental layers from Earth observation derived products and other environmental information including climate, land cover and terrain morphology. 
The outputs of SoilGrids are global soil property maps at six standard depth intervals (according to the GlobalSoilMap IUSS working group and its specifications) at a spatial resolution of 250 meters. Prediction uncertainty is quantified by the lower and upper limits of a 90% prediction interval. The SoilGrids maps are publicly available under the [CC-BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). +downloaded with functions run in capfitogen_master.R and defined in SHARED-Data.R. -Maps of the following soil properties are available: pH, soil organic carbon content, bulk density, coarse fragments content, sand content, silt content, clay content, cation exchange capacity (CEC), total nitrogen as well as soil organic carbon density and soil organic carbon stock. +## Bioclimatic variables (BV) -Main improvements ------------------ +A set of 19 bioclimatic variables, downloaded and processed with the KrigR package. -Relative to the previous, the following improvements can be highlighted: +## Edaphic variables (EV) -- Wider selection of soil observations: more profile observations, increased quality assessment and improved and consistent standardisation across the different point datasets. +Soil data downloaded from SoilGrids. Each map occupies ~ 5 GB. "SoilGrids is a system for global digital soil mapping that uses state-of-the-art machine learning methods to map the spatial distribution of soil properties across the globe. SoilGrids prediction models are fitted using over 230 000 soil profile observations from the WoSIS database and a series of environmental covariates. Covariates were selected from a pool of over 400 environmental layers from Earth observation derived products and other environmental information including climate, land cover and terrain morphology. 
The outputs of SoilGrids are global soil property maps at six standard depth intervals (according to the GlobalSoilMap IUSS working group and its specifications) at a spatial resolution of 250 meters. Prediction uncertainty is quantified by the lower and upper limits of a 90% prediction interval. The SoilGrids maps are publicly available under the [CC-BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). Maps of the following soil properties are available: pH, soil organic carbon content, bulk density, coarse fragments content, sand content, silt content, clay content, cation exchange capacity (CEC), total nitrogen as well as soil organic carbon density and soil organic carbon stock." See [SoilGrids FAQ](https://www.isric.org/explore/soilgrids/faq-soilgrids). -- Quantification of prediction uncertainty at pixel level with the 90% prediction interval using Quantile Random Forest. +## Geophysical variables (GV) -- Improved model calibration and cross-validation procedure to better take into account the uneven spatial distribution of data points across the world. - -- Improved covariates selection and model parameter tuning. - -- Texture fractions modelled and mapped not independently from each other, but as compositional data with the sum of the fractions constrained to 100%. - -How to download ---------------- - -In this upcoming SoilGrids release maps are in the [VRT format](https://gdal.org/drivers/raster/vrt.html). Each map is composed by three elements: - -- a master VRT file; -- an OVR file with overviews for swift visualisation; -- a folder with GeoTIFF tiles. - -All elements have the same name, a triplet separated by underscores: `property_depthInterval_quantile` For instance, to download the the 5%-quantile prediction of coarse fragments in the 5 cm to 15 cm depth interval the user must fetch the files `cfvo_5-15cm_Q05.vrt` and `cfvo_5-15cm_Q05.ovr` plus the `cfvo_5-15cm_Q05` folder. - -Each map occupies circa 5 GB. 
The full collection of maps for a single property, with six -standard depth intervals and four quantiles per depth requires about 120 GB. - -Web Services ------------- - -The SoilGrids maps are also available through OGC web services. These are preferrable for users interested only on mapping or on a restricted area of the globe. Visit the [OGC services catalogue](https://maps.isric.org) page to explore further. - -Questions ---------- - -If questions arise, start by visiting the [SoilGrids FAQ](https://www.isric.org/explore/soilgrids/faq-soilgrids). If it does not answer your question, you may post a question to [GIS.StackExchange](https://gis.stackexchange.com/) under the tag [soilgrids](https://gis.stackexchange.com/tags/soilgrids). ISRIC staff are subscribed to this tag and will be automatically notified of any new question. - - -Contact -------- - -Soilgrids team: soilgrids@isric.org From 011f7d338d90f0ddd3d715d9a28a939148f60e0d Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 30 Jan 2025 18:48:47 +0100 Subject: [PATCH 07/72] add back HJ notes into notes script --- R_scripts/capfitogen.R | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/R_scripts/capfitogen.R b/R_scripts/capfitogen.R index ca82f16..7000280 100644 --- a/R_scripts/capfitogen.R +++ b/R_scripts/capfitogen.R @@ -1,6 +1,47 @@ # this script starts with a copy of @trossi's 'capfitogen' script and will serve # as notes and tests for scripting. To be deleted when a full capfitogen pipeline # is in place. +## Heli's handover notes -------------------------------------- +#' Capfitogen R script modifications for CWR BioDT project +#' Heli Juottonen, CSC (heli.juottonen@csc.fi) (please email if any questions!) +#' +#' Main script: capfitogen_master_061124.R +#' +#' sources the following scripts: +#' +#' 1. SHARED-Data.R +#' 1. downloading species data from GBIF: FUN.DownGBIF (from ModGP) +#' 2. 
downloading environmental data (.nc files): +#' FUN.DownBV, bioclimatic data (as in ModGP, modifications needed?) +#' FUN.DownEV, edaphic data (not ready, only a very rough draft) +#' FUN.DownGV, geophysical data (not ready, only a very rough draft) +#' +#' unclear: best way to obtain the data as .nc files? +#' unclear: where to define which specific variables downloaded for each category? +#' unclear: how and where and if to set the geographic area? +#' +#' outputs (=inputs for the next step): +#' sf file of species occurrence: Species_ls$occs +#' raster stacks: bioclim_ras, edaph_ras, geophys_ras +#' +#' 2. VarSelection.R +#' 1. selection of variables separately for each category (bioclimatic, edaphic, geophysical): +#' FUN.VarSelection +#' uses the vifcor approach as ModGP +#' +#' outputs (= inputs for the next step): +#' values of selected variables extracted from raster stacks: bioclim_ext, edaph_ext, geophys_ext +#' +#' 3. ELCMap.R +#' 1. clustering (kmeans/BIC): FUN.KmeansClust +#' run separately for each variable category (bioclimatic, edaphic, geophysical) +#' +#' outputs (= inputs for the next step): bioclim_cl, geophys_cl, edaph_cl +#' (coordinates, cluster membership, extracted environmental variable values) +#' +#' 2. 
creating maps (not ready, requires someone with proper spatial data R skills) +#' +#' #------------------------------------------------- #test downloading soil data from harmonized world soil database v2.0 (hwsd) hwsd_path = file.path(getwd(), "Data", "Environment") From 43e68bb7c51a29d88ffb7d1955af8a9c6c93b8c0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 30 Jan 2025 19:37:11 +0100 Subject: [PATCH 08/72] add back files lost in recent merge --- .gitignore | 3 + R_scripts/3_VarSelection_HJ.R | 37 ++++ R_scripts/4_ELCMap_HJ.R | 406 ++++++++++++++++++++++++++++++++++ R_scripts/README.md | 1 + 4 files changed, 447 insertions(+) create mode 100644 R_scripts/3_VarSelection_HJ.R create mode 100644 R_scripts/4_ELCMap_HJ.R create mode 100644 R_scripts/README.md diff --git a/.gitignore b/.gitignore index b6d08ae..e14151f 100644 --- a/.gitignore +++ b/.gitignore @@ -56,7 +56,9 @@ rsconnect/ *.rds *.tif *.xlsx +*.zip !Data/*/GlobalAreaCRS.RData +Data/Environment/soil/ Logo/* @@ -74,4 +76,5 @@ Logo/* # Downloaded data CAPFITOGEN3/ +scripts/ hq diff --git a/R_scripts/3_VarSelection_HJ.R b/R_scripts/3_VarSelection_HJ.R new file mode 100644 index 0000000..5961b51 --- /dev/null +++ b/R_scripts/3_VarSelection_HJ.R @@ -0,0 +1,37 @@ +#' Variable selection script, copied and modified from CAPFITOGEN +#' +#' to be added: vifcor approach from ModGP + +FUN.VarSelection <- function(specdata = Species_ls$occs, + varstack = NULL, + buf = NULL){ + + vifcor_res <- usdm::vifcor(varstack, th = 0.7) # variable inflation factor + varstack_sel <- usdm::exclude(varstack, vifcor_res) # excluding variables with high cor and vif + + # extracting values from the raster file + + if(missing(buf)) { + pnt <- sf::st_coordinates(specdata) # pnt <- sf::st_coordinates(occ_ls) + values <- terra::extract(varstack_sel, pnt) + #values <- exactextractr::exact_extract(varstack_sel, occ_ls) # HJ: a lot faster than terra but gives an error: 'Unsupported geometry' + } else { + pntbuf <- sf::st_buffer(specdata, 
dist = buf) # pntbuf <- sf::st_buffer(occ_ls, dist = buf) + st_crs(pntbuf) <- 4326 # HJ: without this, warning about 'no CRS specified'. How to do this properly? + values <- exactextractr::exact_extract(varstack_sel, pntbuf, 'mean') # HJ: is mean correct here? + # HJ: terra not used here because gets stuck with buffer, probably I'm misunderstanding something + } + + # correcting column names + colnames(values) <- names(varstack_sel) + + # adding coordinates + ext_values <- as.data.frame(row.names(values), nm = "pointid") + ext_values <- cbind(ext_values, st_coordinates(specdata), values) + #ext_values <- cbind(pnt, values) + #ext_values <- na.omit(ext_values) + + ext_values + +} + diff --git a/R_scripts/4_ELCMap_HJ.R b/R_scripts/4_ELCMap_HJ.R new file mode 100644 index 0000000..d32b73f --- /dev/null +++ b/R_scripts/4_ELCMap_HJ.R @@ -0,0 +1,406 @@ +#' ############################################################################## +#' In this script: Finding the optimal number of clusters for each +#' ecogeographic component, and performing clustering +#' +#' Script copied and modified from CAPFITOGEN ELCmapas.R +#' +#' AUTHORS: [Mauricio Parra Quijano, Heli Juottonen, Eva Lieungh] +#' Capfitogen credit: Parra-Quijano et al. 
2021, +#' https://repositorio.unal.edu.co/handle/unal/85787 +#' +#' HJ: originally in Capfitogen six methods, here only kmeans/BIC) +#' HJ: everything here from Capfitogen scripts, just simplified/ +#' streamlined/turned into functions +#' ############################################################################## + +# Method of successive K-means and BIC criteria ================================ +#' HJ: ext_values = bioclim_ext/geophys_ext/edaph_ext +#' HJ: max_clusters = maximum number of clusters (was 8) +#' HJ: vartype: edaph / geopys / bioclim +{ + FUN.KmeansClust <- function(ext_values, max_clusters, vartype) { + # standardization (rescaling) the variables (mean zero, sd=1) + # clustering is run on standardized variables, actual values more useful later) + ext_values <- na.omit(ext_values) + stand_values <- terra::scale(ext_values[, 4:length(ext_values)]) + stand_values <- data.frame(ext_values[, 1], stand_values) + colnames(stand_values)[1] <- "pointid" + stand_values <- na.omit(stand_values) # place? needed? + + # HJ: missing: possibility to use latitude and longitude as variables for geophys data - is this needed? 
+ # below how this was handled in Capfitogen scripts + + # ext_values <- raster::extract(variablestack, puntos[, c("POINT_X","POINT_Y")]) + # ext_values <- cbind(puntos[, 1], bioclim) + # colnames(var)[1] <- "POINTID" + # + # if(longitud){ + # var <- cbind(var, puntos[,2]) + # colnames(var)[ncol(var)] <- "LONGITUD" + # } + # if(latitud){ + # var <- cbind(var, puntos[,3]) + # colnames(var)[ncol(var)] < -"LATITUD" + # } + + # clustering + fitb <- find.clusters(stand_values[, -1], + stat = "BIC", + choose.n.clust = FALSE, + criterion = "goodfit", + max.n.clust = max_clusters, + center = TRUE, + scale = TRUE, + pca.select = "percVar", + perc.pca = 75 + ) + VARCLUST <- as.numeric(fitb$grp) + stand_values$VARCLUST <- VARCLUST + ext_values <- merge(ext_values, + stand_values[, c("pointid", "VARCLUST")], + by = "pointid" + ) + colnames(ext_values)[colnames(ext_values) == "VARCLUST"] <- + paste0(vartype, "_clust") + VARCLUST <- length(unique(VARCLUST)) + + # save jpeg + jpeg(file = file.path( + Dir.Results, + paste0(vartype, "_BIC_optimal_N_clusters.jpeg") + )) + plot(fitb$Kstat, + type = "o", + xlab = "number of clusters (K)", + ylab = "BIC", + col = "blue", + main = paste("Detection based on BIC. Optimal value=", + round(fitb$stat, 1), + sep = "" + ) + ) + points(VARCLUST, + fitb$Kstat[VARCLUST], + pch = "x", cex = 2 + ) + dev.off() + + ext_values + } +} + +# 9. Combining clusters from each component to obtain ecogeographical categories (bioclimatic, edaphic and geophysical unique conditions) +# 10. Creating each component and the combination (ELC) map + +{ + ######################################################################## + ################## End of clustering ################################# + + # HJ: input: edaph_cl, geophys_cl, bioclim_cl + # HJ: output: 3-4 maps, summary data? 
+ + FUN.ELCmaps <- function(edaph = edaph_cl, + bioclim = bioclim_cl, + geophys = geophys_cl) { + # Consolidaci?n de tabla ?nica a trav?s de tabla puntos + tabla <- data.frame(edaph) # , geophys$geophys_clust, bioclim$bioclim_clust) + tabla <- merge(tabla, geophys, + by = "pointid", all.x = T + ) + tabla <- merge(tabla, bioclim, + by = "pointid", all.x = T + ) + # rm(bioclim,geophys,edaph,puntos) + mapaelc <- as.data.frame(matrix( + nrow = length(tabla[, 1]), + ncol = 2 + )) + mapaelc[, 1] <- tabla[, 1] + colnames(mapaelc)[1] <- "pointid" + colnames(mapaelc)[2] <- "combi" + for (i in 1:length(tabla[, 1])) { + mapaelc[i, 2] <- + ifelse(is.na(substr( + tabla$bioclim_clust[i], 1, 1 + )) | is.na(substr(tabla$geophys_clust[i],1, 1 + )) | is.na(substr(tabla$edaph_clust[i],1, 1 + )), NA, + paste(substr(tabla$bioclim_clust[i], 1, 1), + substr(tabla$geophys_clust[i], 1, 1), + substr(tabla$edaph_clust[i], 1, 1), + sep = "") + ) + } + elc <- subset(mapaelc, !duplicated(combi), select = -pointid) + elc <- subset(elc, !is.na(combi)) + elc <- elc[order(elc$combi), , drop = FALSE] + # elc <- elc[i,] + + # Assign number to each category + elc[, 2] <- 1:nrow(elc) + # mapaelc <- mapaelc[,1:2] + # Assignment + mapaelc <- merge(mapaelc, elc, by = "combi") + colnames(mapaelc)[3] <- "elc_cat" + tabla <- merge(tabla, mapaelc, by = "pointid", all.x = T) + tabla$elc_cat[is.na(tabla$elc_cat)] <- 0 + tabla$bioclim_clust[is.na(tabla$bioclim_clust)] <- 0 + tabla$geophys_clust[is.na(tabla$geophys_clust)] <- 0 + tabla$edaph_clust[is.na(tabla$edaph_clust)] <- 0 + + # Creating ELC raster map + mapaelc0 <- raster(matrix(nrow = (dim(bioclim_ras)[1]), + ncol = (dim(bioclim_ras)[2])), + template = bioclim_ras) + # HJ: unused argument: template ??? wrong type? 
+ mapaelc1 <- rasterize(cbind(tabla[, 2], tabla[, 3]), + mapaelc0, + field = tabla$elc_cat) + mapaelc2 <- rasterize(cbind(tabla[, 2], tabla[, 3]), + mapaelc0, + field = tabla$bioclim_clust) + mapaelc3 <- rasterize(cbind(tabla[, 2], tabla[, 3]), + mapaelc0, + field = tabla$geophys_clust) + mapaelc4 <- rasterize(cbind(tabla[, 2], tabla[, 3]), + mapaelc0, + field = tabla$edaph_clust) + + # 11. Characterizing each final cluster by the original variables (not rescaled) + # 12. Exporting tables and maps in different formats + + # HJ: from here on things start to go wrong, more GIS experience needed + + crs(mapaelc1) <- "+proj=longlat" + crs(mapaelc2) <- "+proj=longlat" + crs(mapaelc3) <- "+proj=longlat" + crs(mapaelc4) <- "+proj=longlat" + + writeRaster(mapaelc1, + filename = paste(resultados, "/mapa_elc_", + pais, ".grd", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc1, + filename = paste(resultados, "/mapa_elc_DIVA_", + pais, ".grd", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc1, filename = paste(resultados, "/mapa_elc_", + pais, ".tif", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc2, + filename = paste(resultados, "/mapa_bioclimatico_", + pais, ".grd", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc2, + filename = paste(resultados, "/mapa_bioclimatico_", + pais, ".tif", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc3, + filename = paste(resultados, "/mapa_geofisico_", + pais, ".grd", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc3, + filename = paste(resultados, "/mapa_geofisico_", + pais, ".tif", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc4, + filename = paste(resultados, "/mapa_edafico_", + pais, ".grd", sep = ""), + overwrite = T, datatype = "FLT4S") + + writeRaster(mapaelc4, + filename = paste(resultados, "/mapa_edafico_", + pais, ".tif", sep = ""), + overwrite = T, datatype = "FLT4S") + 
+ writeRaster(mapaelc2, + filename = "testibioclim.tif", + overwrite = T, datatype = "FLT4S") + + # HJ: above doesn't work, probably because of my limited GIS skills. Gave the error below: + + # Error in plot.window(...) : need finite 'ylim' values + # In addition: Warning messages: + # 1: In xy.coords(x, y, xlabel, ylabel, log) : NAs introduced by coercion + # 2: In min(x) : no non-missing arguments to min; returning Inf + # 3: In max(x) : no non-missing arguments to max; returning -Inf + + # HJ: script from here on creates tables with descriptive statistics of the environmental data + # HJ : creates a lot of tables, which ones of them are needed? + + # OBJETO SALIDA 3 + # tables of statistics for each component + tablabio <- data.frame(table(bioclim$BIOCLUST)) + colnames(tablabio)[1] <- "BIOCLIM_CAT" + tablabioclim <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], + by = list(bioclim$BIOCLUST), mean, na.rm = TRUE) + colnames(tablabioclim)[1] <- "BIOCLIM_CAT" + tablabioclim1 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], + by = list(bioclim$BIOCLUST), min, na.rm = TRUE) + colnames(tablabioclim1)[1] <- "BIOCLIM_CAT" + tablabioclim2 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], + by = list(bioclim$BIOCLUST), max, na.rm = TRUE) + colnames(tablabioclim2)[1] <- "BIOCLIM_CAT" + tablabioclim3 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], + by = list(bioclim$BIOCLUST), sd, na.rm = TRUE) + colnames(tablabioclim3)[1] <- "BIOCLIM_CAT" + tablabioclim <- merge(tablabio, tablabioclim, by = "BIOCLIM_CAT") + tablabioclim <- merge(tablabioclim, tablabioclim1, + by = "BIOCLIM_CAT", suffixes = c(".media", ".min")) + tablabioclim2 <- merge(tablabioclim2, tablabioclim3, + by = "BIOCLIM_CAT", suffixes = c(".max", ".sd")) + tablabioclim <- merge(tablabioclim, tablabioclim2, + by = "BIOCLIM_CAT") + + write.table(tablabioclim, + file = paste(resultados, "/Estadist_BIOCLIM_", + pais, ".txt", sep = ""), sep = "\t", + row.names = FALSE, qmethod = "double") + + 
tablageo <- data.frame(table(geophys$GEOCLUST)) + colnames(tablageo)[1] <- "GEOPHYS_CAT" + tablageophys <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], + by = list(geophys$GEOCLUST), mean, na.rm = TRUE) + colnames(tablageophys)[1] <- "GEOPHYS_CAT" + tablageophys1 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], + by = list(geophys$GEOCLUST), min, na.rm = TRUE) + colnames(tablageophys1)[1] <- "GEOPHYS_CAT" + tablageophys2 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], + by = list(geophys$GEOCLUST), max, na.rm = TRUE) + colnames(tablageophys2)[1] <- "GEOPHYS_CAT" + tablageophys3 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], + by = list(geophys$GEOCLUST), sd, na.rm = TRUE) + colnames(tablageophys3)[1] <- "GEOPHYS_CAT" + tablageophys <- merge(tablageo, tablageophys, + by = "GEOPHYS_CAT") + tablageophys <- merge(tablageophys, tablageophys1, + by = "GEOPHYS_CAT", suffixes = c(".media", ".min")) + tablageophys2 <- merge(tablageophys2, tablageophys3, + by = "GEOPHYS_CAT", suffixes = c(".max", ".sd")) + tablageophys <- merge(tablageophys, tablageophys2, + by = "GEOPHYS_CAT") + + write.table(tablageophys, + file = paste(resultados, "/Estadist_GEOPHYS_", + pais, ".txt", sep = ""), sep = "\t", + row.names = FALSE, qmethod = "double") + + tablaeda <- data.frame(table(edaph$EDACLUST)) + colnames(tablaeda)[1] <- "EDAPH_CAT" + tablaedaph <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], + by = list(edaph$EDACLUST), mean, na.rm = TRUE) + colnames(tablaedaph)[1] <- "EDAPH_CAT" + tablaedaph1 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], + by = list(edaph$EDACLUST), min, na.rm = TRUE) + colnames(tablaedaph1)[1] <- "EDAPH_CAT" + tablaedaph2 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], + by = list(edaph$EDACLUST), max, na.rm = TRUE) + colnames(tablaedaph2)[1] <- "EDAPH_CAT" + tablaedaph3 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], + by = list(edaph$EDACLUST), sd, na.rm = TRUE) + colnames(tablaedaph3)[1] <- "EDAPH_CAT" + tablaedaph <- 
merge(tablaeda, tablaedaph, + by = "EDAPH_CAT") + tablaedaph <- merge(tablaedaph, tablaedaph1, + by = "EDAPH_CAT", suffixes = c(".media", ".min")) + tablaedaph2 <- merge(tablaedaph2, tablaedaph3, + by = "EDAPH_CAT", suffixes = c(".max", ".sd")) + tablaedaph <- merge(tablaedaph, tablaedaph2, + by = "EDAPH_CAT") + + write.table(tablaedaph, + file = paste(resultados, "/Estadist_EDAPH_", + pais, ".txt", sep = ""), + sep = "\t", row.names = FALSE, qmethod = "double") + write.table(tabla, + file = paste(resultados, "/Tabla_ELC_celdas_", + pais, ".txt", sep = ""), sep = "\t", + row.names = FALSE, qmethod = "double") + + if (any(unique(tabla$BIOCLUST) == 0)) { + N_bioclust <- length(unique(tabla$BIOCLUST)) - 1 + } + if (all(unique(tabla$BIOCLUST) > 0)) { + N_bioclust <- length(unique(tabla$BIOCLUST)) + } + if (any(unique(tabla$GEOCLUST) == 0)) { + N_geoclust <- length(unique(tabla$GEOCLUST)) - 1 + } + if (all(unique(tabla$GEOCLUST) > 0)) { + N_geoclust <- length(unique(tabla$GEOCLUST)) + } + if (any(unique(tabla$EDACLUST) == 0)) { + N_edaclust <- length(unique(tabla$EDACLUST)) - 1 + } + if (all(unique(tabla$EDACLUST) > 0)) { + N_edaclust <- length(unique(tabla$EDACLUST)) + } + if (any(unique(tabla$ELC_CAT) == 0)) { + N_ELC_CAT <- length(unique(tabla$ELC_CAT)) - 1 + } + if (all(unique(tabla$ELC_CAT) > 0)) { + N_ELC_CAT <- length(unique(tabla$ELC_CAT)) + } + NCATS <- as.data.frame(cbind(N_ELC_CAT, N_bioclust, N_geoclust, N_edaclust)) + + # OBJETO SALIDA 4 + write.table(NCATS, + file = paste(resultados, "/numero_categorias_", + pais, ".txt", sep = ""), sep = "\t", + row.names = FALSE, qmethod = "double") + + ## Obtain descriptive statistics para cada categor?a ELc + nbioclim <- length(bioclimv) + ngeophys <- vector(mode = "numeric", length = 1) + for (i in 1:1) { + ngeophys <- ifelse(all(latitud, longitud), + length(geophysv) + 2, + ifelse(any(latitud, longitud), + length(geophysv) + 1, length(geophysv))) + } + nedaph <- length(edaphv) + tabla <- tabla[, c(ncol(tabla), + 
ncol(tabla) - 1,
+                    4:(3 + nbioclim),
+                    (5 + nbioclim):(4 + nbioclim + ngeophys),
+                    (6 + nbioclim + ngeophys):(ncol(tabla) - 3))]
+  media <- aggregate(tabla[, c(-1, -2)],
+                     by = list(tabla$ELC_CAT), FUN = "mean")
+  colnames(media)[1] <- "ELC_CAT"
+  mediana <- aggregate(tabla[, c(-1, -2)],
+                       by = list(tabla$ELC_CAT), FUN = "median")
+  colnames(mediana)[1] <- "ELC_CAT"
+  maximo <- aggregate(tabla[, c(-1, -2)],
+                      by = list(tabla$ELC_CAT), FUN = "max")
+  colnames(maximo)[1] <- "ELC_CAT"
+  minimo <- aggregate(tabla[, c(-1, -2)],
+                      by = list(tabla$ELC_CAT), FUN = "min")
+  colnames(minimo)[1] <- "ELC_CAT"
+  desvest <- aggregate(tabla[, c(-1, -2)],
+                       by = list(tabla$ELC_CAT), FUN = "sd")
+  colnames(desvest)[1] <- "ELC_CAT"
+
+  # Tabla de unificación de estadísticos
+  estad <- merge(media, mediana, by = "ELC_CAT", suffixes = c(".media", ".mediana"))
+  estad1 <- merge(maximo, minimo, by = "ELC_CAT", suffixes = c(".maximo", ".minimo"))
+  estad <- merge(estad, estad1, by = "ELC_CAT")
+  aaa <- "sd"
+  for (i in 2:length(desvest[1, ])) {
+    colnames(desvest)[i] <- paste(colnames(desvest)[i], aaa, sep = ".")
+  }
+  estad <- merge(estad, desvest, by = "ELC_CAT")
+  # Export descriptive statistics
+  write.table(estad, file = paste(resultados, "/Estadist_ELC_", pais, ".txt", sep = ""), sep = "\t", row.names = FALSE, qmethod = "double")
+
+  # Export table de equivalencias Combinación (Bio-Geo-Eda) y categorías del mapa final
+  colnames(elc)[2] <- "ELC_CAT"
+  write.table(elc, file = paste(resultados, "/Combi_ELC_", pais, ".txt", sep = ""), sep = "\t", row.names = FALSE, qmethod = "double")
+  }
+}
diff --git a/R_scripts/README.md b/R_scripts/README.md
new file mode 100644
index 0000000..891ce9c
--- /dev/null
+++ b/R_scripts/README.md
@@ -0,0 +1 @@
+This directory contains R scripts to be sourced by master scripts in the repository root (i.e. one level up in the folder structure).
\ No newline at end of file From 6667db830a6d47e916ab30259d344dd24fcda381 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 4 Feb 2025 17:37:43 +0100 Subject: [PATCH 09/72] update draft code for soil/edaphic data --- ModGP MASTER.R => ModGP_MASTER.R | 2 +- README.md | 2 +- R_scripts/SHARED-Data.R | 108 +++++++++++++++++++++++++------ R_scripts/capfitogen.R | 4 +- capfitogen_master.R | 32 ++++++--- 5 files changed, 114 insertions(+), 34 deletions(-) rename ModGP MASTER.R => ModGP_MASTER.R (99%) diff --git a/ModGP MASTER.R b/ModGP_MASTER.R similarity index 99% rename from ModGP MASTER.R rename to ModGP_MASTER.R index 53d53f6..4d6d381 100644 --- a/ModGP MASTER.R +++ b/ModGP_MASTER.R @@ -3,7 +3,7 @@ #' CONTENTS: #' - Execution of ModGP pipeline #' DEPENDENCIES: -#' - R Scripts directory containing: +#' - R_scripts directory containing: #' - "ModGP-Outputs.R" #' - "ModGP-SDM.R" #' - "SHARED-APICredentials.R" diff --git a/README.md b/README.md index 59a9a1d..15b9301 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ## ModGP on Rstudio -1. Source `ModGP MASTER.R` and change `SPECIES` argument at line 19 to execute ModGP pipeline for a specific genus. +1. Source `ModGP_MASTER.R` and change `SPECIES` argument at line 19 to execute ModGP pipeline for a specific genus. NB! ModGP should be run on a supercomputer. The environmental data download has very large interim files (>40GB per year per variable, >200 GB overall), and the distribution modelling also requires a long time to run. 
## ModGP on LUMI with Hyperqueue diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index a28e639..dbbb093 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -371,48 +371,116 @@ FUN.DownBV <- function( FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk Force = FALSE, # do not overwrite already present data, - ){ + resample_to_match = FALSE){ + # define a file name FNAME <- file.path(Dir, "edaphic.nc") - # check if file exists and whether to overwrite + # check if file already exists and whether to overwrite if(!Force & file.exists(FNAME)){ EV_ras <- stack(FNAME) #names(EV_ras) <- paste0("BIO", 1:19) # replace with edaphic names vector message("Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") return(EV_ras) } - - if(!file.exists( - - )){ - ## downloading + # if the file doesn't already exist: + ## downloading data from SoilGrids + if(!file.exists(FNAME)){ + message("Start downloading data from SoilGrids") + soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" + #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean + #' NB! Each global map occupies circa 5 GB! It takes a while to download. + #' in addition, https://files.isric.org/soilgrids/latest/data/wrb/ + #' has maps of soil types, as estimated probability of occurrence per type. + #' MostProbable.vrt has the most probable soil type per gridcell. 
+  SoilGrids_variables_in <- c(
+    "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³
+    "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg
+    "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰)
+    "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg
+    "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg
+    "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg
+    "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg
+    "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10
+    "ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³
+    "ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha
+    "soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg
+  SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in)
+  soilGrids_data <- stack()
+  for (i in seq_along(SoilGrids_variables_in)) {
+    variable_name = SoilGrids_variables[i]
+    soilGrids_data[[i]] <- gdal_translate(
+      src_dataset = paste0(soilGrids_url, SoilGrids_variables_in[i], ".vrt"), # input to be downloaded
+      dst_dataset = paste0(Dir.Data.Envir, "/", SoilGrids_variables[i], ".tif"), # output file
+      tr = c(2500,2500) # target resolution
+    )
+  }
+  ## downloading data from HSWD
+  message("Start downloading data from HSWD (harmonised world soil database)")
+  PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc")
+  PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc")
+  HSWD_PH_stack <- stack(PH_nutrient, PH_toxicity)
+  ## combine and rename rasters
+  EV_stack <- stack(HSWD_PH_stack, soilGrids_data)
+  names(EV_stack) <- c("Nutrient",
+                       "Toxicity",
+                       SoilGrids_variables)
+
+  ## resample to match a provided raster object's origin and resolution
+  if(!resample_to_match){
+ EV_stack <- raster::resample(HSWD_PH_stack, # rasters to be resampled + resample_to_match) # raster with parameters to be resampled to + } } - evarg <- c(arg1, arg2, arg3) - # evargs <- commandArgs(trailingOnly = TRUE) - #evarg <- c("soc", "silt") - edaph_ras <- soil_world(evarg, depth = 5, path = file.path(Dir.Data.Envir, "Edaphic")) - names(edaph_ras) <- evarg - ### Masking ---- + ### Saving ---- + terra::writeCDF(EV_stack, filename = FNAME, overwrite = FALSE) + #unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) + + EV_stack + +} + +### Masking ---- +# whole world +#Land_sp <- ne_countries(type = "countries", scale = "medium") + +# HJ: for testing/ to match previous Capfitogen tests: only Spain +# HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) +# please switch back to KrigR is wanted/needed +# Land_sp <- ne_states("Spain") +# edaph_ras <- crop(edaph_ras, terra::ext(Land_sp)) +# edaph_ras <- terra::mask(edaph_ras, vect(Land_sp)) +# BV_ras <- crop(BV_ras, extent(Land_sp)) +# BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) +# BV_ras <- mask(BV_ras, BV_mask) + +# HJ: missing: data source, download function for .nc files, where to set the selected variables + +FUN.DownGV <- function(arg1, arg2){ + + FNAME <- file.path(Dir.Data.Envir, "geophys.nc") + + evarg <- c(arg1, arg2) + geophys_ras <- ??? 
file.path(Dir.Data.Envir, "Geophysical") + # whole world #Land_sp <- ne_countries(type = "countries", scale = "medium") - # HJ: for testing/ to match previous Capfitogen tests: only Spain # HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) # please switch back to KrigR is wanted/needed Land_sp <- ne_states("Spain") - edaph_ras <- crop(edaph_ras, terra::ext(Land_sp)) - edaph_ras <- terra::mask(edaph_ras, vect(Land_sp)) + geophys_ras <- crop(geophys_ras, terra::ext(Land_sp)) + geophys_ras <- terra::mask(geophys_ras, vect(Land_sp)) # BV_ras <- crop(BV_ras, extent(Land_sp)) # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) # BV_ras <- mask(BV_ras, BV_mask) ### Saving ---- - terra::writeCDF(edaph_ras, filename = FNAME, overwrite = TRUE) - unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) + terra::writeCDF(geophys_ras, filename = FNAME, overwrite = TRUE) + unlink(file.path(Dir.Data.Envir, "Geophysical")) - edaph_ras + geophys_ras -} +} \ No newline at end of file diff --git a/R_scripts/capfitogen.R b/R_scripts/capfitogen.R index 7000280..b48f51c 100644 --- a/R_scripts/capfitogen.R +++ b/R_scripts/capfitogen.R @@ -93,8 +93,8 @@ soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files. #' MostProbable.vrt has the most probable soil type per gridcell. #' # gdal_translate() converts raster data between different formats. -soilGrids_data <- gdal_translate(paste0(soilGrids_url, - 'ocs/ocs_0-30cm_mean.vrt'), +soilGrids_data <- gdal_translate( + src_dataset = paste0(soilGrids_url,'ocs/ocs_0-30cm_mean.vrt'), paste0(Dir.Data.Envir, "/crop_roi_igh_r.tif"), tr = c(2500,2500) # target resolution diff --git a/capfitogen_master.R b/capfitogen_master.R index 0c5ea37..f566d4c 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -11,9 +11,9 @@ #' - R_Scripts directory containing: #' - "MoDGP-commonlines.R" #' - "SHARED-APICredentials.R" -- NB! 
internal to project members, ask for access -#' - "SHARED-Data_CAPFITOGEN.R" -#' - "SelectVar.R" -- CAPFITOGEN tool -#' - +#' - "SHARED-Data.R" +#' - "ELCmaps" - CAPFITOGEN tool +#' - "Complementa" - CAPFITOGEN tool #' AUTHORS: [Erik Kusch, Heli Juottonen, Eva Lieungh] #' Capfitogen credit: Parra-Quijano et al. 2021, #' https://repositorio.unal.edu.co/handle/unal/85787 @@ -62,11 +62,11 @@ message(sprintf("numberOfCores = %d", numberOfCores)) # DATA ==================================================================== ## Run SHARED-Data script ------------------------------------------------- -## defines FUN.DownGBIF(), FUN.DownBV() +## defines FUN.DownGBIF(), FUN.DownBV(), FUN.DownEV() source(file.path(Dir.R_scripts, "SHARED-Data.R")) ## GBIF Data -------------------------------------------------------------- -message("Retrieving GBIF data") +message("Downloading new or loading existing GBIF data") ## species of interest Species_ls <- FUN.DownGBIF( species = SPECIES, # which species to pull data for @@ -77,21 +77,33 @@ Species_ls <- FUN.DownGBIF( ) ## Environmental Data ----------------------------------------------------- -## Bioclomatic data: 19 BioClim variables -message("Retrieving bioclimatic variables") # NB! slow +##' Bioclomatic data: 19 BioClim variables +##' is each file of each variable >20GB? 
+message("Downloading new or loading existing 19 BioClim bioclimatic variables") bioclim_data <- FUN.DownBV( - T_Start = 1995, # what year to begin climatology calculation in - T_End = 2015, # what year to end climatology calculation in + T_Start = 2000, # what year to begin climatology calculation in + T_End = 2001, # what year to end climatology calculation in Dir = Dir.Data.Envir, # where to store the data output on disk Force = FALSE # do not overwrite already present data ) ## Edaphic data: +message("Retrieving edaphic variables") edaphic_data <- FUN.DownEV( Dir = Dir.Data.Envir, - Force = FALSE + Force = FALSE, + resample_to_match = bioclim_data[[1]] ) +# PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") +# PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") +# PH_stack <- stack(PH_nutrient, PH_toxicity) +# PH_stack <- raster::resample(PH_stack, # rasters to be resampled +# bioclim_data[[1]]) # raster with parameters to be resampled to +# PH_stack <- stack(PH_stack, bioclim_data$BIO1, bioclim_data$BIO12) +# names(PH_stack) <- c("Nutrient", "Toxicity", "Temperature", "Soil Moisture") + + #' existing data in "Data/Environment/BV-1985-2015.nc" #' and soil data in .bil under /soil downloaded from Harmonized World #' Soil Database version 2.0 From 6d78db51cd4bfbcaa0fbe8ae21a2ab9702f20012 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 5 Feb 2025 18:26:31 +0100 Subject: [PATCH 10/72] add ERA5 citation, debug functions --- R_scripts/ModGP-commonlines.R | 1 + R_scripts/SHARED-Data.R | 198 ++++++++++++++++++++++------------ capfitogen_master.R | 37 ++----- 3 files changed, 139 insertions(+), 97 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 18a1909..c120d7a 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -19,6 +19,7 @@ package_vec <- c( 'exactextractr', # HJ: added to 
solve extraction problems #'geodata', # HJ: added to get soil data for testing 'ggplot2', # ggplot machinery + 'ggpp', 'ggpmisc', # table plotting in ggplot environment 'ggpubr', # t-test comparison in ggplot 'gridExtra', # ggplot saving in PDF diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index dbbb093..89897dd 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -9,22 +9,24 @@ #' ####################################################################### # # GBIF DOWNLOAD FUNCTION -------------------------------------------------- -# queries download from GBIF, handles and cleans data, returns SF MULTIPOINT object and GBIF download metadata -FUN.DownGBIF <- function(species = NULL, # species name as character for whose genus data is to be downloaded - Dir = getwd(), # where to store the data - Force = FALSE, # overwrite existing data? - Mode = "ModGP", # which specification to run, either for whole GENUS of supplied species (ModGP), or for species directly (Capfitogen) - parallel = 1 # an integer, 1 = sequential; always defaults to sequential when Mode == "Capfitogen" +#' queries download from GBIF, handles and cleans data, +#' returns SF MULTIPOINT object and GBIF download metadata +FUN.DownGBIF <- function( + species = NULL, # species name as character for whose genus data is to be downloaded + Dir = getwd(), # where to store the data + Force = FALSE, # overwrite existing data? 
+ Mode = "ModGP", # which specification to run, either for whole GENUS of supplied species (ModGP), or for species directly (Capfitogen) + parallel = 1 # an integer, 1 = sequential; always defaults to sequential when Mode == "Capfitogen" ){ ## Preparing species name identifiers input_species <- species - ## Focussing on Genus-part of the name if Mode is set to ModGP + ## Focusing on Genus-part of the name if Mode is set to ModGP if(Mode == "ModGP"){ species <- strsplit(input_species, " ")[[1]][1] } - ## Filename and data presence check + ## File name and data presence check FNAME <- file.path(Dir, paste0(species, ".RData")) if(!Force & file.exists(FNAME)){ save_ls <- loadObj(FNAME) @@ -34,8 +36,7 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g ## Function start message("Starting GBIF data retrieval") - ## GBIF ID Query ---- - ## GBIF query + ## GBIF taxa Query ---- if(Mode == "ModGP"){ message(paste("## Resolving", species, "at genus level")) RankGBIF <- "genus" @@ -69,8 +70,10 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g pred("occurrenceStatus", "PRESENT"), format = "SIMPLE_CSV") curlopts <- list(http_version = 2) # needed on Mac to avoid HTTP issues in next line (see here: https://github.com/ropensci/rgbif/issues/579) - occ_meta <- occ_download_wait(occ_down, status_ping = 30, - curlopts = list(), quiet = FALSE) # wait for download to finish + occ_meta <- occ_download_wait(occ_down, + status_ping = 30, + curlopts = list(), + quiet = FALSE) # wait for download to finish occ_get <- occ_download_get(occ_down, path = Dir) # download data curlopts <- list(http_version = 1) # resetting this to not affect other functions @@ -80,16 +83,20 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g ## Manipulating GBIF Data ---- ### Resolving Common Issues ---- - message("Resolving Common Data Issues") + message("Resolving Common Data Issues. 
Removing occurrences") ## removing bases of record that may not be linked to coordinates properly + message("... that may not be linked to coordinates properly") occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", "MATERIAL_CITATION"), ] ## removing highly uncertain locations, i.e., anything more than 1km in uncertainty + message("... with >1km uncertainty") occ_occ <- occ_occ[occ_occ$coordinateUncertaintyInMeters <= 1000, ] ## removing rounded coordinates + message("... with rounded coordinates") occ_occ <- occ_occ[-grep(occ_occ$issue, pattern = "COORDINATE_ROUNDED"), ] ## removing empty species rows + message("... with empty species rows") occ_occ <- occ_occ[occ_occ$species != "" & !is.na(occ_occ$species), ] ### Parallel Set-Up ---- @@ -117,38 +124,49 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g message("Extracting species-level data into MULTIPOINT objects") GBIF_specs <- unique(occ_occ$species) - ## Making a list of spatialfeatures MULTIPOINT objects denoting unique locations of presence per species - specs_ls <- pblapply(GBIF_specs, - cl = parallel, - FUN = function(x){ - spec_df <- occ_occ[occ_occ$species == x, ] - spec_uniloca <- occ_occ[occ_occ$species == x, - c("species", - "decimalLatitude", - "decimalLongitude")] - spec_df <- spec_df[!duplicated(spec_uniloca), - c("gbifID", - "datasetKey", - "occurrenceID", - "species", - "scientificName", - "speciesKey", - "decimalLatitude", - "decimalLongitude", - "coordinateUncertaintyInMeters", - "eventDate", - "basisOfRecord", - "recordNumber", - "issue")] - spec_df$presence <- 1 - st_as_sf(spec_df, - coords = c("decimalLongitude", - "decimalLatitude")) - }) + ##' Making a list of spatialfeatures MULTIPOINT objects + ##' denoting unique locations of presence per species + specs_ls <- pblapply( + GBIF_specs, + cl = parallel, + FUN = function(x) { + ## dataframe of occurrences + spec_df <- occ_occ[occ_occ$species == x,] + ## unique locations + spec_uniloca <- 
occ_occ[occ_occ$species == x, + c("species", + "decimalLatitude", + "decimalLongitude")] + ## remove duplicates (of populations) + spec_df <- spec_df[!duplicated(spec_uniloca), + c("gbifID", + "datasetKey", + "occurrenceID", + "species", + "scientificName", + "speciesKey", + "decimalLatitude", + "decimalLongitude", + "coordinateUncertaintyInMeters", + "eventDate", + "basisOfRecord", + "recordNumber", + "issue" + )] + spec_df$presence <- 1 + st_as_sf(spec_df, + coords = c("decimalLongitude", + "decimalLatitude")) + } + ) names(specs_ls) <- GBIF_specs ## Making list into single data frame when Capfitogen mode is toggled on. + # HJ: section below to create a Capfitogen data frame not used + # species data included as the sf file created above if(Mode == "Capfitogen"){ + message("Making data for Capfitogen mode") + message(FNAME) specs_ls <- specs_ls[[1]] ## create capfitogen data frame CapfitogenColumns <- c("INSTCODE", @@ -168,10 +186,15 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g "ACQDATE", "ORIGCTY", "NAMECTY", - "ADM1", "ADM2", "ADM3", "ADM4", + "ADM1", + "ADM2", + "ADM3", + "ADM4", "COLLSITE", - "DECLATITUDE", "LATITUDE", - "DECLONGITUDE", "LONGITUDE", + "DECLATITUDE", + "LATITUDE", + "DECLONGITUDE", + "LONGITUDE", "COORDUNCERT", "COORDDATUM", "GEOREFMETH", @@ -196,18 +219,22 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g ncol = length(CapfitogenColumns))) colnames(CapfitogenData) <- CapfitogenColumns ## Create unique rownames for the ACCENUMB - CapfitogenData$ACCENUMB <- seq(from = 1, to = nrow(CapfitogenData), by = 1) + CapfitogenData$ACCENUMB <- seq(from = 1, + to = nrow(CapfitogenData), + by = 1) ## Add in the species, latitude and longitude (nothing else at this point) CapfitogenData$SPECIES <- specs_ls$species CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] - specs_ls <- CapfitogenData + specs_ls_capfitogen 
<- CapfitogenData } ### Returning Object to Disk and Environment ---- + ifelse(Mode == "Capfitogen", + occs = specs_ls_capfitogen, + occs = specs_ls) save_ls <- list(meta = occ_meta, - occs = specs_ls - # , + occs = occs # json = JSON_ls ) @@ -215,27 +242,42 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g unlink(occ_get) # removes .zip file ### JSON RO-CRATE creation ---- + message("Create .json RO-crate (research object) metadata") JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- paste0("https://www.gbif.org/species/", tax_ID) # gbif ID - JSON_ls$`@graph`[[2]]$creator$`@id` <- c(JSON_ls$`@graph`[[2]]$creator$`@id`, as.character(options("gbif_email"))) - JSON_ls$`@graph`[[2]]$author$`@id` <- c(JSON_ls$`@graph`[[2]]$author$`@id`, as.character(options("gbif_email"))) + JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- paste0("https://www.gbif.org/species/", + tax_ID) # gbif taxon ID + JSON_ls$`@graph`[[2]]$creator$`@id` <- c(JSON_ls$`@graph`[[2]]$creator$`@id`, + as.character(options("gbif_email"))) + JSON_ls$`@graph`[[2]]$author$`@id` <- c(JSON_ls$`@graph`[[2]]$author$`@id`, + as.character(options("gbif_email"))) JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() - JSON_ls$`@graph`[[2]]$name <- paste("Cleaned GBIF occurrence records for", RankGBIF, species) - JSON_ls$`@graph`[[2]]$keywords <- list("GBIF", "Occurrence", "Biodiversity", "Observation", Mode) - JSON_ls$`@graph`[[2]]$description <- paste(Mode, "input data for", species) + JSON_ls$`@graph`[[2]]$name <- paste("Cleaned GBIF occurrence records for", + RankGBIF, species) + JSON_ls$`@graph`[[2]]$keywords <- list("GBIF", + "Occurrence", + "Biodiversity", + "Observation", + Mode) + JSON_ls$`@graph`[[2]]$description <- paste(Mode, + "input data for", + species) JSON_ls$`@graph`[[3]]$name <- basename(FNAME) JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) 
JSON_ls$`@graph`[[3]]$encodingFormat <- "application/RData" JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[4]]$name <- c(as.character(options("gbif_user")), JSON_ls$`@graph`[[4]]$name) - JSON_ls$`@graph`[[4]]$`@id` <- c(JSON_ls$`@graph`[[4]]$`@id`, as.character(options("gbif_email"))) - JSON_ls$`@graph`[[4]]$`@type` <- c(JSON_ls$`@graph`[[4]]$`@type`, "Organisation") + JSON_ls$`@graph`[[4]]$name <- c(as.character(options("gbif_user")), + JSON_ls$`@graph`[[4]]$name) + JSON_ls$`@graph`[[4]]$`@id` <- c(JSON_ls$`@graph`[[4]]$`@id`, + as.character(options("gbif_email"))) + JSON_ls$`@graph`[[4]]$`@type` <- c(JSON_ls$`@graph`[[4]]$`@type`, + "Organisation") - JSON_ls$`@graph`[[5]]$agent$`@id` <- c(JSON_ls$`@graph`[[5]]$agent$`@id`, as.character(options("gbif_email"))) + JSON_ls$`@graph`[[5]]$agent$`@id` <- c(JSON_ls$`@graph`[[5]]$agent$`@id`, + as.character(options("gbif_email"))) JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://github.com/BioDT/uc-CWR" con <- file(file.path(Dir, paste0(species, ".json"))) @@ -260,13 +302,17 @@ FUN.DownBV <- function( if(!Force & file.exists(FNAME)){ BV_ras <- stack(FNAME) names(BV_ras) <- paste0("BIO", 1:19) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") + message("Data has already been downloaded with these specifications. + It has been loaded from the disk. If you wish to override + the present data, please specify Force = TRUE") return(BV_ras) } ### Raw soil moisture level data ---- - #' We download raw soil moisture data for layers 1 (0-7cm) and 2 (7-28cm) separately. - #' These are then summed up and used in the bioclimatic variable computation of KrigR + #' We download raw soil moisture data for + #' layers 1 (0-7cm) and 2 (7-28cm) separately. 
+ #' These are then summed up and used in the + #' bioclimatic variable computation of KrigR FNAME_RAW <- file.path(Dir, paste(tools::file_path_sans_ext(basename(FNAME)), "volumetric_soil_water_layer_1", "RAW.nc", sep = "_")) if(!file.exists(FNAME_RAW)) { @@ -331,6 +377,17 @@ FUN.DownBV <- function( API_Key = API_Key ) + # ### Masking ---- + # Land_sp <- ne_countries(type = "countries", scale = "medium") + # BV_ras <- crop(BV_ras, extent(Land_sp)) + # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) + # BV_ras <- mask(BV_ras, BV_mask) + # + # ### Saving ---- + # writeRaster(BV_ras, filename = FNAME, format = "CDF", overwrite = TRUE) + # unlink(file.path(Dir, "Qsoil_BC.nc")) + # names(BV_ras) <- paste0("BIO", 1:19) + ### JSON RO-CRATE creation ---- JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") @@ -344,7 +401,8 @@ FUN.DownBV <- function( "ECMWF", "Bioclimatic Variables", "Soil Moisture") - JSON_ls$`@graph`[[2]]$description <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + JSON_ls$`@graph`[[2]]$description <- + "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." JSON_ls$`@graph`[[3]]$name <- basename(FNAME) JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) @@ -356,11 +414,13 @@ FUN.DownBV <- function( con <- file(file.path(Dir, paste0(tools::file_path_sans_ext(basename(FNAME)), ".json"))) - writeLines(jsonlite::toJSON(JSON_ls, - pretty = TRUE), + writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) close(con) + message(paste0("ERA5 citation:\nCopernicus Climate Change Service, Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. 
Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ",
+                 "(Accessed on ", Sys.Date(),")"))
+
   BV_ras
 }
@@ -389,9 +449,15 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk
 soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/"
 #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean
 #' NB! Each global map occupies circa 5 GB! It takes a while to download.
- #' in addition, https://files.isric.org/soilgrids/latest/data/wrb/
+ #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/
 #' has maps of soil types, as estimated probability of occurrence per type.
 #' MostProbable.vrt has the most probable soil type per gridcell.
+ #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512
+ #' Processing HWSD v2 in R (tutorial)
+ #' Technical note Processing the Harmonized World Soil Database (Version 2.0) in R, by David Rossiter https://www.isric.org/sites/default/files/R_HWSD2.pdf
+ #' HWSDv2 SQLite data set (accompanies HWSD v2 in R tutorial)
+ #' This is an SQLite version of HWSD ver. 2.0 for use with the tutorial prepared by David Rossiter.
https://www.isric.org/sites/default/files/HWSD2.sqlite + SoilGrids_variables_in <- c( "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg @@ -425,7 +491,7 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk names(EV_stack) <- c("Nutrient", "Toxicity", SoilGrids_variables) - + ## resample to match a provided raster object's origin and resolution if(!resample_to_match){ EV_stack <- raster::resample(HSWD_PH_stack, # rasters to be resampled diff --git a/capfitogen_master.R b/capfitogen_master.R index f566d4c..fd257d7 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -18,7 +18,7 @@ #' Capfitogen credit: Parra-Quijano et al. 2021, #' https://repositorio.unal.edu.co/handle/unal/85787 #' ####################################################################### # - +#https://github.com/ErikKusch/KrigR/blob/94cdb9e0aa3cff9f996575f4fb3d10c617eb37ab/metadata/reanalysis-era5-land.RData # PREAMBLE ================================================================ set.seed(42) # making things reproducibly random rm(list=ls()) # clean environment @@ -79,10 +79,12 @@ Species_ls <- FUN.DownGBIF( ## Environmental Data ----------------------------------------------------- ##' Bioclomatic data: 19 BioClim variables ##' is each file of each variable >20GB? +##' Will this download Global Multi-resolution Terrain Elevation Data (GMTED2010) as well? +##' Temporal coverage: January 1950 to present ? 
https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview message("Downloading new or loading existing 19 BioClim bioclimatic variables") bioclim_data <- FUN.DownBV( - T_Start = 2000, # what year to begin climatology calculation in - T_End = 2001, # what year to end climatology calculation in + T_Start = 1999, # what year to begin climatology calculation in + T_End = 1999, # what year to end climatology calculation in Dir = Dir.Data.Envir, # where to store the data output on disk Force = FALSE # do not overwrite already present data ) @@ -92,36 +94,9 @@ message("Retrieving edaphic variables") edaphic_data <- FUN.DownEV( Dir = Dir.Data.Envir, Force = FALSE, - resample_to_match = bioclim_data[[1]] + #resample_to_match = bioclim_data[[1]] ) -# PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") -# PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") -# PH_stack <- stack(PH_nutrient, PH_toxicity) -# PH_stack <- raster::resample(PH_stack, # rasters to be resampled -# bioclim_data[[1]]) # raster with parameters to be resampled to -# PH_stack <- stack(PH_stack, bioclim_data$BIO1, bioclim_data$BIO12) -# names(PH_stack) <- c("Nutrient", "Toxicity", "Temperature", "Soil Moisture") - - -#' existing data in "Data/Environment/BV-1985-2015.nc" -#' and soil data in .bil under /soil downloaded from Harmonized World -#' Soil Database version 2.0 -#' -# HJ: this section is not ready, used ready .nc files for testing instead -# Three functions: -# 1. Bioclimatic data: FUN.DownBV -# 2. Edaphic data: FUN.DownEV -# 3. Geophysical data: FUN.DownGV - -# message("Retrieving environmental data") -# -# # Bioclimatic data -# -# message("Retrieving bioclimatic data") -# -# # HJ: bioclimatic data lines from ModGP. 
TO DO: adapt to Capfitogen -# # biocl_ras <- FUN.DownBV(T_Start = 1985, # what year to begin climatology calculation in # T_End = 2015, # what year to end climatology calculation in # Dir = Dir.Data.Envir, # where to store the data output on disk From e1f8d4c0ab7748b503a554e411fab1ce9664b2b4 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 6 Feb 2025 17:58:21 +0100 Subject: [PATCH 11/72] update edaphic data download function (still draft) --- R_scripts/SHARED-Data.R | 67 +++++++++++++++++++++++++++++------------ capfitogen_master.R | 49 ++++++++++++------------------ 2 files changed, 67 insertions(+), 49 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 89897dd..f54d419 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -30,7 +30,7 @@ FUN.DownGBIF <- function( FNAME <- file.path(Dir, paste0(species, ".RData")) if(!Force & file.exists(FNAME)){ save_ls <- loadObj(FNAME) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") + message("Data has already been downloaded with these specifications. It has been loaded from the disk. \nIf you wish to override the present data, please specify Force = TRUE") return(save_ls) } @@ -371,12 +371,15 @@ FUN.DownBV <- function( Water_Type = "monthly_averaged_reanalysis", Y_start = T_Start, Y_end = T_End, Extent = ne_countries(type = "countries", scale = "medium")[,1], - Dir = Dir, FileName = basename(FNAME), - FileExtension = ".nc", Compression = 9, # file storing + Dir = Dir, + FileName = basename(FNAME), + FileExtension = ".nc", + Compression = 9, API_User = API_User, API_Key = API_Key ) + # Not sure if this should be in the code or not. 
# ### Masking ---- # Land_sp <- ne_countries(type = "countries", scale = "medium") # BV_ras <- crop(BV_ras, extent(Land_sp)) @@ -419,7 +422,7 @@ FUN.DownBV <- function( close(con) message(paste0("ERA5 citation:\nCopernicus Climate Change Service, Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ", - "(Accesed on ", Sys.Date(),")")) + "(Accessed on ", Sys.Date(),")")) BV_ras } @@ -445,7 +448,7 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk # if the file doesn't already exist: ## downloading data from SoilGrids if(!file.exists(FNAME)){ - message("Start downloading data from SoilGrids") + message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean #' NB! Each global map occupies circa 5 GB! It takes a while to download. 
@@ -470,18 +473,46 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk "ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ "ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha "soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg + SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - soilGrids_data <- stack() - for (i in length(SoilGrids_variables_in)) { - variable_name = SoilGrids_variables[] - soilGrids_data[[i]] <- gdal_translate( - src_dataset = paste0(soilGrids_url, SoilGrids_variables_in[i], ".vrt"), # input to be downloaded - dst_dataset = paste0(Dir.Data.Envir, "/", SoilGrids_variables[i], ".tif"), # output file - tr = c(2500,2500) # target resolution + + soilGrids_data <- raster::stack() + + for (i in 1:length(SoilGrids_variables_in)) { + + variable_name = SoilGrids_variables[i] + + message(SoilGrids_variables[i]) + + downloaded_variable <- gdal_translate( + # input to be downloaded + src_dataset = paste0(soilGrids_url, SoilGrids_variables_in[i], ".vrt"), + # output file + dst_dataset = paste0(Dir.Data.Envir, "/", SoilGrids_variables[i], ".tif"), + # target resolution + tr = c(2500, 2500) ) + + downloaded_raster <- raster(downloaded_variable) + + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- raster(resample_to_match) + + ## project SoilGrids raster to match resample_to_match file + projection_to_match <- proj4string(resample_to_match) + raster::projection(downloaded_raster) <- projection_to_match + + ## resample + downloaded_raster <- raster::resample(downloaded_raster, # raster to be resampled + resample_to_match) # raster with parameters to be resampled to + } + + soilGrids_data[i] <- downloaded_raster } ## downloading data from HSWD - message("Start downloading data from HSWD (harmonised world soil database)") + 
message("Downloading data from HSWD (harmonised world soil database) via fao.org") PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") HSWD_PH_stack <- stack(PH_nutrient, PH_toxicity) @@ -491,16 +522,12 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk names(EV_stack) <- c("Nutrient", "Toxicity", SoilGrids_variables) - - ## resample to match a provided raster object's origin and resolution - if(!resample_to_match){ - EV_stack <- raster::resample(HSWD_PH_stack, # rasters to be resampled - resample_to_match) # raster with parameters to be resampled to - } } ### Saving ---- - terra::writeCDF(EV_stack, filename = FNAME, overwrite = FALSE) + terra::writeCDF(EV_stack, + filename = FNAME, + overwrite = FALSE) #unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) EV_stack diff --git a/capfitogen_master.R b/capfitogen_master.R index fd257d7..7298fa8 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -77,7 +77,9 @@ Species_ls <- FUN.DownGBIF( ) ## Environmental Data ----------------------------------------------------- -##' Bioclomatic data: 19 BioClim variables +### Bioclomatic data ------ +##' 19 BioClim variables +##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) ##' is each file of each variable >20GB? ##' Will this download Global Multi-resolution Terrain Elevation Data (GMTED2010) as well? ##' Temporal coverage: January 1950 to present ? 
https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview @@ -89,32 +91,10 @@ bioclim_data <- FUN.DownBV( Force = FALSE # do not overwrite already present data ) -## Edaphic data: -message("Retrieving edaphic variables") -edaphic_data <- FUN.DownEV( - Dir = Dir.Data.Envir, - Force = FALSE, - #resample_to_match = bioclim_data[[1]] -) - -# biocl_ras <- FUN.DownBV(T_Start = 1985, # what year to begin climatology calculation in -# T_End = 2015, # what year to end climatology calculation in -# Dir = Dir.Data.Envir, # where to store the data output on disk -# Force = FALSE # do not overwrite already present data -# ) -# -# # Edaphic data - -# edaph_ras <- FUN.DownEV("soc", "silt", "sand") # TO DO - -#geophy_ras <- FUN.DownGV( ) # TO DO - -## read variables -------------------------------------------------------- -bioclim_ras <- terra::rast(file.path(Dir.Data.Envir, - "BV_1985-2015.nc")) -bioclim_ras <- terra::project(bioclim_ras, - "EPSG:4326") # WGS84; World Geodetic System 1984 -BioClim_names <- c( ## BioClim variable names, see https://www.worldclim.org/data/bioclim.html +bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) +#bioclim_variables <- terra::project(bioclim_variables, "EPSG:4326") # WGS84 +BioClim_names <- c( + ## BioClim variable names, see https://www.worldclim.org/data/bioclim.html "BIO1_Annual_Mean_Temperature", "BIO2_Mean_Diurnal_Range", "BIO3_Isothermality", @@ -134,8 +114,19 @@ BioClim_names <- c( ## BioClim variable names, see https://www.worldclim.org/dat "BIO17_Precipitation_of_Driest_Quarter", "BIO18_Precipitation_of_Warmest_Quarter", "BIO19_Precipitation_of_Coldest_Quarter") -names(bioclim_ras) <- BioClim_names +names(bioclim_variables) <- BioClim_names + +### Edaphic data ------ +message("Downloading new or loading existing edaphic variables") +source(file.path(Dir.R_scripts, "SHARED-Data.R")) + +edaphic_data <- FUN.DownEV( + Dir = Dir.Data.Envir, + Force = FALSE, + resample_to_match = 
bioclim_variables[[1]] +) +### Geophysical data ------ geophys_ras <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) @@ -180,7 +171,7 @@ source(file.path(Dir.R_scripts, "VarSelection.R")) # complete HJs version, or im message("Selecting variables") bioclim_ext <- FUN.VarSelection(specdata = Species_ls$occs, #occ_ls, # - varstack = bioclim_ras) + varstack = bioclim_variables) # buf = 2 # HJ: buffer doesn't work properly with terra, gets stuck? From 223aed537b890052dc3c871690824399804881a2 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 14 Feb 2025 09:50:58 +0100 Subject: [PATCH 12/72] update SoilGrids downloads --- R_scripts/ModGP-commonlines.R | 2 +- R_scripts/SHARED-Data.R | 102 ++++++++++++++++++++++------------ capfitogen_master.R | 8 ++- 3 files changed, 72 insertions(+), 40 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index c120d7a..79ef92e 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -66,7 +66,7 @@ sapply(package_vec, install.load.package) ### NON-CRAN PACKAGES ---- # check if KrigR is missing or outdated -if(packageVersion("KrigR") < "0.9.1" || +if(packageVersion("KrigR") < "0.9.5" || "KrigR" %in% rownames(installed.packages()) == FALSE) { message("installing KrigR from github.com/ErikKusch/KrigR") devtools::install_github("https://github.com/ErikKusch/KrigR", diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index f54d419..f647b12 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -297,6 +297,12 @@ FUN.DownBV <- function( Dir = getwd(), # where to store the data output on disk Force = FALSE # do not overwrite already present data ){ + + # Workaround for Dir, 1/2 + # original_wd = getwd() + # setwd(Dir) + # End of workaround + FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) if(!Force & file.exists(FNAME)){ @@ -424,6 +430,10 @@ FUN.DownBV <- function( message(paste0("ERA5 citation:\nCopernicus Climate Change Service, 
Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ", "(Accessed on ", Sys.Date(),")")) + # Workaround for Dir, 2/2 +# setwd(original_wd) + # End of workaround + BV_ras } @@ -451,32 +461,30 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean - #' NB! Each global map occupies circa 5 GB! It takes a while to download. + #' NB! Each global map occupies circa 20 GB for 250x20m resolution! + #' It takes a while to download. #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ #' has maps of soil types, as estimated probability of occurrence per type. #' MostProbable.vrt has the most probable soil type per gridcell. #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 - #' Processing HWSD v2 in R (tutorial) - #' Technical note Processing the Harmonized World Soil Database (Version 2.0) in R, by David Rossiter https://www.isric.org/sites/default/files/R_HWSD2.pdf - #' HWSDv2 SQLite data set (accompanies HWSD v2 in R tutorial) - #' This is an SQLite version of HWSD ver. 2.0 for use with the tutorial prepared by David Rossiter. 
https://www.isric.org/sites/default/files/HWSD2.sqlite - + SoilGrids_variables_in <- c( - "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ + "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg - "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg - "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg - "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg - "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 - "ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ - "ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha - "soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg + "silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + #"clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + #"sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + #"nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg + #"phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 + #"ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ + #"ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha + #"soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - soilGrids_data <- raster::stack() + #soilGrids_data <- raster::stack() # try list instead, rasters have different extents + soilGrids_data <- list() for (i in 1:length(SoilGrids_variables_in)) { @@ -484,15 +492,23 @@ FUN.DownEV <- function(Dir = getwd(), # where to 
store the data output on disk message(SoilGrids_variables[i]) - downloaded_variable <- gdal_translate( - # input to be downloaded - src_dataset = paste0(soilGrids_url, SoilGrids_variables_in[i], ".vrt"), - # output file - dst_dataset = paste0(Dir.Data.Envir, "/", SoilGrids_variables[i], ".tif"), - # target resolution - tr = c(2500, 2500) + path_to_downloaded_file <- paste0(Dir.Data.Envir, "/", + SoilGrids_variables[i], ".tif") + + # if variable is not downloaded already, ... + ifelse(!file.exists(path_to_downloaded_file), + # download it, ... + downloaded_variable <- gdalUtilities::gdal_translate( + src_dataset = paste0(soilGrids_url, + SoilGrids_variables_in[i], ".vrt"), + dst_dataset = path_to_downloaded_file, + tr = c(250, 250) # target resolution + ), + # or else load it from file + downloaded_variable <- path_to_downloaded_file ) + ## load variable as raster downloaded_raster <- raster(downloaded_variable) ## if provided, resample to match another raster object's origin and resolution @@ -510,25 +526,39 @@ FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk } soilGrids_data[i] <- downloaded_raster + message(extent(soilGrids_data[[i]])) } - ## downloading data from HSWD - message("Downloading data from HSWD (harmonised world soil database) via fao.org") - PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") - PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") - HSWD_PH_stack <- stack(PH_nutrient, PH_toxicity) + # ## downloading data from HSWD + #' Technical note Processing the Harmonized World Soil Database (Version 2.0) in R, by David Rossiter https://www.isric.org/sites/default/files/R_HWSD2.pdf + #' HWSDv2 SQLite data set (accompanies HWSD v2 in R tutorial) + #' This is an SQLite version of HWSD ver. 2.0 for use with the tutorial prepared by David Rossiter. 
https://www.isric.org/sites/default/files/HWSD2.sqlite + + # message("Downloading data from HSWD (harmonised world soil database) via fao.org") + # PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") + # PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") + # #HSWD_PH_stack <- stack(PH_nutrient, PH_toxicity) + # HSWD_PH_stack <- list(PH_nutrient, PH_toxicity) + # + # ## combine and rename rasters + # EV_stack <- #stack(HSWD_PH_stack, soilGrids_data) + # c(HSWD_PH_stack, soilGrids_data) + # names(EV_stack) <- c("Nutrient", + # "Toxicity", + # SoilGrids_variables) - ## combine and rename rasters - EV_stack <- stack(HSWD_PH_stack, soilGrids_data) - names(EV_stack) <- c("Nutrient", - "Toxicity", - SoilGrids_variables) + # EV_stack <- stack(unlist(soilGrids_data)) # stacking not possible with rasters of different extent + # names(EV_stack) <- SoilGrids_variables } ### Saving ---- - terra::writeCDF(EV_stack, - filename = FNAME, + message("save as RData") + saveRDS(soilGrids_data, + paste0(Dir.Data.Envir, "/edaphic_data_temp.RData")) + + message("try saving as nc") + terra::writeCDF(EV_stack, + filename = FNAME, overwrite = FALSE) - #unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) EV_stack diff --git a/capfitogen_master.R b/capfitogen_master.R index 7298fa8..92b4621 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -91,6 +91,7 @@ bioclim_data <- FUN.DownBV( Force = FALSE # do not overwrite already present data ) +#test <- terra::rast("Data/Environment/TEMP_2m_temperature_00001") bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) #bioclim_variables <- terra::project(bioclim_variables, "EPSG:4326") # WGS84 BioClim_names <- c( @@ -117,13 +118,14 @@ BioClim_names <- c( names(bioclim_variables) <- BioClim_names ### Edaphic data ------ -message("Downloading new or loading existing edaphic variables") +##' 
NB! each file at 250x250m is ~20GB... +message("Downloading new or loading existing edaphic/soil variables") source(file.path(Dir.R_scripts, "SHARED-Data.R")) edaphic_data <- FUN.DownEV( Dir = Dir.Data.Envir, - Force = FALSE, - resample_to_match = bioclim_variables[[1]] + Force = FALSE#, + #resample_to_match = bioclim_variables[[1]] ) ### Geophysical data ------ From 7ca8146e9f13aa5ce0168436f7e0629e863bdcf0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 17 Feb 2025 10:29:35 +0100 Subject: [PATCH 13/72] first functioning edaphic download function --- R_scripts/ModGP-commonlines.R | 2 +- R_scripts/SHARED-Data.R | 291 +++++++++++++++++----------------- capfitogen_master.R | 9 +- 3 files changed, 152 insertions(+), 150 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 79ef92e..ccaf0ed 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -66,7 +66,7 @@ sapply(package_vec, install.load.package) ### NON-CRAN PACKAGES ---- # check if KrigR is missing or outdated -if(packageVersion("KrigR") < "0.9.5" || +if(packageVersion("KrigR") < "0.9.6.1" || "KrigR" %in% rownames(installed.packages()) == FALSE) { message("installing KrigR from github.com/ErikKusch/KrigR") devtools::install_github("https://github.com/ErikKusch/KrigR", diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index f647b12..6524237 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -438,172 +438,175 @@ FUN.DownBV <- function( } ## EDAPHIC DATA DOWNLOAD ------------------------------------------------------ -# HJ: new part, INCOMPLETE! 
-# EL: data also need to be changed to get the same resolution as the BioClim -# variables downloaded with FUN.DownBV - -FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk - Force = FALSE, # do not overwrite already present data, - resample_to_match = FALSE){ - # define a file name - FNAME <- file.path(Dir, "edaphic.nc") - - # check if file already exists and whether to overwrite - if(!Force & file.exists(FNAME)){ - EV_ras <- stack(FNAME) - #names(EV_ras) <- paste0("BIO", 1:19) # replace with edaphic names vector - message("Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") - return(EV_ras) - } - # if the file doesn't already exist: - ## downloading data from SoilGrids - if(!file.exists(FNAME)){ - message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") - soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" - #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean - #' NB! Each global map occupies circa 20 GB for 250x20m resolution! - #' It takes a while to download. - #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ - #' has maps of soil types, as estimated probability of occurrence per type. - #' MostProbable.vrt has the most probable soil type per gridcell. 
- #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 - - SoilGrids_variables_in <- c( - "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ - "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg - "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - "silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg - #"clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg - #"sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg - #"nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg - #"phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 - #"ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ - #"ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha - #"soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg +# INCOMPLETE! Works for some variables, but the data set is incomplete. +FUN.DownEV <- + function(Dir = getwd(), # where to store the data output on disk + Force = FALSE, # do not overwrite already present data, + resample_to_match = FALSE) { + # define a file name + FNAME <- file.path(Dir, "edaphic.nc") - SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - - #soilGrids_data <- raster::stack() # try list instead, rasters have different extents - soilGrids_data <- list() - - for (i in 1:length(SoilGrids_variables_in)) { + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + EV_ras <- stack(FNAME) + message( + "Data has already been downloaded with these specifications. It has been loaded from the disk. 
If you wish to override the present data, please specify Force = TRUE" + ) + return(EV_ras) + } + # if the file doesn't already exist: + if (!file.exists(FNAME)) { + ## downloading data from SoilGrids + message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") + soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" - variable_name = SoilGrids_variables[i] + #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean + #' NB! Each global map occupies circa 20 GB for 250x20m resolution! + #' It takes a while to download. + #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ + #' has maps of soil types, as estimated probability of occurrence per type. + #' MostProbable.vrt has the most probable soil type per gridcell. + #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 - message(SoilGrids_variables[i]) + SoilGrids_variables_in <- c( + "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ + "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg + "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + "silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + #"clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + #"sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + #"nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg + #"phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 + #"ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ + #"ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha + #"soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg - path_to_downloaded_file <- 
paste0(Dir.Data.Envir, "/", - SoilGrids_variables[i], ".tif") + SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - # if variable is not downloaded already, ... - ifelse(!file.exists(path_to_downloaded_file), - # download it, ... - downloaded_variable <- gdalUtilities::gdal_translate( - src_dataset = paste0(soilGrids_url, - SoilGrids_variables_in[i], ".vrt"), - dst_dataset = path_to_downloaded_file, - tr = c(250, 250) # target resolution - ), - # or else load it from file - downloaded_variable <- path_to_downloaded_file - ) + soilGrids_data <- list() + + for (i in 1:length(SoilGrids_variables_in)) { + variable_name = SoilGrids_variables[i] + + message(SoilGrids_variables[i]) + + path_to_downloaded_file <- paste0(Dir.Data.Envir, "/", + SoilGrids_variables[i], ".tif") + + # if variable is not downloaded already, ... + ifelse( + !file.exists(path_to_downloaded_file), + # download it, ... + downloaded_variable <- gdalUtilities::gdal_translate( + src_dataset = paste0(soilGrids_url, + SoilGrids_variables_in[i], ".vrt"), + dst_dataset = path_to_downloaded_file, + tr = c(250, 250) # target resolution + ), + # or else load it from file + downloaded_variable <- path_to_downloaded_file + ) + + ## load variable as raster + downloaded_raster <- rast(downloaded_variable) + + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(downloaded_raster) <- projection_to_match + + ## resample + downloaded_raster <- + terra::resample(downloaded_raster, + resample_to_match) + + } + + soilGrids_data[i] <- downloaded_raster + + } + + ## download additional rasters from HSWD + message("Downloading data from HSWD (harmonised world soil database) via fao.org") + + 
PH_nutrient <- + rast("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") - ## load variable as raster - downloaded_raster <- raster(downloaded_variable) + PH_toxicity <- + rast("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") ## if provided, resample to match another raster object's origin and resolution if (!missing(resample_to_match)) { message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- raster(resample_to_match) - - ## project SoilGrids raster to match resample_to_match file - projection_to_match <- proj4string(resample_to_match) - raster::projection(downloaded_raster) <- projection_to_match + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(PH_nutrient) <- projection_to_match + terra::crs(PH_toxicity) <- projection_to_match ## resample - downloaded_raster <- raster::resample(downloaded_raster, # raster to be resampled - resample_to_match) # raster with parameters to be resampled to - } + PH_nutrient <- + terra::resample(PH_nutrient, + resample_to_match) + + PH_toxicity <- + terra::resample(PH_toxicity, + resample_to_match) + + ## combine and rename rasters + EV_rasters <- rast(c(soilGrids_data, PH_nutrient, PH_toxicity)) + names(EV_rasters) <- c(SoilGrids_variables, "Nutrient", "Toxicity") - soilGrids_data[i] <- downloaded_raster - message(extent(soilGrids_data[[i]])) } - # ## downloading data from HSWD - #' Technical note Processing the Harmonized World Soil Database (Version 2.0) in R, by David Rossiter https://www.isric.org/sites/default/files/R_HWSD2.pdf - #' HWSDv2 SQLite data set (accompanies HWSD v2 in R tutorial) - #' This is an SQLite version of HWSD ver. 2.0 for use with the tutorial prepared by David Rossiter. 
https://www.isric.org/sites/default/files/HWSD2.sqlite - # message("Downloading data from HSWD (harmonised world soil database) via fao.org") - # PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") - # PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") - # #HSWD_PH_stack <- stack(PH_nutrient, PH_toxicity) - # HSWD_PH_stack <- list(PH_nutrient, PH_toxicity) - # - # ## combine and rename rasters - # EV_stack <- #stack(HSWD_PH_stack, soilGrids_data) - # c(HSWD_PH_stack, soilGrids_data) - # names(EV_stack) <- c("Nutrient", - # "Toxicity", - # SoilGrids_variables) + ### Saving ---- + message(paste0("saving as netCDF:", FNAME)) + terra::writeCDF(EV_rasters, + filename = FNAME, + overwrite = FALSE) + + EV_rasters - # EV_stack <- stack(unlist(soilGrids_data)) # stacking not possible with rasters of different extent - # names(EV_stack) <- SoilGrids_variables } - - ### Saving ---- - message("save as RData") - saveRDS(soilGrids_data, - paste0(Dir.Data.Envir, "/edaphic_data_temp.RData")) - - message("try saving as nc") - terra::writeCDF(EV_stack, - filename = FNAME, - overwrite = FALSE) - - EV_stack - } -### Masking ---- -# whole world -#Land_sp <- ne_countries(type = "countries", scale = "medium") - -# HJ: for testing/ to match previous Capfitogen tests: only Spain -# HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) -# please switch back to KrigR is wanted/needed -# Land_sp <- ne_states("Spain") -# edaph_ras <- crop(edaph_ras, terra::ext(Land_sp)) -# edaph_ras <- terra::mask(edaph_ras, vect(Land_sp)) -# BV_ras <- crop(BV_ras, extent(Land_sp)) -# BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) -# BV_ras <- mask(BV_ras, BV_mask) +## GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- +FUN.DownGV <- + function(Dir = getwd(), # where to store the data output 
on disk + Force = FALSE, # do not overwrite already present data, + resample_to_match = FALSE){ + + # define a file name + FNAME <- file.path(Dir, "geophysical.nc") + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + GV_ras <- stack(FNAME) + message( + "Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + ) + return(GV_ras) + } + + # if the file doesn't already exist: + if (!file.exists(FNAME)) { + + + } + -# HJ: missing: data source, download function for .nc files, where to set the selected variables -FUN.DownGV <- function(arg1, arg2){ - - FNAME <- file.path(Dir.Data.Envir, "geophys.nc") - - evarg <- c(arg1, arg2) - geophys_ras <- ??? file.path(Dir.Data.Envir, "Geophysical") - - # whole world - #Land_sp <- ne_countries(type = "countries", scale = "medium") - # HJ: for testing/ to match previous Capfitogen tests: only Spain - # HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) - # please switch back to KrigR is wanted/needed - Land_sp <- ne_states("Spain") - geophys_ras <- crop(geophys_ras, terra::ext(Land_sp)) - geophys_ras <- terra::mask(geophys_ras, vect(Land_sp)) - - # BV_ras <- crop(BV_ras, extent(Land_sp)) - # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) - # BV_ras <- mask(BV_ras, BV_mask) ### Saving ---- - terra::writeCDF(geophys_ras, filename = FNAME, overwrite = TRUE) - unlink(file.path(Dir.Data.Envir, "Geophysical")) - - geophys_ras + terra::writeCDF(geophysical_raster, + filename = FNAME, + overwrite = TRUE) + + geophysical_raster } \ No newline at end of file diff --git a/capfitogen_master.R b/capfitogen_master.R index 92b4621..0061f87 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -118,14 +118,13 @@ BioClim_names <- c( names(bioclim_variables) <- BioClim_names ### Edaphic data ------ -##' NB! 
each file at 250x250m is ~20GB... +## NB! each file at 250x250m is ~20GB... message("Downloading new or loading existing edaphic/soil variables") -source(file.path(Dir.R_scripts, "SHARED-Data.R")) -edaphic_data <- FUN.DownEV( +edaphic_variables <- FUN.DownEV( Dir = Dir.Data.Envir, - Force = FALSE#, - #resample_to_match = bioclim_variables[[1]] + Force = TRUE, + resample_to_match = bioclim_variables[[1]] ) ### Geophysical data ------ From 5915d4faeee4c9aba867c042ad4c8aab48b48c1c Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 17 Feb 2025 20:12:20 +0100 Subject: [PATCH 14/72] draft download from google EE - not working --- R_scripts/SHARED-Data.R | 61 +++++++++++++++++++++++++++++++++++++++-- capfitogen_master.R | 45 +++++++++++++----------------- 2 files changed, 78 insertions(+), 28 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 6524237..8b2d2b0 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -595,11 +595,22 @@ FUN.DownGV <- # if the file doesn't already exist: if (!file.exists(FNAME)) { + ## Download digital elevation model (DEM) from + ##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled + ##' seamless SRTM data V4, International Centre for Tropical Agriculture + ##' (CIAT), available from http://srtm.csi.cgiar.org. + #dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") + + ## Download CHILI: Continuous Heat-Insolation Load Index + ##' Theobald, D. M., Harrison-Atlas, D., Monahan, W. B., & Albano, C. M. + ##' (2015). Ecologically-relevant maps of landforms and physiographic + ##' diversity for climate adaptation planning. 
PloS one, 10(12), e0143619 + insolation_index <- rast("") } - + ### Saving ---- @@ -609,4 +620,50 @@ FUN.DownGV <- geophysical_raster -} \ No newline at end of file + } + +# WGS84 = EPSG:4326 + +install.packages("reticulate") # python environment - https://rstudio.github.io/reticulate/ +install.packages("rgeedim") # search and download Google Earth Engine imagery with Python + +library(reticulate) + +virtualenv_create(envname = "uc_CWR", # saved under /Documents/.virtualenvs/uc_CWR + packages = c("numpy","geedim"), + python = "C:/Program Files/Python3.10/python.exe" +) + +virtualenv_list() + +use_virtualenv("uc_CWR") + +library(rgeedim) + +names(geedim()$enums) + +chili_img_id <- gd_image_from_id('CSP/ERGo/1_0/Global/ALOS_CHILI') + +chili <- + gd_download(chili_img_id, + filename = 'chili.tif', + resampling = "bilinear", + scale = 2500, # scale=10: request ~10m resolution + overwrite = TRUE, + silent = FALSE + ) + + +x <- 'CSP/ERGo/1_0/US/CHILI' |> + gd_image_from_id() |> + gd_download( + filename = 'image.tif', + region = r, + crs = "EPSG:5070", + resampling = "bilinear", + scale = 1000, # scale=10: request ~10m resolution + overwrite = TRUE, + silent = FALSE + ) + + diff --git a/capfitogen_master.R b/capfitogen_master.R index 0061f87..584a1f9 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -129,20 +129,15 @@ edaphic_variables <- FUN.DownEV( ### Geophysical data ------ -geophys_ras <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) +geophysical_variables <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) -# geophys_ras <- terra::project(geophys_ras, "EPSG:4326") -# names(geophys_ras) <- geophysv -# edaph_ras <- terra::rast(file.path(Dir.Data.Envir, "edaph.nc")) -# names(edaph_ras) <- edaphv - - -# + # CAPFITOGEN pipeline ========================================================= ## Parameters ----------------------------------------------------------------- -## copied and shortened from CAPFITOGEN's "Parameters_SelecVar.R" script. 
-# ruta <- "C:/CAPFITOGEN3" # replace with other paths +## copied and shortened from CAPFITOGEN scripts. +## TO to: DELETE UNNECESSARY PARAMS + extent <- pais <- "World" pasaporte <- file.path(Dir.Data.GBIF, "filename") # species observations - enter GBIF data file, check if column names work geoqual <- FALSE # ? @@ -165,27 +160,25 @@ ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in t resultados <- Dir.Results # directory to place results -## Variable selection: SelecVar ------------------------------------------------ -#' run variable selection (script VarSelection.R for each category of environmental variables): -#' -source(file.path(Dir.R_scripts, "VarSelection.R")) # complete HJs version, or implement original CAPFITOGEN solution with some additional code before/after? -message("Selecting variables") -bioclim_ext <- FUN.VarSelection(specdata = Species_ls$occs, #occ_ls, # - varstack = bioclim_variables) - # buf = 2 # HJ: buffer doesn't work properly with terra, gets stuck? + +## Variable selection --------------------------------------------------------- +# run variable selection based on variable inflation factor usdm::vif + +all_predictors <- c(bioclim_variables, edaphic_variables)#, geophysical_variables + +predictors <- + vifcor( + bioclim_variables,# replace with either BV, EV, GV to run separately per type + th = 0.9, # threshold of correlation + keep = NULL, # if wanted, list variables to keep no matter what + size = 1000, # subset size in case of big data (default 5000) + method = "pearson" # 'pearson','kendall','spearman' + ) -geophys_ext <- FUN.VarSelection(specdata = Species_ls$occs, #occ_ls - varstack = geophys_ras) - # buf = 2 # HJ: buffer doesn't work properly with terra, gets stuck? -edaph_ext <- FUN.VarSelection(specdata = Species_ls$occs, #occ_ls - varstack = edaph_ras - # buf = 2 # HJ: buffer doesn't work properly with terra, gets stuck? 
- ) -#results <- "results/SelectVar" ## Clustering and map creation: ELCmapas --------------------------------------- message("Clustering and creating maps") From 3ce70d287f3258cfc9c17dcafeed28d523d74be0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 18 Feb 2025 17:52:49 +0100 Subject: [PATCH 15/72] start setting up ELCmaps --- .gitignore | 1 + README.md | 16 ++--- R_scripts/SHARED-Data.R | 155 ++++++++++++++++++++++++---------------- capfitogen_master.R | 80 ++++++++++++++------- 4 files changed, 156 insertions(+), 96 deletions(-) diff --git a/.gitignore b/.gitignore index e14151f..5e8e541 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ Logo/* *.out # Downloaded data +R_scripts/ELCmapas.R CAPFITOGEN3/ scripts/ hq diff --git a/README.md b/README.md index 15b9301..a7a0f55 100644 --- a/README.md +++ b/README.md @@ -23,20 +23,14 @@ sbatch submit_modgp_exec_lumi_HQ.sh Lathyrus -## CAPFITOGEN demo +## CAPFITOGEN -See [documentation](https://www.capfitogen.net/en). +As an addition to ModGP, you can run two of Capfitogen's most useful tools: ELC maps and Complementa maps to visualise overlap with protected areas. -1. Download `CAPFITOGEN3.zip` from - [here](https://drive.google.com/file/d/1EJw-XcC1NRVFS7mwzlg1VpQBpRCdfWRd/view?usp=sharing) - and extract it to the project root. +- To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. -2. Download `rdatamaps/world/20x20` directory from - [here](https://drive.google.com/drive/folders/19bqG_Z3aFhzrCWQp1yWvMbsLivsCicHh) - and extract it to `CAPFITOGEN3/rdatamaps/world/20x20`. - -3. 
Run on LUMI: obtain interactive session: +- To run on LUMI: obtain interactive session: `srun -p small --nodes=1 --ntasks-per-node=1 --mem=8G -t 4:00:00 --pty bash` and execute the workflow: - `singularity run --bind $PWD cwr_0.2.0.sif capfitogen.R` + `singularity run --bind $PWD cwr_0.2.0.sif capfitogen_master.R` diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 8b2d2b0..a7d7949 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -576,11 +576,10 @@ FUN.DownEV <- } ## GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- -FUN.DownGV <- - function(Dir = getwd(), # where to store the data output on disk - Force = FALSE, # do not overwrite already present data, - resample_to_match = FALSE){ - +FUN.DownGV <- + function(Dir = getwd(),# where to store the data output on disk + Force = FALSE,# do not overwrite already present data, + resample_to_match = FALSE) { # define a file name FNAME <- file.path(Dir, "geophysical.nc") @@ -595,75 +594,109 @@ FUN.DownGV <- # if the file doesn't already exist: if (!file.exists(FNAME)) { - ## Download digital elevation model (DEM) from - ##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled - ##' seamless SRTM data V4, International Centre for Tropical Agriculture - ##' (CIAT), available from http://srtm.csi.cgiar.org. - #dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") - ## Download CHILI: Continuous Heat-Insolation Load Index - ##' Theobald, D. M., Harrison-Atlas, D., Monahan, W. B., & Albano, C. M. - ##' (2015). Ecologically-relevant maps of landforms and physiographic - ##' diversity for climate adaptation planning. PloS one, 10(12), e0143619 - insolation_index <- rast("") + message("downloading geophysical data") + + geophysical_data <- list() + + ## Download digital elevation model (DEM) ------ + ##' Fick, S.E. and R.J. Hijmans, 2017. 
WorldClim 2 + ##' https://doi.org/10.1002/joc.5086 + worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" + temp <- tempfile() + download.file(worldclim_dem_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) + + + ## resample ------ + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(dem) <- projection_to_match + + ## resample + dem <- terra::resample(dem, + resample_to_match) + + + } + geophysical_data[1] <- dem } - - - ### Saving ---- - terra::writeCDF(geophysical_raster, - filename = FNAME, - overwrite = TRUE) - - geophysical_raster - + ## combine and rename rasters + geophysical_rasters <- rast(geophysical_data) + + ### Saving ---- + terra::writeCDF(geophysical_rasters, + filename = FNAME, + overwrite = TRUE) + + geophysical_rasters + } -# WGS84 = EPSG:4326 - -install.packages("reticulate") # python environment - https://rstudio.github.io/reticulate/ -install.packages("rgeedim") # search and download Google Earth Engine imagery with Python - -library(reticulate) -virtualenv_create(envname = "uc_CWR", # saved under /Documents/.virtualenvs/uc_CWR - packages = c("numpy","geedim"), - python = "C:/Program Files/Python3.10/python.exe" -) -virtualenv_list() -use_virtualenv("uc_CWR") -library(rgeedim) - -names(geedim()$enums) - -chili_img_id <- gd_image_from_id('CSP/ERGo/1_0/Global/ALOS_CHILI') - -chili <- - gd_download(chili_img_id, - filename = 'chili.tif', - resampling = "bilinear", - scale = 2500, # scale=10: request ~10m resolution - overwrite = TRUE, - silent = FALSE - ) +# WGS84 = EPSG:4326 +## Download digital elevation model (DEM) from 
+##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled +##' seamless SRTM data V4, International Centre for Tropical Agriculture +##' (CIAT), available from http://srtm.csi.cgiar.org. +#dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") +### DRAFT: Google Earth Engine downloads. ------------------------------------- +#' Almost working, but missing user/project credentials and login. +#' See https://developers.google.com/earth-engine/guides/auth +#' ee.Authenticate() +#' ee.Initialize(project='my-project') +# +# install.packages("reticulate") # python environment - https://rstudio.github.io/reticulate/ +# install.packages("rgeedim") # search and download Google Earth Engine imagery with Python +# +# library(reticulate) +# +# virtualenv_create(envname = "uc_CWR", # saved under /Documents/.virtualenvs/uc_CWR +# packages = c("numpy","geedim"), +# python = "C:/Program Files/Python3.10/python.exe" +# ) +# +# virtualenv_list() +# +# use_virtualenv("uc_CWR") +# +# library(rgeedim) +# +# names(geedim()$enums) +# +# +# ## Download CHILI: Continuous Heat-Insolation Load Index +# ##' Theobald, D. M., Harrison-Atlas, D., Monahan, W. B., & Albano, C. M. +# ##' (2015). Ecologically-relevant maps of landforms and physiographic +# ##' diversity for climate adaptation planning. 
PloS one, 10(12), e0143619 +# +# chili_img_id <- gd_image_from_id('CSP/ERGo/1_0/Global/ALOS_CHILI') +# +# chili <- +# gd_download(chili_img_id, +# filename = 'chili.tif', +# resampling = "bilinear", +# scale = 2500, # scale=10: request ~10m resolution +# overwrite = TRUE, +# silent = FALSE +# ) +# -x <- 'CSP/ERGo/1_0/US/CHILI' |> - gd_image_from_id() |> - gd_download( - filename = 'image.tif', - region = r, - crs = "EPSG:5070", - resampling = "bilinear", - scale = 1000, # scale=10: request ~10m resolution - overwrite = TRUE, - silent = FALSE - ) diff --git a/capfitogen_master.R b/capfitogen_master.R index 584a1f9..79e50eb 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -129,43 +129,58 @@ edaphic_variables <- FUN.DownEV( ### Geophysical data ------ -geophysical_variables <- terra::rast(file.path(Dir.Data.Envir, "geophys.nc")) - - +geophysical_variables <- FUN.DownGV( + Dir = Dir.Data.Envir, + Force = FALSE, + resample_to_match = bioclim_variables[[1]] +) + +terra::res(geophysical_variables) # CAPFITOGEN pipeline ========================================================= +## Download CAPFITOGEN scripts and files -------------------------------------- +files_to_download <- c( + "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R" +) + +## download files in a for loop +for (i in 1:length(files_to_download)) { + file_url = files_to_download[i] + temp <- tempfile() + download.file(file_url, temp) + file.copy(temp, + paste0(Dir.R_scripts,"/", + substr(files_to_download[i], + 95, nchar(files_to_download[i])))) + unlink(temp) +} + ## Parameters ----------------------------------------------------------------- ## copied and shortened from CAPFITOGEN scripts. ## TO to: DELETE UNNECESSARY PARAMS -extent <- pais <- "World" -pasaporte <- file.path(Dir.Data.GBIF, "filename") # species observations - enter GBIF data file, check if column names work -geoqual <- FALSE # ? 
+pais <- "World" +#pasaporte <- file.path(Dir.Data.GBIF, "filename") # species observations - enter GBIF data file, check if column names work +#geoqual <- FALSE # ? # totalqual<-60 #Only applies if geoqual=TRUE distdup <- 1 # distance threshold in km to remove duplicates from same population -resol1 <- "Celdas 1x1 km aprox (30 arc-seg)" # resolution, change to 9x9 -buffy <- FALSE # buffer zone? -# tamp <- 1000 #Only applies when buffy=TRUE -bioclimv <- BioClim_names #c("tmean_1","vapr_annual","prec_1") # bioclimatic variables, altered by HJ with existing data -edaphv <- c("s_silt","s_sand","s_soilwater_cap") # edaphic variables (defaults from SOILGRIDS) -geophysv <- c("alt","aspect") # geophysical variables +resol1 <- "9x9km" # resolution, change to 9x9 +#buffy <- FALSE # buffer zone? latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE -percenRF <- 0.66 # percentage of variables that will be selected by Random Forest -percenCorr <- 0.33 # percentage of variables that will be selected by the analysis of bivariate correlations, which is executed after the selection by Random Forest (for example, if you wanted to select 1/3 of the total of variables by bivariate correlations, percenRF would be 0.33 -CorrValue <- 0.5 # correlation threshold value, above (in its positive form) or below (in its negative form) of which it is assumed that there is a correlation between two variables. -pValue <- 0.05 # significance threshold value for bivariate correlations. 
+#percenRF <- 0.66 # percentage of variables that will be selected by Random Forest +#percenCorr <- 0.33 # percentage of variables that will be selected by the analysis of bivariate correlations, which is executed after the selection by Random Forest (for example, if you wanted to select 1/3 of the total of variables by bivariate correlations, percenRF would be 0.33 +#CorrValue <- 0.5 # correlation threshold value, above (in its positive form) or below (in its negative form) of which it is assumed that there is a correlation between two variables. +#pValue <- 0.05 # significance threshold value for bivariate correlations. nminvar <- 3 # minimum number of variables to select per component. For example, although the processes of variable selection by RF and bivariate correlation indicate that two variables will be selected, if the nminvar number is 3, the selection process by correlations will select the three least correlated variables. -ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in the tables of eigenvectors, eigenvalues and the PCA scores. ecogeopcaxe cannot be greater than the smallest number of variables to be evaluated per component -resultados <- Dir.Results # directory to place results - - - +#ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in the tables of eigenvectors, eigenvalues and the PCA scores. 
ecogeopcaxe cannot be greater than the smallest number of variables to be evaluated per component +resultados <- Dir.Results.ECLMap # directory to place results +ruta <- Dir.Results.ECLMap ## Variable selection --------------------------------------------------------- # run variable selection based on variable inflation factor usdm::vif -all_predictors <- c(bioclim_variables, edaphic_variables)#, geophysical_variables +all_predictors <- c(bioclim_variables, edaphic_variables, geophysical_variables) predictors <- vifcor( @@ -176,12 +191,29 @@ predictors <- method = "pearson" # 'pearson','kendall','spearman' ) +## Clustering and map creation: ELCmapas --------------------------------------- +message("Clustering and creating maps") + +# download ELC map creation script from https://github.com/HMauricioParra/Capfitogen +elc_script_url = "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R" +temp <- tempfile() +download.file(elc_script_url, temp) +file.copy(temp, paste0(Dir.R_scripts,"/ELCmapas.R")) +unlink(temp) +# Set additional parameters +bioclimv <- BioClim_names # +edaphv <- names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) +geophysv <- names(geophysical_variables) # geophysical variables +maxg <- 10 # maximum number of clusters per component -## Clustering and map creation: ELCmapas --------------------------------------- -message("Clustering and creating maps") +metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic +iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. 
+ +# run the script +source(file.path(Dir.R_scripts, "ELCmapas.R")) # inputs to clustering: extracted values after variable selection From 99d76b9cc4aa1766f77c2d569741d5931f5468f9 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 19 Feb 2025 12:39:09 +0100 Subject: [PATCH 16/72] start scripting capfitogen pipeline, downloads --- .gitignore | 4 +- R_scripts/ModGP-commonlines.R | 1 + capfitogen_master.R | 125 ++++++++++++++++++++++------------ 3 files changed, 84 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 5e8e541..5a8fa69 100644 --- a/.gitignore +++ b/.gitignore @@ -74,8 +74,10 @@ Logo/* # Job output files *.out -# Downloaded data +# Downloads R_scripts/ELCmapas.R CAPFITOGEN3/ +capfitogen-main.zip +Capfitogen-main/ scripts/ hq diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index ccaf0ed..f3df46a 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -56,6 +56,7 @@ package_vec <- c( # Capfitogen ECLmapas packages # HJ: added here from Capfitogen ECLmapas script. 
To do: remove unnecessary ones 'modeltools', + 'maptools', 'flexmix', 'fpc', 'vegan', diff --git a/capfitogen_master.R b/capfitogen_master.R index 79e50eb..a074be2 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -41,19 +41,27 @@ Dir.Scripts <- file.path(Dir.Base, "R_scripts") source(file.path(Dir.Scripts, "ModGP-commonlines.R")) ## API Credentials -------------------------------------------------------- -try(source(file.path(Dir.R_scripts, "SHARED-APICredentials.R"))) -if(as.character(options("gbif_user")) == "NULL" ){ - options(gbif_user=rstudioapi::askForPassword("my gbif username"))} -if(as.character(options("gbif_email")) == "NULL" ){ - options(gbif_email=rstudioapi::askForPassword("my registred gbif e-mail"))} -if(as.character(options("gbif_pwd")) == "NULL" ){ - options(gbif_pwd=rstudioapi::askForPassword("my gbif password"))} - -if(!exists("API_Key") | !exists("API_User")){ # CS API check: if CDS API credentials have not been specified elsewhere - API_User <- readline(prompt = "Please enter your Climate Data Store API user number and hit ENTER.") - API_Key <- readline(prompt = "Please enter your Climate Data Store API key number and hit ENTER.") -} # end of CDS API check - +{# set API credentials for access to climate data store (CDS) + try(source(file.path(Dir.R_scripts, "SHARED-APICredentials.R"))) + if (as.character(options("gbif_user")) == "NULL") { + options(gbif_user = rstudioapi::askForPassword("my gbif username")) + } + if (as.character(options("gbif_email")) == "NULL") { + options(gbif_email = rstudioapi::askForPassword("my registred gbif e-mail")) + } + if (as.character(options("gbif_pwd")) == "NULL") { + options(gbif_pwd = rstudioapi::askForPassword("my gbif password")) + } + + if (!exists("API_Key") | + !exists("API_User")) { + # CS API check: if CDS API credentials have not been specified elsewhere + API_User <- + readline(prompt = "Please enter your Climate Data Store API user number and hit ENTER.") + API_Key <- + readline(prompt 
= "Please enter your Climate Data Store API key number and hit ENTER.") + } # end of CDS API check +} ## NUMBER OF CORES if(!exists("numberOfCores")){ # Core check: if number of cores for parallel processing has not been set yet numberOfCores <- as.numeric(readline(prompt = paste("How many cores do you want to allocate to these processes? Your machine has", parallel::detectCores()))) @@ -91,9 +99,7 @@ bioclim_data <- FUN.DownBV( Force = FALSE # do not overwrite already present data ) -#test <- terra::rast("Data/Environment/TEMP_2m_temperature_00001") bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) -#bioclim_variables <- terra::project(bioclim_variables, "EPSG:4326") # WGS84 BioClim_names <- c( ## BioClim variable names, see https://www.worldclim.org/data/bioclim.html "BIO1_Annual_Mean_Temperature", @@ -136,11 +142,13 @@ geophysical_variables <- FUN.DownGV( ) terra::res(geophysical_variables) +geophysical_variables <- rast(geophysical_variables) # CAPFITOGEN pipeline ========================================================= ## Download CAPFITOGEN scripts and files -------------------------------------- files_to_download <- c( - "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R" + "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R", + "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Parameters%20scripts%20(English)/Parameters_ELCmapas_2024_BioDT.R" ) ## download files in a for loop @@ -148,13 +156,60 @@ for (i in 1:length(files_to_download)) { file_url = files_to_download[i] temp <- tempfile() download.file(file_url, temp) - file.copy(temp, - paste0(Dir.R_scripts,"/", - substr(files_to_download[i], - 95, nchar(files_to_download[i])))) + file_name = sub(".*/", "", files_to_download[i]) + message(paste("downloaded CAPFITOGEN script", file_name)) + file.copy(temp, paste0(Dir.R_scripts,"/", file_name)) 
unlink(temp) } +# alternative: download the whole repository,.. +download.file(url = "https://github.com/HMauricioParra/Capfitogen/archive/refs/heads/main.zip", + destfile = "capfitogen-main.zip") +unzip(zipfile = "capfitogen-main.zip") + +Dir.Capfitogen = file.path(Dir.Base, "Capfitogen-main/") + +# make folder for storing error log +dir.create(paste0(Dir.Results.ECLMap, "/Error")) + +## Variable selection --------------------------------------------------------- +# run variable selection based on variable inflation factor usdm::vif +all_predictors <- c(bioclim_variables, + #edaphic_variables, + geophysical_variables) + +predictor_vifs <- + vifcor( + all_predictors,# replace with either BV, EV, GV to run separately per type + th = 0.8, # threshold of correlation + keep = NULL, # if wanted, list variables to keep no matter what + size = 1000, # subset size in case of big data (default 5000) + method = "pearson" # 'pearson','kendall','spearman' + ) + +variables_to_keep <- + names(all_predictors)[names(all_predictors) %nin% predictor_vifs@excluded] + +message("variables kept after excluding most correlated:") +print(variables_to_keep) + +# subset variables to exclude highly correlated ones +predictors <- all_predictors[[(variables_to_keep)]] + +# save variables in CAPFITOGEN folder +dir.create(file.path(Dir.Capfitogen, + "rdatapoints/world/9x9")) + +saveRDS(predictors, + "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") + +for (i in 1:length(depth(predictors))) { + file_name_path = file.path("Capfitogen-main/rdatapoints/world/9x9", + paste0(names(predictors[[i]]),".tif")) + writeRaster(predictors[[i]], + file_name_path) +} + ## Parameters ----------------------------------------------------------------- ## copied and shortened from CAPFITOGEN scripts. ## TO to: DELETE UNNECESSARY PARAMS @@ -164,7 +219,7 @@ pais <- "World" #geoqual <- FALSE # ? 
# totalqual<-60 #Only applies if geoqual=TRUE distdup <- 1 # distance threshold in km to remove duplicates from same population -resol1 <- "9x9km" # resolution, change to 9x9 +resol1 <- "10x10" # resolution, change to 9x9 #buffy <- FALSE # buffer zone? latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE @@ -175,45 +230,25 @@ longitud <- FALSE nminvar <- 3 # minimum number of variables to select per component. For example, although the processes of variable selection by RF and bivariate correlation indicate that two variables will be selected, if the nminvar number is 3, the selection process by correlations will select the three least correlated variables. #ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in the tables of eigenvectors, eigenvalues and the PCA scores. ecogeopcaxe cannot be greater than the smallest number of variables to be evaluated per component resultados <- Dir.Results.ECLMap # directory to place results -ruta <- Dir.Results.ECLMap - -## Variable selection --------------------------------------------------------- -# run variable selection based on variable inflation factor usdm::vif +ruta <- Dir.Capfitogen -all_predictors <- c(bioclim_variables, edaphic_variables, geophysical_variables) - -predictors <- - vifcor( - bioclim_variables,# replace with either BV, EV, GV to run separately per type - th = 0.9, # threshold of correlation - keep = NULL, # if wanted, list variables to keep no matter what - size = 1000, # subset size in case of big data (default 5000) - method = "pearson" # 'pearson','kendall','spearman' - ) ## Clustering and map creation: ELCmapas --------------------------------------- message("Clustering and creating maps") -# download ELC map creation script from https://github.com/HMauricioParra/Capfitogen -elc_script_url = 
"https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R" -temp <- tempfile() -download.file(elc_script_url, temp) -file.copy(temp, paste0(Dir.R_scripts,"/ELCmapas.R")) -unlink(temp) - - # Set additional parameters bioclimv <- BioClim_names # edaphv <- names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables -maxg <- 10 # maximum number of clusters per component +maxg <- 3 # maximum number of clusters per component metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. # run the script -source(file.path(Dir.R_scripts, "ELCmapas.R")) +source(file.path(Dir.Capfitogen, + "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) # inputs to clustering: extracted values after variable selection From 64cda5614e1b19033309aa299caf750f7e29d705 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 19 Feb 2025 13:26:21 +0100 Subject: [PATCH 17/72] update ELC pipeline - raster read errors --- capfitogen_master.R | 66 ++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 40 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index a074be2..b1e5eaf 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -146,23 +146,7 @@ geophysical_variables <- rast(geophysical_variables) # CAPFITOGEN pipeline ========================================================= ## Download CAPFITOGEN scripts and files -------------------------------------- -files_to_download <- c( - "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Tools%20Herramientas/ELCmapas.R", - "https://raw.githubusercontent.com/HMauricioParra/Capfitogen/main/scripts/Parameters%20scripts%20(English)/Parameters_ELCmapas_2024_BioDT.R" -) - -## download files in a for loop -for (i in 
1:length(files_to_download)) { - file_url = files_to_download[i] - temp <- tempfile() - download.file(file_url, temp) - file_name = sub(".*/", "", files_to_download[i]) - message(paste("downloaded CAPFITOGEN script", file_name)) - file.copy(temp, paste0(Dir.R_scripts,"/", file_name)) - unlink(temp) -} - -# alternative: download the whole repository,.. +# download and unzip CAPFITOGEN repository download.file(url = "https://github.com/HMauricioParra/Capfitogen/archive/refs/heads/main.zip", destfile = "capfitogen-main.zip") unzip(zipfile = "capfitogen-main.zip") @@ -199,12 +183,19 @@ predictors <- all_predictors[[(variables_to_keep)]] # save variables in CAPFITOGEN folder dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9")) +dir.create(file.path(Dir.Capfitogen, + "rdatamaps/world/9x9"), + recursive = TRUE) saveRDS(predictors, "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") +save(predictors, + file = "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") +predictor_names <- names(predictors) + for (i in 1:length(depth(predictors))) { - file_name_path = file.path("Capfitogen-main/rdatapoints/world/9x9", + file_name_path = file.path("Capfitogen-main/rdatamaps/world/9x9", paste0(names(predictors[[i]]),".tif")) writeRaster(predictors[[i]], file_name_path) @@ -219,7 +210,7 @@ pais <- "World" #geoqual <- FALSE # ? # totalqual<-60 #Only applies if geoqual=TRUE distdup <- 1 # distance threshold in km to remove duplicates from same population -resol1 <- "10x10" # resolution, change to 9x9 +resol1 <- "9x9" # resolution, change to 9x9 #buffy <- FALSE # buffer zone? latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE @@ -232,17 +223,14 @@ nminvar <- 3 # minimum number of variables to select per component. 
For example, resultados <- Dir.Results.ECLMap # directory to place results ruta <- Dir.Capfitogen - ## Clustering and map creation: ELCmapas --------------------------------------- message("Clustering and creating maps") # Set additional parameters -bioclimv <- BioClim_names # +bioclimv <- predictor_names[grep("BIO", predictor_names)] # edaphv <- names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables - maxg <- 3 # maximum number of clusters per component - metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. @@ -250,23 +238,21 @@ iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calcul source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) -# inputs to clustering: extracted values after variable selection - -bioclim_cl <- FUN.KmeansClust(ext_values = bioclim_ext, - max_clusters = 8, - vartype = 'bioclim') -geophys_cl <- FUN.KmeansClust(ext_values = geophys_ext, - max_clusters = 8, - vartype = 'geophys') -edaph_cl <- FUN.KmeansClust(ext_values = edaph_ext, - max_clusters = 8, - vartype = 'edaph') - - +# below added by HJ, I'm not sure what it does +# bioclim_cl <- FUN.KmeansClust(ext_values = bioclim_ext, +# max_clusters = 8, +# vartype = 'bioclim') +# geophys_cl <- FUN.KmeansClust(ext_values = geophys_ext, +# max_clusters = 8, +# vartype = 'geophys') +# edaph_cl <- FUN.KmeansClust(ext_values = edaph_ext, +# max_clusters = 8, +# vartype = 'edaph') +# # HJ: function NOT READY yet, end of map creation doesn't work -maps <- FUN.ELCmaps(edaph = edaph_clust, - bioclim = bioclim_clust, - geophys = geophys_cl) - +# maps <- FUN.ELCmaps(edaph = edaph_clust, +# bioclim = bioclim_clust, +# geophys = geophys_cl) +# From a8f42070d86e98e90f56264070b9328837fac578 Mon Sep 17 00:00:00 2001 From: 
evalieungh Date: Wed, 19 Feb 2025 17:32:50 +0100 Subject: [PATCH 18/72] get to next error, puntos object missing --- R_scripts/ModGP-commonlines.R | 1 - capfitogen_master.R | 16 +++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index f3df46a..ccaf0ed 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -56,7 +56,6 @@ package_vec <- c( # Capfitogen ECLmapas packages # HJ: added here from Capfitogen ECLmapas script. To do: remove unnecessary ones 'modeltools', - 'maptools', 'flexmix', 'fpc', 'vegan', diff --git a/capfitogen_master.R b/capfitogen_master.R index b1e5eaf..bd29bef 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -62,6 +62,7 @@ source(file.path(Dir.Scripts, "ModGP-commonlines.R")) readline(prompt = "Please enter your Climate Data Store API key number and hit ENTER.") } # end of CDS API check } + ## NUMBER OF CORES if(!exists("numberOfCores")){ # Core check: if number of cores for parallel processing has not been set yet numberOfCores <- as.numeric(readline(prompt = paste("How many cores do you want to allocate to these processes? 
Your machine has", parallel::detectCores()))) @@ -133,8 +134,9 @@ edaphic_variables <- FUN.DownEV( resample_to_match = bioclim_variables[[1]] ) +edaphic_variables <- rast("Data/Environment/bdod_0-5cm_mean.tif") + ### Geophysical data ------ - geophysical_variables <- FUN.DownGV( Dir = Dir.Data.Envir, Force = FALSE, @@ -151,6 +153,7 @@ download.file(url = "https://github.com/HMauricioParra/Capfitogen/archive/refs/h destfile = "capfitogen-main.zip") unzip(zipfile = "capfitogen-main.zip") +# define path to CAPFITOGEN folder Dir.Capfitogen = file.path(Dir.Base, "Capfitogen-main/") # make folder for storing error log @@ -238,6 +241,8 @@ iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calcul source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) +# visualise output + # below added by HJ, I'm not sure what it does # bioclim_cl <- FUN.KmeansClust(ext_values = bioclim_ext, # max_clusters = 8, @@ -255,4 +260,13 @@ source(file.path(Dir.Capfitogen, # geophys = geophys_cl) # +## Overlaying conservation maps "Complementa" --------------------------------- +# set additional parameters +#... 
+ +# run the script +source(file.path(Dir.Capfitogen, + "/scripts/Tools Herramientas/Complementa.R")) + +# visualise output From c91526517a28f3d50dd8f2aedb354115f99bd79d Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 20 Feb 2025 13:20:44 +0100 Subject: [PATCH 19/72] update script, stuck on vifcor error --- R_scripts/SHARED-Data.R | 406 +++++++++++++++++++++------------------- capfitogen_master.R | 58 +++--- 2 files changed, 253 insertions(+), 211 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index a7d7949..6871096 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -287,7 +287,7 @@ FUN.DownGBIF <- function( save_ls } -# BIOCLIMATIC VARIABLE DOWNLOAD -------------------------------------------- +# BIOCLIMATIC DATA DOWNLOAD -------------------------------------------- #' queries, downloads, and computes bioclimatic variables #' at global extent from ERA5-Land. Water availability is based on #' soil moisture level 1 (0-7cm) and 2 (7-28cm) @@ -296,151 +296,158 @@ FUN.DownBV <- function( T_End = 2000, # what year to end climatology calculation in Dir = getwd(), # where to store the data output on disk Force = FALSE # do not overwrite already present data - ){ - +) { # Workaround for Dir, 1/2 # original_wd = getwd() # setwd(Dir) # End of workaround - FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) - - if(!Force & file.exists(FNAME)){ - BV_ras <- stack(FNAME) - names(BV_ras) <- paste0("BIO", 1:19) - message("Data has already been downloaded with these specifications. - It has been loaded from the disk. If you wish to override - the present data, please specify Force = TRUE") - return(BV_ras) - } - - ### Raw soil moisture level data ---- - #' We download raw soil moisture data for - #' layers 1 (0-7cm) and 2 (7-28cm) separately. 
- #' These are then summed up and used in the - #' bioclimatic variable computation of KrigR - FNAME_RAW <- file.path(Dir, paste(tools::file_path_sans_ext(basename(FNAME)), - "volumetric_soil_water_layer_1", "RAW.nc", sep = "_")) - if(!file.exists(FNAME_RAW)) { - #### Downloading ---- - Qsoil1_ras <- CDownloadS( - Variable = "volumetric_soil_water_layer_1", # could also be total_precipitation - DataSet = "reanalysis-era5-land-monthly-means", - Type = "monthly_averaged_reanalysis", - DateStart = paste0(T_Start, "-01-01 00:00"), - DateStop = paste0(T_End, "-12-31 23:00"), - TResolution = "month", - Dir = Dir, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - FileName = "Qsoil1", - API_User = API_User, - API_Key = API_Key - ) - - Qsoil2_ras <- CDownloadS( - Variable = "volumetric_soil_water_layer_2", # could also be total_precipitation - DataSet = "reanalysis-era5-land-monthly-means", - Type = "monthly_averaged_reanalysis", - DateStart = paste0(T_Start, "-01-01 00:00"), - DateStop = paste0(T_End, "-12-31 23:00"), - TResolution = "month", - Dir = Dir, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - FileName = "Qsoil2", - API_User = API_User, - API_Key = API_Key - ) - - #### Combining ---- - QSoilCombin_ras <- Qsoil1_ras + Qsoil2_ras - - #### Saving ---- - terra::metags(QSoilCombin_ras) <- terra::metags(Qsoil1_ras) - QSoilCombin_ras <- KrigR:::Meta.NC( - NC = QSoilCombin_ras, - FName = FNAME_RAW, - Attrs = terra::metags(QSoilCombin_ras), Write = TRUE, - Compression = 9 - ) - - ### Deleting unnecessary files ---- - unlink(list.files(Dir, pattern = "Qsoil", full.names = TRUE)) - } - - ### Bioclimatic data ---- - BV_ras <- BioClim( - Temperature_Var = "2m_temperature", - Temperature_DataSet = "reanalysis-era5-land", - Temperature_Type = NA, - Water_Var = "volumetric_soil_water_layer_1", # could also be total_precipitation - Water_DataSet = "reanalysis-era5-land-monthly-means", - Water_Type = "monthly_averaged_reanalysis", - Y_start = 
T_Start, Y_end = T_End, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - Dir = Dir, - FileName = basename(FNAME), - FileExtension = ".nc", - Compression = 9, - API_User = API_User, - API_Key = API_Key - ) - - # Not sure if this should be in the code or not. - # ### Masking ---- - # Land_sp <- ne_countries(type = "countries", scale = "medium") - # BV_ras <- crop(BV_ras, extent(Land_sp)) - # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) - # BV_ras <- mask(BV_ras, BV_mask) - # - # ### Saving ---- - # writeRaster(BV_ras, filename = FNAME, format = "CDF", overwrite = TRUE) - # unlink(file.path(Dir, "Qsoil_BC.nc")) - # names(BV_ras) <- paste0("BIO", 1:19) - - ### JSON RO-CRATE creation ---- - JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") - - JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- - "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" - JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() # tail(file.info(FNAME)$ctime) - JSON_ls$`@graph`[[2]]$name <- - "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", - "ECMWF", - "Bioclimatic Variables", - "Soil Moisture") - JSON_ls$`@graph`[[2]]$description <- - "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." 
- - JSON_ls$`@graph`[[3]]$name <- basename(FNAME) - JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) - JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - - JSON_ls$`@graph`[[5]]$instrument$`@id` <- - "https://doi.org/10.1088/1748-9326/ac48b3" - - con <- file(file.path(Dir, - paste0(tools::file_path_sans_ext(basename(FNAME)), - ".json"))) - writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), - con) - close(con) - - message(paste0("ERA5 citation:\nCopernicus Climate Change Service, Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ", - "(Accessed on ", Sys.Date(),")")) - - # Workaround for Dir, 2/2 -# setwd(original_wd) - # End of workaround - - BV_ras + FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) + + if (!Force & file.exists(FNAME)) { + BV_ras <- stack(FNAME) + names(BV_ras) <- paste0("BIO", 1:19) + message( + "Data has already been downloaded with these specifications. + It has been loaded from the disk. If you wish to override + the present data, please specify Force = TRUE" + ) + return(BV_ras) + } + + ### Raw soil moisture level data ---- + #' We download raw soil moisture data for + #' layers 1 (0-7cm) and 2 (7-28cm) separately. 
+ #' These are then summed up and used in the + #' bioclimatic variable computation of KrigR + FNAME_RAW <- + file.path( + Dir, + paste( + tools::file_path_sans_ext(basename(FNAME)), + "volumetric_soil_water_layer_1", + "RAW.nc", + sep = "_" + ) + ) + if (!file.exists(FNAME_RAW)) { + #### Downloading ---- + Qsoil1_ras <- CDownloadS( + Variable = "volumetric_soil_water_layer_1", # could also be total_precipitation + DataSet = "reanalysis-era5-land-monthly-means", + Type = "monthly_averaged_reanalysis", + DateStart = paste0(T_Start, "-01-01 00:00"), + DateStop = paste0(T_End, "-12-31 23:00"), + TResolution = "month", + Dir = Dir, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + FileName = "Qsoil1", + API_User = API_User, + API_Key = API_Key + ) + + Qsoil2_ras <- CDownloadS( + Variable = "volumetric_soil_water_layer_2", + # could also be total_precipitation + DataSet = "reanalysis-era5-land-monthly-means", + Type = "monthly_averaged_reanalysis", + DateStart = paste0(T_Start, "-01-01 00:00"), + DateStop = paste0(T_End, "-12-31 23:00"), + TResolution = "month", + Dir = Dir, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + FileName = "Qsoil2", + API_User = API_User, + API_Key = API_Key + ) + + #### Combining ---- + QSoilCombin_ras <- Qsoil1_ras + Qsoil2_ras + + #### Saving ---- + terra::metags(QSoilCombin_ras) <- terra::metags(Qsoil1_ras) + QSoilCombin_ras <- KrigR:::Meta.NC( + NC = QSoilCombin_ras, + FName = FNAME_RAW, + Attrs = terra::metags(QSoilCombin_ras), + Write = TRUE, + Compression = 9 + ) + + ### Deleting unnecessary files ---- + unlink(list.files(Dir, pattern = "Qsoil", full.names = TRUE)) + } + + ### Bioclimatic data ---- + BV_ras <- BioClim( + Temperature_Var = "2m_temperature", + Temperature_DataSet = "reanalysis-era5-land", + Temperature_Type = NA, + Water_Var = "volumetric_soil_water_layer_1", # could also be total_precipitation + Water_DataSet = "reanalysis-era5-land-monthly-means", + Water_Type = 
"monthly_averaged_reanalysis", + Y_start = T_Start, + Y_end = T_End, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + Dir = Dir, + FileName = basename(FNAME), + FileExtension = ".nc", + Compression = 9, + API_User = API_User, + API_Key = API_Key + ) + + ### JSON RO-CRATE creation ---- + JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") + + JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) + JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- + "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" + JSON_ls$`@graph`[[2]]$datePublished <- + Sys.time() # tail(file.info(FNAME)$ctime) + JSON_ls$`@graph`[[2]]$name <- + "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", + "ECMWF", + "Bioclimatic Variables", + "Soil Moisture") + JSON_ls$`@graph`[[2]]$description <- + "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + + JSON_ls$`@graph`[[3]]$name <- basename(FNAME) + JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) + JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) + + JSON_ls$`@graph`[[5]]$instrument$`@id` <- + "https://doi.org/10.1088/1748-9326/ac48b3" + + con <- file(file.path(Dir, + paste0( + tools::file_path_sans_ext(basename(FNAME)), + ".json" + ))) + writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), + con) + close(con) + + message( + paste0( + "ERA5 citation:\nCopernicus Climate Change Service, Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. 
Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ", + "(Accessed on ", Sys.Date(),")") + ) + + # Workaround for Dir, 2/2 + # setwd(original_wd) + # End of workaround + + BV_ras } -## EDAPHIC DATA DOWNLOAD ------------------------------------------------------ +# EDAPHIC DATA DOWNLOAD ------------------------------------------------------- # INCOMPLETE! Works for some variables, but the data set is incomplete. FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk + target_resolution = c(250, 250), Force = FALSE, # do not overwrite already present data, resample_to_match = FALSE) { # define a file name @@ -450,13 +457,14 @@ FUN.DownEV <- if (!Force & file.exists(FNAME)) { EV_ras <- stack(FNAME) message( - "Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + "Data has already been downloaded. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" ) return(EV_ras) } + # if the file doesn't already exist: if (!file.exists(FNAME)) { - ## downloading data from SoilGrids + ## download data from SoilGrids ---- message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" @@ -468,11 +476,11 @@ FUN.DownEV <- #' MostProbable.vrt has the most probable soil type per gridcell. 
#' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 - SoilGrids_variables_in <- c( - "bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ - "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg - "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - "silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + SoilGrids_variables_in <- + c("bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ + "cec/cec_0-5cm_mean")#, # Cation Exchange Capacity of the soil, mmol(c)/kg + #"cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + #"silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg #"clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg #"sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg #"nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg @@ -483,7 +491,7 @@ FUN.DownEV <- SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - soilGrids_data <- list() + soilGrids_data <- NULL for (i in 1:length(SoilGrids_variables_in)) { variable_name = SoilGrids_variables[i] @@ -501,7 +509,7 @@ FUN.DownEV <- src_dataset = paste0(soilGrids_url, SoilGrids_variables_in[i], ".vrt"), dst_dataset = path_to_downloaded_file, - tr = c(250, 250) # target resolution + tr = target_resolution # target resolution ), # or else load it from file downloaded_variable <- path_to_downloaded_file @@ -509,7 +517,9 @@ FUN.DownEV <- ## load variable as raster downloaded_raster <- rast(downloaded_variable) - + plot(downloaded_raster, main = SoilGrids_variables[i]) + + ### resample ---- ## if provided, resample to match another raster object's origin and resolution if 
(!missing(resample_to_match)) { message(paste0("resampling raster to match ", names(resample_to_match))) @@ -518,52 +528,69 @@ FUN.DownEV <- ## project downloaded rasters to match resample_to_match file projection_to_match <- terra::crs(resample_to_match) terra::crs(downloaded_raster) <- projection_to_match - + ## resample downloaded_raster <- terra::resample(downloaded_raster, - resample_to_match) - - } - - soilGrids_data[i] <- downloaded_raster + resample_to_match) + } - } + soilGrids_data <- c(soilGrids_data, downloaded_raster) + } + + ## HSWD downloads ---- ## download additional rasters from HSWD message("Downloading data from HSWD (harmonised world soil database) via fao.org") - PH_nutrient <- - rast("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") + path_to_PH_nutrient = file.path(Dir, "HSWD_PH_nutrient.tif") + if (!missing(path_to_PH_nutrient)) { + message("downloading HSWD PH nutrient") + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc", + destfile = path_to_PH_nutrient) + } - PH_toxicity <- - rast("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") + PH_nutrient <- rast(path_to_PH_nutrient) - ## if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { - message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- rast(resample_to_match) - - ## project downloaded rasters to match resample_to_match file - projection_to_match <- terra::crs(resample_to_match) - terra::crs(PH_nutrient) <- projection_to_match - terra::crs(PH_toxicity) <- projection_to_match - - ## resample - PH_nutrient <- - terra::resample(PH_nutrient, - resample_to_match) - - PH_toxicity <- - terra::resample(PH_toxicity, - resample_to_match) + path_to_PH_toxicity = file.path(Dir, "HSWD_PH_toxicity.tif") + if (!missing(path_to_PH_toxicity)) { + message("downloading HSWD PH 
toxicity") + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc", + destfile = path_to_PH_toxicity) + } - ## combine and rename rasters - EV_rasters <- rast(c(soilGrids_data, PH_nutrient, PH_toxicity)) - names(EV_rasters) <- c(SoilGrids_variables, "Nutrient", "Toxicity") + PH_toxicity <- rast(path_to_PH_toxicity) } + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(PH_nutrient) <- projection_to_match + terra::crs(PH_toxicity) <- projection_to_match + + ## resample + PH_nutrient <- + terra::resample(PH_nutrient, + resample_to_match) + + PH_toxicity <- + terra::resample(PH_toxicity, + resample_to_match) + + } + ### combine and rename rasters ---- + EV_rasters <- rast(c(soilGrids_data, + PH_nutrient, PH_toxicity)) + + names(EV_rasters) <- c(SoilGrids_variables, + "Nutrient", "Toxicity") + ### Saving ---- message(paste0("saving as netCDF:", FNAME)) terra::writeCDF(EV_rasters, @@ -573,9 +600,8 @@ FUN.DownEV <- EV_rasters } -} -## GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- +# GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- FUN.DownGV <- function(Dir = getwd(),# where to store the data output on disk Force = FALSE,# do not overwrite already present data, @@ -589,8 +615,10 @@ FUN.DownGV <- message( "Data has already been downloaded with these specifications. It has been loaded from the disk. 
If you wish to override the present data, please specify Force = TRUE" ) + return(GV_ras) - } + + } # if the file doesn't already exist: if (!file.exists(FNAME)) { @@ -602,6 +630,7 @@ FUN.DownGV <- ## Download digital elevation model (DEM) ------ ##' Fick, S.E. and R.J. Hijmans, 2017. WorldClim 2 ##' https://doi.org/10.1002/joc.5086 + message("digital elevation model") worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" temp <- tempfile() download.file(worldclim_dem_url, temp) @@ -609,7 +638,7 @@ FUN.DownGV <- exdir = Dir) unlink(temp) dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) - + names(dem) <- "elevation" ## resample ------ ## if provided, resample to match another raster object's origin and resolution @@ -625,25 +654,24 @@ FUN.DownGV <- dem <- terra::resample(dem, resample_to_match) - - } + } geophysical_data[1] <- dem - - } + } ## combine and rename rasters geophysical_rasters <- rast(geophysical_data) ### Saving ---- + message("saving as NetCDF") terra::writeCDF(geophysical_rasters, filename = FNAME, overwrite = TRUE) geophysical_rasters - - } + + } @@ -656,7 +684,7 @@ FUN.DownGV <- ##' (CIAT), available from http://srtm.csi.cgiar.org. #dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") -### DRAFT: Google Earth Engine downloads. ------------------------------------- +# DRAFT: Google Earth Engine downloads. ------------------------------------- #' Almost working, but missing user/project credentials and login. 
#' See https://developers.google.com/earth-engine/guides/auth #' ee.Authenticate() diff --git a/capfitogen_master.R b/capfitogen_master.R index bd29bef..0f31528 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -5,23 +5,24 @@ #' - Execution of shared data pipeline #' - Species occurrence download from GBIF #' - Environmental data load/download -#' - Execution of data formatting for CAPFITOGEN -#' - Execution of Capfitogen pipeline +#' - Data download of soil and geophisical variables +#' - Data formatting for CAPFITOGEN +#' - Execution of Capfitogen tools 'ELC maps' and 'Complementa' +#' - Visualisation of outputs #' DEPENDENCIES: #' - R_Scripts directory containing: #' - "MoDGP-commonlines.R" #' - "SHARED-APICredentials.R" -- NB! internal to project members, ask for access #' - "SHARED-Data.R" -#' - "ELCmaps" - CAPFITOGEN tool -#' - "Complementa" - CAPFITOGEN tool -#' AUTHORS: [Erik Kusch, Heli Juottonen, Eva Lieungh] +#' AUTHORS: [Eva Lieungh, Erik Kusch, Heli Juottonen] #' Capfitogen credit: Parra-Quijano et al. 2021, -#' https://repositorio.unal.edu.co/handle/unal/85787 +#' https://repositorio.unal.edu.co/handle/unal/85787 #' ####################################################################### # -#https://github.com/ErikKusch/KrigR/blob/94cdb9e0aa3cff9f996575f4fb3d10c617eb37ab/metadata/reanalysis-era5-land.RData + # PREAMBLE ================================================================ set.seed(42) # making things reproducibly random rm(list=ls()) # clean environment +gc() # Read species from command-line argument args = commandArgs(trailingOnly=TRUE) @@ -33,11 +34,11 @@ if (length(args)==0) { } message(sprintf("SPECIES = %s", SPECIES)) -### Define directories in relation to project directory +# Define directories in relation to project directory Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") -## source packages, directories, simple functions (...) +# source packages, directories, simple functions (...) 
source(file.path(Dir.Scripts, "ModGP-commonlines.R")) ## API Credentials -------------------------------------------------------- @@ -64,10 +65,19 @@ source(file.path(Dir.Scripts, "ModGP-commonlines.R")) } ## NUMBER OF CORES -if(!exists("numberOfCores")){ # Core check: if number of cores for parallel processing has not been set yet - numberOfCores <- as.numeric(readline(prompt = paste("How many cores do you want to allocate to these processes? Your machine has", parallel::detectCores()))) -} # end of Core check -message(sprintf("numberOfCores = %d", numberOfCores)) +{ + if (!exists("numberOfCores")) { + # Core check: if number of cores for parallel processing has not been set yet + numberOfCores <- + as.numeric(readline( + prompt = paste( + "How many cores do you want to allocate to these processes? Your machine has", + parallel::detectCores() + ) + )) + } # end of Core check + message(sprintf("numberOfCores = %d", numberOfCores)) +} # DATA ==================================================================== ## Run SHARED-Data script ------------------------------------------------- @@ -93,7 +103,7 @@ Species_ls <- FUN.DownGBIF( ##' Will this download Global Multi-resolution Terrain Elevation Data (GMTED2010) as well? ##' Temporal coverage: January 1950 to present ? 
https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview message("Downloading new or loading existing 19 BioClim bioclimatic variables") -bioclim_data <- FUN.DownBV( +bioclim_variables <- FUN.DownBV( T_Start = 1999, # what year to begin climatology calculation in T_End = 1999, # what year to end climatology calculation in Dir = Dir.Data.Envir, # where to store the data output on disk @@ -130,21 +140,17 @@ message("Downloading new or loading existing edaphic/soil variables") edaphic_variables <- FUN.DownEV( Dir = Dir.Data.Envir, + target_resolution = c(250, 250), Force = TRUE, resample_to_match = bioclim_variables[[1]] ) -edaphic_variables <- rast("Data/Environment/bdod_0-5cm_mean.tif") - ### Geophysical data ------ geophysical_variables <- FUN.DownGV( Dir = Dir.Data.Envir, Force = FALSE, resample_to_match = bioclim_variables[[1]] ) - -terra::res(geophysical_variables) -geophysical_variables <- rast(geophysical_variables) # CAPFITOGEN pipeline ========================================================= ## Download CAPFITOGEN scripts and files -------------------------------------- @@ -159,10 +165,17 @@ Dir.Capfitogen = file.path(Dir.Base, "Capfitogen-main/") # make folder for storing error log dir.create(paste0(Dir.Results.ECLMap, "/Error")) +## Format GBIF data ----------------------------------------------------------- +# need a data frame named 'puntos' = points with occurrence points +puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), + POINT_X = Species_ls[["occs"]][["DECLONGITUDE"]], + POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) + + ## Variable selection --------------------------------------------------------- # run variable selection based on variable inflation factor usdm::vif all_predictors <- c(bioclim_variables, - #edaphic_variables, + edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than 
requested geophysical_variables) predictor_vifs <- @@ -185,7 +198,7 @@ predictors <- all_predictors[[(variables_to_keep)]] # save variables in CAPFITOGEN folder dir.create(file.path(Dir.Capfitogen, - "rdatapoints/world/9x9")) + "rdatapoints/world/9x9")) dir.create(file.path(Dir.Capfitogen, "rdatamaps/world/9x9"), recursive = TRUE) @@ -201,7 +214,8 @@ for (i in 1:length(depth(predictors))) { file_name_path = file.path("Capfitogen-main/rdatamaps/world/9x9", paste0(names(predictors[[i]]),".tif")) writeRaster(predictors[[i]], - file_name_path) + file_name_path, + overwrite=TRUE) } ## Parameters ----------------------------------------------------------------- From 94376ac63734eb9aefb33524159866953cb7b381 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 20 Feb 2025 19:52:12 +0100 Subject: [PATCH 20/72] new error with ELCmaps find.clusters --- .gitignore | 2 ++ R_scripts/SHARED-Data.R | 12 +++++------- capfitogen_master.R | 10 ++++++---- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 5a8fa69..2da08c9 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,8 @@ Logo/* # Job output files *.out +results/ELCmap/ +results/ # Downloads R_scripts/ELCmapas.R diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 6871096..447e44c 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -544,21 +544,18 @@ FUN.DownEV <- message("Downloading data from HSWD (harmonised world soil database) via fao.org") path_to_PH_nutrient = file.path(Dir, "HSWD_PH_nutrient.tif") - if (!missing(path_to_PH_nutrient)) { - message("downloading HSWD PH nutrient") + if (!file.exists(path_to_PH_nutrient)) { download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc", destfile = path_to_PH_nutrient) - } - + } PH_nutrient <- rast(path_to_PH_nutrient) path_to_PH_toxicity = file.path(Dir, "HSWD_PH_toxicity.tif") - if (!missing(path_to_PH_toxicity)) { + if (missing(path_to_PH_toxicity)) { 
message("downloading HSWD PH toxicity") download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc", destfile = path_to_PH_toxicity) - } - + } PH_toxicity <- rast(path_to_PH_toxicity) } @@ -584,6 +581,7 @@ FUN.DownEV <- resample_to_match) } + ### combine and rename rasters ---- EV_rasters <- rast(c(soilGrids_data, PH_nutrient, PH_toxicity)) diff --git a/capfitogen_master.R b/capfitogen_master.R index 0f31528..56c496b 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -175,7 +175,7 @@ puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), ## Variable selection --------------------------------------------------------- # run variable selection based on variable inflation factor usdm::vif all_predictors <- c(bioclim_variables, - edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than requested + #edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than requested geophysical_variables) predictor_vifs <- @@ -190,11 +190,12 @@ predictor_vifs <- variables_to_keep <- names(all_predictors)[names(all_predictors) %nin% predictor_vifs@excluded] -message("variables kept after excluding most correlated:") +message("variables kept after excluding the most correlated ones:") print(variables_to_keep) # subset variables to exclude highly correlated ones predictors <- all_predictors[[(variables_to_keep)]] +predictors <- raster::stack(predictors) # save variables in CAPFITOGEN folder dir.create(file.path(Dir.Capfitogen, @@ -210,7 +211,7 @@ save(predictors, predictor_names <- names(predictors) -for (i in 1:length(depth(predictors))) { +for (i in 1:dim(predictors)[3]) { file_name_path = file.path("Capfitogen-main/rdatamaps/world/9x9", paste0(names(predictors[[i]]),".tif")) writeRaster(predictors[[i]], @@ -245,13 +246,14 @@ 
message("Clustering and creating maps") # Set additional parameters bioclimv <- predictor_names[grep("BIO", predictor_names)] # -edaphv <- names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) +edaphv <- names(geophysical_variables)#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables maxg <- 3 # maximum number of clusters per component metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. # run the script +## NB! Change made in capfitogen script: replaced extract with raster::extract (other package masked it and caused error) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) From e7bb01fd38113f57fc985b282c90f6e4acc8d0cf Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Feb 2025 08:13:03 +0100 Subject: [PATCH 21/72] bug fix FUN.DownEV --- R_scripts/SHARED-Data.R | 10 +++++----- capfitogen_master.R | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 447e44c..5998b1c 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -452,18 +452,19 @@ FUN.DownEV <- resample_to_match = FALSE) { # define a file name FNAME <- file.path(Dir, "edaphic.nc") + message(FNAME) # check if file already exists and whether to overwrite if (!Force & file.exists(FNAME)) { - EV_ras <- stack(FNAME) + EV_ras <- rast(FNAME) message( "Data has already been downloaded. It has been loaded from the disk. 
If you wish to override the present data, please specify Force = TRUE" ) return(EV_ras) } - # if the file doesn't already exist: - if (!file.exists(FNAME)) { + # if Force=TRUE or the file doesn't already exist: + if (Force | !file.exists(FNAME)) { ## download data from SoilGrids ---- message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" @@ -558,8 +559,6 @@ FUN.DownEV <- } PH_toxicity <- rast(path_to_PH_toxicity) - } - ### resample ---- ## if provided, resample to match another raster object's origin and resolution if (!missing(resample_to_match)) { @@ -597,6 +596,7 @@ FUN.DownEV <- EV_rasters + } } # GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- diff --git a/capfitogen_master.R b/capfitogen_master.R index 56c496b..390035d 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -141,7 +141,7 @@ message("Downloading new or loading existing edaphic/soil variables") edaphic_variables <- FUN.DownEV( Dir = Dir.Data.Envir, target_resolution = c(250, 250), - Force = TRUE, + Force = FALSE, resample_to_match = bioclim_variables[[1]] ) From a1b3e4673ce736540524b262688558f6067f18c0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Feb 2025 10:50:11 +0100 Subject: [PATCH 22/72] add wind data --- R_scripts/SHARED-Data.R | 93 +++++++++++++++++++++++++++-------------- capfitogen_master.R | 22 ++++++---- 2 files changed, 74 insertions(+), 41 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 5998b1c..0696fd3 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -609,35 +609,61 @@ FUN.DownGV <- # check if file already exists and whether to overwrite if (!Force & file.exists(FNAME)) { - GV_ras <- stack(FNAME) + GV_raster <- rast(FNAME) + names(GV_raster) <- c("elevation", + "mean_wind_speed_of_windiest_month") message( "Data has already been 
downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" ) - return(GV_ras) - - } + return(GV_raster) + + } # if the file doesn't already exist: - if (!file.exists(FNAME)) { - - message("downloading geophysical data") - - geophysical_data <- list() + if (Force | !file.exists(FNAME)) { + message("downloading or loading geophysical data") ## Download digital elevation model (DEM) ------ ##' Fick, S.E. and R.J. Hijmans, 2017. WorldClim 2 ##' https://doi.org/10.1002/joc.5086 - message("digital elevation model") - worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" - temp <- tempfile() - download.file(worldclim_dem_url, temp) - unzip(zipfile = temp, - exdir = Dir) - unlink(temp) + message("- digital elevation model") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_elev.tif"))) { + worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" + temp <- tempfile() + download.file(worldclim_dem_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + } dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) names(dem) <- "elevation" + ## Download wind speed ------ + ##' WorldClim 2 + message("- mean wind speed of windiest month (annual max of monthly means)") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_wind_max.tif"))) { + worldclim_wind_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_wind.zip" + temp <- tempfile() + download.file(worldclim_wind_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + + ## read monthly wind speed and find annual max of monthly means + month_numbers = c(paste0("0", 2:9), as.character(10:12)) + wind_stack <- rast(paste0(Dir, "/wc2.1_2.5m_wind_01.tif")) + for (i in month_numbers) { + raster_i = rast(paste0(Dir, "/wc2.1_2.5m_wind_", i, ".tif")) + wind_stack <- c(wind_stack, raster_i) + } + max_wind <- terra::app(wind_stack, max) + 
writeRaster(max_wind, + filename = paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + } + wind <- rast(paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + names(wind) <- "mean_wind_speed_of_windiest_month" + ## resample ------ ## if provided, resample to match another raster object's origin and resolution if (!missing(resample_to_match)) { @@ -647,29 +673,32 @@ FUN.DownGV <- ## project downloaded rasters to match resample_to_match file projection_to_match <- terra::crs(resample_to_match) terra::crs(dem) <- projection_to_match + terra::crs(wind) <- projection_to_match + message("projected to match input") ## resample dem <- terra::resample(dem, resample_to_match) + message("dem successfully resampled") + + wind <- terra::resample(wind, + resample_to_match) + message("wind successfully resampled") - } - - geophysical_data[1] <- dem - } - - ## combine and rename rasters - geophysical_rasters <- rast(geophysical_data) - - ### Saving ---- - message("saving as NetCDF") - terra::writeCDF(geophysical_rasters, - filename = FNAME, - overwrite = TRUE) - - geophysical_rasters - + + ### combine rasters + geophysical_rasters <- c(dem, wind) + + ### Saving ---- + message("saving as NetCDF") + terra::writeCDF(geophysical_rasters, + filename = FNAME, + overwrite = TRUE) + + geophysical_rasters } + } diff --git a/capfitogen_master.R b/capfitogen_master.R index 390035d..2a75a0e 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -153,17 +153,21 @@ geophysical_variables <- FUN.DownGV( ) # CAPFITOGEN pipeline ========================================================= -## Download CAPFITOGEN scripts and files -------------------------------------- +## Download CAPFITOGEN scripts ------------------------------------------------ # download and unzip CAPFITOGEN repository -download.file(url = "https://github.com/HMauricioParra/Capfitogen/archive/refs/heads/main.zip", +if (!file.exists("capfitogen-main.zip")) { + download.file(url = 
"https://github.com/HMauricioParra/Capfitogen/archive/refs/heads/main.zip", destfile = "capfitogen-main.zip") -unzip(zipfile = "capfitogen-main.zip") + unzip(zipfile = "capfitogen-main.zip") +} # define path to CAPFITOGEN folder Dir.Capfitogen = file.path(Dir.Base, "Capfitogen-main/") # make folder for storing error log -dir.create(paste0(Dir.Results.ECLMap, "/Error")) +if (!dir.exists(paste0(Dir.Results.ECLMap, "/Error"))) { + dir.create(paste0(Dir.Results.ECLMap, "/Error")) +} ## Format GBIF data ----------------------------------------------------------- # need a data frame named 'puntos' = points with occurrence points @@ -198,11 +202,11 @@ predictors <- all_predictors[[(variables_to_keep)]] predictors <- raster::stack(predictors) # save variables in CAPFITOGEN folder -dir.create(file.path(Dir.Capfitogen, - "rdatapoints/world/9x9")) -dir.create(file.path(Dir.Capfitogen, - "rdatamaps/world/9x9"), - recursive = TRUE) +if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { + dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9")) + dir.create(file.path(Dir.Capfitogen, "rdatamaps/world/9x9"), + recursive = TRUE) +} saveRDS(predictors, "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") From c6836017cc901edbae6d8d1c5a6e8d6a633d3b0b Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Feb 2025 11:46:44 +0100 Subject: [PATCH 23/72] get ELCmaps running, draft Complementa code --- R_scripts/ModGP-commonlines.R | 5 +- capfitogen_master.R | 98 +++++++++++++++++------------------ 2 files changed, 52 insertions(+), 51 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index ccaf0ed..0836705 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -122,9 +122,12 @@ Dir.Exports <- file.path(Dir.Base, "Exports") Dir.Exports.ModGP <- file.path(Dir.Exports, "ModGP") Dir.Exports.Capfitogen <- file.path(Dir.Exports, "Capfitogen") Dir.R_scripts <- file.path(Dir.Base, "R_scripts") 
+Dir.Capfitogen <- file.path(Dir.Base, "Capfitogen-main") Dir.Results <- file.path(Dir.Base, "results") -Dir.Results.SelectVar <- file.path(Dir.Results, "SelectVar") Dir.Results.ECLMap <- file.path(Dir.Results, "ECLMap") +Dir.Results.ECLMap.Error <- file.path(Dir.Results.ECLMap, "Error") +Dir.Results.Complementa <- file.path(Dir.Results, "Complementa") +Dir.Results.Complementa.Error <- file.path(Dir.Results.Complementa, "Error") ### Create directories which aren't present yet Dirs <- grep(ls(), pattern = "Dir.", value = TRUE) diff --git a/capfitogen_master.R b/capfitogen_master.R index 2a75a0e..d6c7093 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -161,20 +161,14 @@ if (!file.exists("capfitogen-main.zip")) { unzip(zipfile = "capfitogen-main.zip") } -# define path to CAPFITOGEN folder -Dir.Capfitogen = file.path(Dir.Base, "Capfitogen-main/") - -# make folder for storing error log -if (!dir.exists(paste0(Dir.Results.ECLMap, "/Error"))) { - dir.create(paste0(Dir.Results.ECLMap, "/Error")) -} - ## Format GBIF data ----------------------------------------------------------- # need a data frame named 'puntos' = points with occurrence points puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), POINT_X = Species_ls[["occs"]][["DECLONGITUDE"]], POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) +# create 'pasaporte' file + ## Variable selection --------------------------------------------------------- # run variable selection based on variable inflation factor usdm::vif @@ -187,7 +181,7 @@ predictor_vifs <- all_predictors,# replace with either BV, EV, GV to run separately per type th = 0.8, # threshold of correlation keep = NULL, # if wanted, list variables to keep no matter what - size = 1000, # subset size in case of big data (default 5000) + size = 5000, # subset size in case of big data (default 5000) method = "pearson" # 'pearson','kendall','spearman' ) @@ -224,67 +218,71 @@ for (i in 1:dim(predictors)[3]) { } ## Parameters 
----------------------------------------------------------------- -## copied and shortened from CAPFITOGEN scripts. -## TO to: DELETE UNNECESSARY PARAMS - -pais <- "World" -#pasaporte <- file.path(Dir.Data.GBIF, "filename") # species observations - enter GBIF data file, check if column names work -#geoqual <- FALSE # ? -# totalqual<-60 #Only applies if geoqual=TRUE +## used in CAPFITOGEN scripts below +ruta <- Dir.Capfitogen # path to capfitogen scripts +pais <- "World" # global extent - big modifications will be necessary to use different extent +geoqual <- FALSE +duplicat <- TRUE# duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted distdup <- 1 # distance threshold in km to remove duplicates from same population resol1 <- "9x9" # resolution, change to 9x9 -#buffy <- FALSE # buffer zone? latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE -#percenRF <- 0.66 # percentage of variables that will be selected by Random Forest -#percenCorr <- 0.33 # percentage of variables that will be selected by the analysis of bivariate correlations, which is executed after the selection by Random Forest (for example, if you wanted to select 1/3 of the total of variables by bivariate correlations, percenRF would be 0.33 -#CorrValue <- 0.5 # correlation threshold value, above (in its positive form) or below (in its negative form) of which it is assumed that there is a correlation between two variables. -#pValue <- 0.05 # significance threshold value for bivariate correlations. -nminvar <- 3 # minimum number of variables to select per component. For example, although the processes of variable selection by RF and bivariate correlation indicate that two variables will be selected, if the nminvar number is 3, the selection process by correlations will select the three least correlated variables. 
-#ecogeopcaxe <- 4 # number of axes (principal components) that will be shown in the tables of eigenvectors, eigenvalues and the PCA scores. ecogeopcaxe cannot be greater than the smallest number of variables to be evaluated per component -resultados <- Dir.Results.ECLMap # directory to place results -ruta <- Dir.Capfitogen +# nminvar <- 3 # minimum number of variables to select per component. For example, although the processes of variable selection by RF and bivariate correlation indicate that two variables will be selected, if the nminvar number is 3, the selection process by correlations will select the three least correlated variables. -## Clustering and map creation: ELCmapas --------------------------------------- -message("Clustering and creating maps") - -# Set additional parameters bioclimv <- predictor_names[grep("BIO", predictor_names)] # edaphv <- names(geophysical_variables)#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables -maxg <- 3 # maximum number of clusters per component +maxg <- 20 # maximum number of clusters per component metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. +# parameters for Complementa tool +gaptype <- FALSE # Note: Representa tool a prerequisite of gaptype=TRUE +celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by cells (grid) +resol1 <- "9x9"#"celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE +nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) +areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. 
If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. +WDPA <- TRUE #Only applies if areas=TRUE +#propio<-TRUE # =own, alternative user defined file instead of WDPA +#nombre<-"EcuadorAreasProt" #Only applies if propio=TRUE, name of alternative shapefile +#campo<-"objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated +nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. +coveran <- TRUE # if coveran=TRUE a coverage analysis will be generated for the network of protected areas and a folder called CoverageAnalysis should appear in the results within the resultados para areas folder +niveltax <- "species"# At which taxonomic level the complementarity analysis is going to run (3 options: "genus", "species" or "subtaxa"). Take into account the following: If "genus" is selected, , in the GENUS column of the passport table there must be at least two different genera, or the same for "species" (SPECIES column) or "subtaxa" (SUBTAXA column)... if there are only NA values or there is only one value in the target column, it can generate errors. +datanatax <- FALSE # whether the NA values in genus, species or subtaxa will be taken into account as a different value. Any TRUE or FALSE option does not usually generate problems or errors. 
+mapaelcf <- TRUE # Note: Will an ELC map from a previous execution of the ELCmapas tool be used as an additional factor for classifying the taxonomic ranks for the complementarity analysis? +mapaelc <- "mapa_elc_world.grd" #Only applies if mapaelcf=TRUE, mapaelc must contain the name of the ELC map obtained by previously using the ELCmapas tool (.grd and .gri files that must always be in the CAPFITOGEN3/ELCmapas folder) +datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in NA zones on the ELC map will be taken into account or not (FALSE) +data0elc <- FALSE #Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) + +## Clustering and map creation: ELCmapas --------------------------------------- +message("Clustering and creating maps") +resultados <- Dir.Results.ECLMap # directory to place results # run the script -## NB! Change made in capfitogen script: replaced extract with raster::extract (other package masked it and caused error) +##' NB! 
Change made in capfitogen script: +##' replaced 'extract' with 'raster::extract' +##' (some other package masked it and caused an error) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) +setwd(Dir.Base) -# visualise output +# quick visualisation of output +elc_tif_outputs <- list.files(path = Dir.Results.ECLMap, + pattern = "*.tif") -# below added by HJ, I'm not sure what it does -# bioclim_cl <- FUN.KmeansClust(ext_values = bioclim_ext, -# max_clusters = 8, -# vartype = 'bioclim') -# geophys_cl <- FUN.KmeansClust(ext_values = geophys_ext, -# max_clusters = 8, -# vartype = 'geophys') -# edaph_cl <- FUN.KmeansClust(ext_values = edaph_ext, -# max_clusters = 8, -# vartype = 'edaph') -# -# HJ: function NOT READY yet, end of map creation doesn't work -# maps <- FUN.ELCmaps(edaph = edaph_clust, -# bioclim = bioclim_clust, -# geophys = geophys_cl) -# +for (i in elc_tif_outputs) { + map_i = rast(paste0(Dir.Results.ECLMap, + "/", i)) + plot(map_i, + main = i) +} ## Overlaying conservation maps "Complementa" --------------------------------- -# set additional parameters -#... +message("running Capfitogen Complementa tool for conservation areas") +resultados <- resultados <- Dir.Results.Complementa # run the script +#' NB! Manually copied the script into the folder, as it is missing on GH... 
source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) From 78375dfe45acddc52639dbd31883b888604855c1 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Feb 2025 14:16:08 +0100 Subject: [PATCH 24/72] stuck on Complementa LATITUDE not found --- capfitogen_master.R | 58 +++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 15 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index d6c7093..b3975df 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -2,19 +2,20 @@ #' PROJECT: [BioDT CWR - Capfitogen] #' CONTENTS: #' - Loading/installing packages -#' - Execution of shared data pipeline +#' - Execution of data pipeline #' - Species occurrence download from GBIF #' - Environmental data load/download -#' - Data download of soil and geophisical variables -#' - Data formatting for CAPFITOGEN -#' - Execution of Capfitogen tools 'ELC maps' and 'Complementa' +#' - Data formatting and parameter definition for CAPFITOGEN +#' - Execution of CAPFITOGEN tools +#' - 'ELC maps' +#' - 'Complementa' #' - Visualisation of outputs #' DEPENDENCIES: #' - R_Scripts directory containing: #' - "MoDGP-commonlines.R" #' - "SHARED-APICredentials.R" -- NB! internal to project members, ask for access #' - "SHARED-Data.R" -#' AUTHORS: [Eva Lieungh, Erik Kusch, Heli Juottonen] +#' AUTHORS: [Eva Lieungh, Erik Kusch, Heli Juottonen, Desalegn Chala] #' Capfitogen credit: Parra-Quijano et al. 
2021, #' https://repositorio.unal.edu.co/handle/unal/85787 #' ####################################################################### # @@ -161,14 +162,31 @@ if (!file.exists("capfitogen-main.zip")) { unzip(zipfile = "capfitogen-main.zip") } +if (!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { + file.create(file.path(Dir.Results.Complementa.Error,"process_info.txt")) +} + ## Format GBIF data ----------------------------------------------------------- # need a data frame named 'puntos' = points with occurrence points puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), POINT_X = Species_ls[["occs"]][["DECLONGITUDE"]], POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) -# create 'pasaporte' file +### create 'pasaporte' ---- +#' pasaporte file uses Darwincore names? +#' So it should be OK to use the occurrences from the GBIF download directly. +#' Place it in the right folder so Capfitogen can find it: +pasaporte_file_name = paste0(sub(pattern = " ", + replacement = "_", + SPECIES), + ".txt") +write.table(Species_ls[["occs"]], + file.path("Capfitogen-main/Pasaporte", + pasaporte_file_name)) + +pasaportest <- read.table("Capfitogen-main/Pasaporte/Lathyrus_angulatus.txt", + header = TRUE) ## Variable selection --------------------------------------------------------- # run variable selection based on variable inflation factor usdm::vif @@ -221,8 +239,11 @@ for (i in 1:dim(predictors)[3]) { ## used in CAPFITOGEN scripts below ruta <- Dir.Capfitogen # path to capfitogen scripts pais <- "World" # global extent - big modifications will be necessary to use different extent +pasaporte <- pasaporte_file_name # species occurrence data + geoqual <- FALSE -duplicat <- TRUE# duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted +totalqual<-30 # Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 +duplicat <- TRUE # duplicat=TRUE indicates that records of the same 
GENUS/SPECIES/SUBTAXA will be deleted distdup <- 1 # distance threshold in km to remove duplicates from same population resol1 <- "9x9" # resolution, change to 9x9 latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' @@ -238,14 +259,16 @@ iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calcul # parameters for Complementa tool gaptype <- FALSE # Note: Representa tool a prerequisite of gaptype=TRUE +gaptresh <- 4 #Only applies if gaptype=TRUE +gapna <- "exclude" #Only applies if gaptype=TRUE celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by cells (grid) resol1 <- "9x9"#"celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. WDPA <- TRUE #Only applies if areas=TRUE -#propio<-TRUE # =own, alternative user defined file instead of WDPA -#nombre<-"EcuadorAreasProt" #Only applies if propio=TRUE, name of alternative shapefile -#campo<-"objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. 
The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated +propio <- FALSE # =own, alternative user defined file instead of WDPA +nombre <- "EcuadorAreasProt" #Only applies if propio=TRUE, name of alternative shapefile +campo <- "objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. coveran <- TRUE # if coveran=TRUE a coverage analysis will be generated for the network of protected areas and a folder called CoverageAnalysis should appear in the results within the resultados para areas folder niveltax <- "species"# At which taxonomic level the complementarity analysis is going to run (3 options: "genus", "species" or "subtaxa"). Take into account the following: If "genus" is selected, , in the GENUS column of the passport table there must be at least two different genera, or the same for "species" (SPECIES column) or "subtaxa" (SUBTAXA column)... if there are only NA values or there is only one value in the target column, it can generate errors. 
@@ -256,8 +279,10 @@ datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the data0elc <- FALSE #Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) ## Clustering and map creation: ELCmapas --------------------------------------- -message("Clustering and creating maps") resultados <- Dir.Results.ECLMap # directory to place results + +message("Clustering and creating maps") + # run the script ##' NB! Change made in capfitogen script: ##' replaced 'extract' with 'raster::extract' @@ -278,13 +303,16 @@ for (i in elc_tif_outputs) { } ## Overlaying conservation maps "Complementa" --------------------------------- +resultados <- Dir.Results.Complementa + message("running Capfitogen Complementa tool for conservation areas") -resultados <- resultados <- Dir.Results.Complementa -# run the script + #' NB! Manually copied the script into the folder, as it is missing on GH... -source(file.path(Dir.Capfitogen, +{ # run the script + source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) - + setwd(Dir.Base) +} # visualise output From a4f333e24ef9027be98688f5869356e811ff6089 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Feb 2025 19:45:59 +0100 Subject: [PATCH 25/72] get to next error in Complementa; add illustration --- capfitogen_master.R | 18 ++++++++++++------ capfitogen_master_illustration.drawio.svg | 4 ++++ 2 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 capfitogen_master_illustration.drawio.svg diff --git a/capfitogen_master.R b/capfitogen_master.R index b3975df..5244117 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -183,7 +183,8 @@ pasaporte_file_name = paste0(sub(pattern = " ", write.table(Species_ls[["occs"]], file.path("Capfitogen-main/Pasaporte", - pasaporte_file_name)) + pasaporte_file_name), + sep = "\t",) pasaportest <- 
read.table("Capfitogen-main/Pasaporte/Lathyrus_angulatus.txt", header = TRUE) @@ -309,10 +310,15 @@ message("running Capfitogen Complementa tool for conservation areas") #' NB! Manually copied the script into the folder, as it is missing on GH... -{ # run the script - source(file.path(Dir.Capfitogen, +#' NB! changed "" to " " in line 91: +#' pasaporte<-read.delim(paste("Pasaporte/",pasaporte,sep=" ")) +#' +# run the script +pasaporte <- pasaporte_file_name +source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) - setwd(Dir.Base) -} -# visualise output +#' stops at line 173 mapaelcf if loop +setwd(Dir.Base) + +# visualise output \ No newline at end of file diff --git a/capfitogen_master_illustration.drawio.svg b/capfitogen_master_illustration.drawio.svg new file mode 100644 index 0000000..a47b7cd --- /dev/null +++ b/capfitogen_master_illustration.drawio.svg @@ -0,0 +1,4 @@ + + + +
GBIF
GBIF
GitHub -- uc-CRW repository
GitHub -- uc-CRW repository
/R_Scripts
/R_Scripts
/Data/Environment
/Data/Environment
/Data/GBIF
/Data/GBIF
/R_Scripts

ModGP-commonlines.R


ModGP-commonlines.R...
Packages,
directories,
simple functions
Packages,...
SHARED-Data.R
SHARED-Data.R
Define functions to download data:
GBIF (FUN.DownGBIF),
BioClim vars with KrigR (FUN.DownBV),
Edaphic variables (FUN.DownEV),
Geophysical variables (FUN.DownGV)
Define functions to download data:...
ERA5
ERA5
Harmonized World Soil Database
Harmonized World Soil Data...
SoilGrids
SoilGrids
Download CAPFITOGEN scripts
Download CAPFIT...
Set species name
Set species name
run shared scripts
run shared scri...
FUN.DownGBIF()
FUN.DownGBIF()
FUN.DownBV()
FUN.DownBV()
FUN.DownEV()
FUN.DownEV()
run Capfitogen tools
run Capfitogen tools
set parameters for capfitogen
set parameters for ca...
visualising output
visualising output
Variable selection with usdm::vifcor()
Variable selection wi...
capfitogen_master.R
capfitogen_master.R
/Capfitogen-main
/scripts/Tools Herramientas/
ELCmapas.R
/scripts/Tools Herramientas/...
CAPFITOGEN tool for
ecogeographic land characterization (ELC) maps
which reflect adaptive scenarios for
a specific species
CAPFITOGEN tool for...
/scripts/Tools Herramientas/
Complementa.R
/scripts/Tools Herramientas/...
CAPFITOGEN tool for
complementarity analysis between cells or 
protected areas to see the degree of coverage of
current protected area networks in terms of in situ conservation of crop wild relatives
CAPFITOGEN tool for...
\ No newline at end of file From 9ecdbe9ed2acc024d269c7ca0e81e0643f736867 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 24 Feb 2025 13:20:46 +0100 Subject: [PATCH 26/72] get to next Complementa error & protected areas download --- R_scripts/ModGP-commonlines.R | 6 +- capfitogen_master.R | 135 +++++++++++++++++++++++----------- 2 files changed, 98 insertions(+), 43 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 0836705..b310d23 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -118,14 +118,16 @@ Dir.Data <- file.path(Dir.Base, "Data") Dir.Data.ModGP <- file.path(Dir.Data, "ModGP") Dir.Data.GBIF <- file.path(Dir.Data, "GBIF") Dir.Data.Envir <- file.path(Dir.Data, "Environment") +Dir.Data.Capfitogen <- file.path(Dir.Data, "Capfitogen") Dir.Exports <- file.path(Dir.Base, "Exports") Dir.Exports.ModGP <- file.path(Dir.Exports, "ModGP") Dir.Exports.Capfitogen <- file.path(Dir.Exports, "Capfitogen") Dir.R_scripts <- file.path(Dir.Base, "R_scripts") Dir.Capfitogen <- file.path(Dir.Base, "Capfitogen-main") +Dir.Capfitogen.WDPA <- file.path(Dir.Capfitogen, "wdpa") +Dir.Capfitogen.ELCMap <- file.path(Dir.Capfitogen, "ELCmapas") +Dir.Capfitogen.ELCMap.Error <- file.path(Dir.Capfitogen.ELCMap, "Error") Dir.Results <- file.path(Dir.Base, "results") -Dir.Results.ECLMap <- file.path(Dir.Results, "ECLMap") -Dir.Results.ECLMap.Error <- file.path(Dir.Results.ECLMap, "Error") Dir.Results.Complementa <- file.path(Dir.Results, "Complementa") Dir.Results.Complementa.Error <- file.path(Dir.Results.Complementa, "Error") diff --git a/capfitogen_master.R b/capfitogen_master.R index 5244117..f68d993 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -147,6 +147,7 @@ edaphic_variables <- FUN.DownEV( ) ### Geophysical data ------ +message("Downloading new or loading existing geophysical variables") geophysical_variables <- FUN.DownGV( Dir = Dir.Data.Envir, Force = FALSE, @@ -166,7 +167,7 @@ if 
(!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { file.create(file.path(Dir.Results.Complementa.Error,"process_info.txt")) } -## Format GBIF data ----------------------------------------------------------- +### Format GBIF data ---- # need a data frame named 'puntos' = points with occurrence points puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), POINT_X = Species_ls[["occs"]][["DECLONGITUDE"]], @@ -189,12 +190,27 @@ write.table(Species_ls[["occs"]], pasaportest <- read.table("Capfitogen-main/Pasaporte/Lathyrus_angulatus.txt", header = TRUE) +### Download protected areas ---- +#' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za +#' download shapefiles for protected areas to overlay with Complementa tool, +#' from The World Database on Protected Areas +#' UNEP-WCMC and IUCN (2025), Protected Planet: The World Database on Protected +#' Areas (WDPA)[On-line], [February 2025], Cambridge, UK: UNEP-WCMC and IUCN. +#' Available at: https://doi.org/10.34892/6fwd-af11 +#' NB! 
not working, no direct download link found +wdpa_url = "https://data-gis.unep-wcmc.org/server/rest/services/ProtectedSites/The_World_Database_of_Protected_Areas/FeatureServer/1" +wdpa_destination = file.path(Dir.Data.Capfitogen, + "wdpa") +download.file(url = wdpa_url, + destfile = wdpa_destination) + ## Variable selection --------------------------------------------------------- -# run variable selection based on variable inflation factor usdm::vif +# combine variables all_predictors <- c(bioclim_variables, #edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than requested geophysical_variables) +# run variable selection based on variable inflation factor usdm::vif predictor_vifs <- vifcor( all_predictors,# replace with either BV, EV, GV to run separately per type @@ -204,9 +220,9 @@ predictor_vifs <- method = "pearson" # 'pearson','kendall','spearman' ) +# check which variables are kept variables_to_keep <- names(all_predictors)[names(all_predictors) %nin% predictor_vifs@excluded] - message("variables kept after excluding the most correlated ones:") print(variables_to_keep) @@ -233,15 +249,17 @@ for (i in 1:dim(predictors)[3]) { paste0(names(predictors[[i]]),".tif")) writeRaster(predictors[[i]], file_name_path, - overwrite=TRUE) + overwrite = TRUE) } -## Parameters ----------------------------------------------------------------- -## used in CAPFITOGEN scripts below +## Clustering and map creation: ELCmapas --------------------------------------- +### Parameters for ELC maps ---- +{ ruta <- Dir.Capfitogen # path to capfitogen scripts -pais <- "World" # global extent - big modifications will be necessary to use different extent +resultados <- Dir.Results.ELCMap # directory to place results pasaporte <- pasaporte_file_name # species occurrence data +pais <- "World" # global extent - big modifications will be necessary to use different extent geoqual <- FALSE totalqual<-30 # 
Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 duplicat <- TRUE # duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted @@ -249,72 +267,107 @@ distdup <- 1 # distance threshold in km to remove duplicates from same populatio resol1 <- "9x9" # resolution, change to 9x9 latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE -# nminvar <- 3 # minimum number of variables to select per component. For example, although the processes of variable selection by RF and bivariate correlation indicate that two variables will be selected, if the nminvar number is 3, the selection process by correlations will select the three least correlated variables. bioclimv <- predictor_names[grep("BIO", predictor_names)] # edaphv <- names(geophysical_variables)#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables + maxg <- 20 # maximum number of clusters per component metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. +} + +# run the script +message("Clustering and creating maps") +##' NB! 
Change made in capfitogen script: +##' replaced 'extract' with 'raster::extract' +##' (some other package masked it and caused an error) +source(file.path(Dir.Capfitogen, + "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) +setwd(Dir.Base) + +### quick visualisation of ELC maps ---- +# List all the .tif files in the directory +elc_tif_outputs <- list.files(path = Dir.Results.ELCMap, + pattern = "\\.tif$", + full.names = TRUE) + +# Loop over each .tif file +for (file_path in elc_tif_outputs) { + # Read the raster file + map_i <- rast(file_path) + + # Replace NaN with NA (if they exist) + map_i[is.nan(values(map_i))] <- NA + + # Create a mask to highlight non-zero areas + non_zero_mask <- mask(map_i, !is.na(map_i)) + + # Convert to points to find non-zero values' extent + points <- as.points(non_zero_mask, na.rm = TRUE) + + # If there are any valid points, proceed with cropping + if (!is.null(points) && nrow(points) > 0) { + # Calculate extent directly from the non-empty points + coordinates <- terra::geom(points)[, c("x", "y")] + xmin = min(coordinates[,"x"]) + xmax = max(coordinates[,"x"]) + ymin = min(coordinates[,"y"]) + ymax = max(coordinates[,"y"]) + non_zero_extent <- ext(xmin, xmax, ymin, ymax) + + # Crop the raster using this extent + cropped_map <- crop(map_i, non_zero_extent) + + # Plot the cropped raster + plot(cropped_map, main = basename(file_path)) + } else { + plot(map_i, main = paste(basename(file_path), "(No non-zero values)")) + } +} + +## Overlaying conservation maps "Complementa" --------------------------------- +### parameters for Complementa ---- +{ +resultados <- Dir.Results.Complementa +pasaporte <- pasaporte_file_name -# parameters for Complementa tool gaptype <- FALSE # Note: Representa tool a prerequisite of gaptype=TRUE gaptresh <- 4 #Only applies if gaptype=TRUE gapna <- "exclude" #Only applies if gaptype=TRUE + celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by cells (grid) -resol1 <- "9x9"#"celdas 10x10 km 
aprox (5 arc-min)" #Only applies if celdas=TRUE +resol1 <- "celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) + areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. WDPA <- TRUE #Only applies if areas=TRUE propio <- FALSE # =own, alternative user defined file instead of WDPA -nombre <- "EcuadorAreasProt" #Only applies if propio=TRUE, name of alternative shapefile +nombre <- "nameOfAlternativeShapefile" #Only applies if propio=TRUE, name of alternative shapefile campo <- "objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. 
coveran <- TRUE # if coveran=TRUE a coverage analysis will be generated for the network of protected areas and a folder called CoverageAnalysis should appear in the results within the resultados para areas folder + niveltax <- "species"# At which taxonomic level the complementarity analysis is going to run (3 options: "genus", "species" or "subtaxa"). Take into account the following: If "genus" is selected, , in the GENUS column of the passport table there must be at least two different genera, or the same for "species" (SPECIES column) or "subtaxa" (SUBTAXA column)... if there are only NA values or there is only one value in the target column, it can generate errors. datanatax <- FALSE # whether the NA values in genus, species or subtaxa will be taken into account as a different value. Any TRUE or FALSE option does not usually generate problems or errors. + mapaelcf <- TRUE # Note: Will an ELC map from a previous execution of the ELCmapas tool be used as an additional factor for classifying the taxonomic ranks for the complementarity analysis? mapaelc <- "mapa_elc_world.grd" #Only applies if mapaelcf=TRUE, mapaelc must contain the name of the ELC map obtained by previously using the ELCmapas tool (.grd and .gri files that must always be in the CAPFITOGEN3/ELCmapas folder) datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in NA zones on the ELC map will be taken into account or not (FALSE) data0elc <- FALSE #Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) - -## Clustering and map creation: ELCmapas --------------------------------------- -resultados <- Dir.Results.ECLMap # directory to place results - -message("Clustering and creating maps") - -# run the script -##' NB! 
Change made in capfitogen script: -##' replaced 'extract' with 'raster::extract' -##' (some other package masked it and caused an error) -source(file.path(Dir.Capfitogen, - "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) -setwd(Dir.Base) - -# quick visualisation of output -elc_tif_outputs <- list.files(path = Dir.Results.ECLMap, - pattern = "*.tif") - -for (i in elc_tif_outputs) { - map_i = rast(paste0(Dir.Results.ECLMap, - "/", i)) - plot(map_i, - main = i) } -## Overlaying conservation maps "Complementa" --------------------------------- -resultados <- Dir.Results.Complementa - message("running Capfitogen Complementa tool for conservation areas") - #' NB! Manually copied the script into the folder, as it is missing on GH... -#' NB! changed "" to " " in line 91: -#' pasaporte<-read.delim(paste("Pasaporte/",pasaporte,sep=" ")) +#' Changed in the script: +#' - replaced 'extract' with 'raster::extract' +#' - line ~576 tabla_especies<-puntosorigshp@data +#' is a problem. "no applicable method for `@` applied to an object of class "sf"" +#' replaced it with as.data.frame(puntosorigshp) #' # run the script -pasaporte <- pasaporte_file_name +setwd(Dir.Base) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) #' stops at line 173 mapaelcf if loop From 7f79bcc8acbe6b1216c7534619bf0b68d07cf6a6 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 24 Feb 2025 13:29:36 +0100 Subject: [PATCH 27/72] delete unused draft scripts --- R_scripts/3_VarSelection_HJ.R | 37 --- R_scripts/4_ELCMap_HJ.R | 406 ---------------------------- R_scripts/SHARED-Data_CAPFITOGEN.R | 408 ----------------------------- R_scripts/capfitogen.R | 174 ------------ 4 files changed, 1025 deletions(-) delete mode 100644 R_scripts/3_VarSelection_HJ.R delete mode 100644 R_scripts/4_ELCMap_HJ.R delete mode 100644 R_scripts/SHARED-Data_CAPFITOGEN.R delete mode 100644 R_scripts/capfitogen.R diff --git a/R_scripts/3_VarSelection_HJ.R b/R_scripts/3_VarSelection_HJ.R deleted file mode 
100644 index 5961b51..0000000 --- a/R_scripts/3_VarSelection_HJ.R +++ /dev/null @@ -1,37 +0,0 @@ -#' Variable selection script, copied and modified from CAPFITOGEN -#' -#' to be added: vifcor approach from ModGP - -FUN.VarSelection <- function(specdata = Species_ls$occs, - varstack = NULL, - buf = NULL){ - - vifcor_res <- usdm::vifcor(varstack, th = 0.7) # variable inflation factor - varstack_sel <- usdm::exclude(varstack, vifcor_res) # excluding variables with high cor and vif - - # extracting values from the raster file - - if(missing(buf)) { - pnt <- sf::st_coordinates(specdata) # pnt <- sf::st_coordinates(occ_ls) - values <- terra::extract(varstack_sel, pnt) - #values <- exactextractr::exact_extract(varstack_sel, occ_ls) # HJ: a lot faster than terra but gives an error: 'Unsupported geometry' - } else { - pntbuf <- sf::st_buffer(specdata, dist = buf) # pntbuf <- sf::st_buffer(occ_ls, dist = buf) - st_crs(pntbuf) <- 4326 # HJ: without this, warning about 'no CRS specified'. How to do this properly? - values <- exactextractr::exact_extract(varstack_sel, pntbuf, 'mean') # HJ: is mean correct here? 
- # HJ: terra not used here because gets stuck with buffer, probably I'm misunderstanding something - } - - # correcting column names - colnames(values) <- names(varstack_sel) - - # adding coordinates - ext_values <- as.data.frame(row.names(values), nm = "pointid") - ext_values <- cbind(ext_values, st_coordinates(specdata), values) - #ext_values <- cbind(pnt, values) - #ext_values <- na.omit(ext_values) - - ext_values - -} - diff --git a/R_scripts/4_ELCMap_HJ.R b/R_scripts/4_ELCMap_HJ.R deleted file mode 100644 index d32b73f..0000000 --- a/R_scripts/4_ELCMap_HJ.R +++ /dev/null @@ -1,406 +0,0 @@ -#' ############################################################################## -#' In this script: Finding the optimal number of clusters for each -#' ecogeographic component, and performing clustering -#' -#' Script copied and modified from CAPFITOGEN ELCmapas.R -#' -#' AUTHORS: [Mauricio Parra Quijano, Heli Juottonen, Eva Lieungh] -#' Capfitogen credit: Parra-Quijano et al. 2021, -#' https://repositorio.unal.edu.co/handle/unal/85787 -#' -#' HJ: originally in Capfitogen six methods, here only kmeans/BIC) -#' HJ: everything here from Capfitogen scripts, just simplified/ -#' streamlined/turned into functions -#' ############################################################################## - -# Method of successive K-means and BIC criteria ================================ -#' HJ: ext_values = bioclim_ext/geophys_ext/edaph_ext -#' HJ: max_clusters = maximum number of clusters (was 8) -#' HJ: vartype: edaph / geopys / bioclim -{ - FUN.KmeansClust <- function(ext_values, max_clusters, vartype) { - # standardization (rescaling) the variables (mean zero, sd=1) - # clustering is run on standardized variables, actual values more useful later) - ext_values <- na.omit(ext_values) - stand_values <- terra::scale(ext_values[, 4:length(ext_values)]) - stand_values <- data.frame(ext_values[, 1], stand_values) - colnames(stand_values)[1] <- "pointid" - stand_values <- 
na.omit(stand_values) # place? needed? - - # HJ: missing: possibility to use latitude and longitude as variables for geophys data - is this needed? - # below how this was handled in Capfitogen scripts - - # ext_values <- raster::extract(variablestack, puntos[, c("POINT_X","POINT_Y")]) - # ext_values <- cbind(puntos[, 1], bioclim) - # colnames(var)[1] <- "POINTID" - # - # if(longitud){ - # var <- cbind(var, puntos[,2]) - # colnames(var)[ncol(var)] <- "LONGITUD" - # } - # if(latitud){ - # var <- cbind(var, puntos[,3]) - # colnames(var)[ncol(var)] < -"LATITUD" - # } - - # clustering - fitb <- find.clusters(stand_values[, -1], - stat = "BIC", - choose.n.clust = FALSE, - criterion = "goodfit", - max.n.clust = max_clusters, - center = TRUE, - scale = TRUE, - pca.select = "percVar", - perc.pca = 75 - ) - VARCLUST <- as.numeric(fitb$grp) - stand_values$VARCLUST <- VARCLUST - ext_values <- merge(ext_values, - stand_values[, c("pointid", "VARCLUST")], - by = "pointid" - ) - colnames(ext_values)[colnames(ext_values) == "VARCLUST"] <- - paste0(vartype, "_clust") - VARCLUST <- length(unique(VARCLUST)) - - # save jpeg - jpeg(file = file.path( - Dir.Results, - paste0(vartype, "_BIC_optimal_N_clusters.jpeg") - )) - plot(fitb$Kstat, - type = "o", - xlab = "number of clusters (K)", - ylab = "BIC", - col = "blue", - main = paste("Detection based on BIC. Optimal value=", - round(fitb$stat, 1), - sep = "" - ) - ) - points(VARCLUST, - fitb$Kstat[VARCLUST], - pch = "x", cex = 2 - ) - dev.off() - - ext_values - } -} - -# 9. Combining clusters from each component to obtain ecogeographical categories (bioclimatic, edaphic and geophysical unique conditions) -# 10. Creating each component and the combination (ELC) map - -{ - ######################################################################## - ################## End of clustering ################################# - - # HJ: input: edaph_cl, geophys_cl, bioclim_cl - # HJ: output: 3-4 maps, summary data? 
- - FUN.ELCmaps <- function(edaph = edaph_cl, - bioclim = bioclim_cl, - geophys = geophys_cl) { - # Consolidaci?n de tabla ?nica a trav?s de tabla puntos - tabla <- data.frame(edaph) # , geophys$geophys_clust, bioclim$bioclim_clust) - tabla <- merge(tabla, geophys, - by = "pointid", all.x = T - ) - tabla <- merge(tabla, bioclim, - by = "pointid", all.x = T - ) - # rm(bioclim,geophys,edaph,puntos) - mapaelc <- as.data.frame(matrix( - nrow = length(tabla[, 1]), - ncol = 2 - )) - mapaelc[, 1] <- tabla[, 1] - colnames(mapaelc)[1] <- "pointid" - colnames(mapaelc)[2] <- "combi" - for (i in 1:length(tabla[, 1])) { - mapaelc[i, 2] <- - ifelse(is.na(substr( - tabla$bioclim_clust[i], 1, 1 - )) | is.na(substr(tabla$geophys_clust[i],1, 1 - )) | is.na(substr(tabla$edaph_clust[i],1, 1 - )), NA, - paste(substr(tabla$bioclim_clust[i], 1, 1), - substr(tabla$geophys_clust[i], 1, 1), - substr(tabla$edaph_clust[i], 1, 1), - sep = "") - ) - } - elc <- subset(mapaelc, !duplicated(combi), select = -pointid) - elc <- subset(elc, !is.na(combi)) - elc <- elc[order(elc$combi), , drop = FALSE] - # elc <- elc[i,] - - # Assign number to each category - elc[, 2] <- 1:nrow(elc) - # mapaelc <- mapaelc[,1:2] - # Assignment - mapaelc <- merge(mapaelc, elc, by = "combi") - colnames(mapaelc)[3] <- "elc_cat" - tabla <- merge(tabla, mapaelc, by = "pointid", all.x = T) - tabla$elc_cat[is.na(tabla$elc_cat)] <- 0 - tabla$bioclim_clust[is.na(tabla$bioclim_clust)] <- 0 - tabla$geophys_clust[is.na(tabla$geophys_clust)] <- 0 - tabla$edaph_clust[is.na(tabla$edaph_clust)] <- 0 - - # Creating ELC raster map - mapaelc0 <- raster(matrix(nrow = (dim(bioclim_ras)[1]), - ncol = (dim(bioclim_ras)[2])), - template = bioclim_ras) - # HJ: unused argument: template ??? wrong type? 
- mapaelc1 <- rasterize(cbind(tabla[, 2], tabla[, 3]), - mapaelc0, - field = tabla$elc_cat) - mapaelc2 <- rasterize(cbind(tabla[, 2], tabla[, 3]), - mapaelc0, - field = tabla$bioclim_clust) - mapaelc3 <- rasterize(cbind(tabla[, 2], tabla[, 3]), - mapaelc0, - field = tabla$geophys_clust) - mapaelc4 <- rasterize(cbind(tabla[, 2], tabla[, 3]), - mapaelc0, - field = tabla$edaph_clust) - - # 11. Characterizing each final cluster by the original variables (not rescaled) - # 12. Exporting tables and maps in different formats - - # HJ: from here on things start to go wrong, more GIS experience needed - - crs(mapaelc1) <- "+proj=longlat" - crs(mapaelc2) <- "+proj=longlat" - crs(mapaelc3) <- "+proj=longlat" - crs(mapaelc4) <- "+proj=longlat" - - writeRaster(mapaelc1, - filename = paste(resultados, "/mapa_elc_", - pais, ".grd", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc1, - filename = paste(resultados, "/mapa_elc_DIVA_", - pais, ".grd", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc1, filename = paste(resultados, "/mapa_elc_", - pais, ".tif", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc2, - filename = paste(resultados, "/mapa_bioclimatico_", - pais, ".grd", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc2, - filename = paste(resultados, "/mapa_bioclimatico_", - pais, ".tif", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc3, - filename = paste(resultados, "/mapa_geofisico_", - pais, ".grd", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc3, - filename = paste(resultados, "/mapa_geofisico_", - pais, ".tif", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc4, - filename = paste(resultados, "/mapa_edafico_", - pais, ".grd", sep = ""), - overwrite = T, datatype = "FLT4S") - - writeRaster(mapaelc4, - filename = paste(resultados, "/mapa_edafico_", - pais, ".tif", sep = ""), - overwrite = T, datatype = "FLT4S") - 
- writeRaster(mapaelc2, - filename = "testibioclim.tif", - overwrite = T, datatype = "FLT4S") - - # HJ: above doesn't work, probably because of my limited GIS skills. Gave the error below: - - # Error in plot.window(...) : need finite 'ylim' values - # In addition: Warning messages: - # 1: In xy.coords(x, y, xlabel, ylabel, log) : NAs introduced by coercion - # 2: In min(x) : no non-missing arguments to min; returning Inf - # 3: In max(x) : no non-missing arguments to max; returning -Inf - - # HJ: script from here on creates tables with descriptive statistics of the environmental data - # HJ : creates a lot of tables, which ones of them are needed? - - # OBJETO SALIDA 3 - # tables of statistics for each component - tablabio <- data.frame(table(bioclim$BIOCLUST)) - colnames(tablabio)[1] <- "BIOCLIM_CAT" - tablabioclim <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], - by = list(bioclim$BIOCLUST), mean, na.rm = TRUE) - colnames(tablabioclim)[1] <- "BIOCLIM_CAT" - tablabioclim1 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], - by = list(bioclim$BIOCLUST), min, na.rm = TRUE) - colnames(tablabioclim1)[1] <- "BIOCLIM_CAT" - tablabioclim2 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], - by = list(bioclim$BIOCLUST), max, na.rm = TRUE) - colnames(tablabioclim2)[1] <- "BIOCLIM_CAT" - tablabioclim3 <- aggregate(bioclim[, 2:(length(bioclim[1, ]) - 1)], - by = list(bioclim$BIOCLUST), sd, na.rm = TRUE) - colnames(tablabioclim3)[1] <- "BIOCLIM_CAT" - tablabioclim <- merge(tablabio, tablabioclim, by = "BIOCLIM_CAT") - tablabioclim <- merge(tablabioclim, tablabioclim1, - by = "BIOCLIM_CAT", suffixes = c(".media", ".min")) - tablabioclim2 <- merge(tablabioclim2, tablabioclim3, - by = "BIOCLIM_CAT", suffixes = c(".max", ".sd")) - tablabioclim <- merge(tablabioclim, tablabioclim2, - by = "BIOCLIM_CAT") - - write.table(tablabioclim, - file = paste(resultados, "/Estadist_BIOCLIM_", - pais, ".txt", sep = ""), sep = "\t", - row.names = FALSE, qmethod = "double") - - 
tablageo <- data.frame(table(geophys$GEOCLUST)) - colnames(tablageo)[1] <- "GEOPHYS_CAT" - tablageophys <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], - by = list(geophys$GEOCLUST), mean, na.rm = TRUE) - colnames(tablageophys)[1] <- "GEOPHYS_CAT" - tablageophys1 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], - by = list(geophys$GEOCLUST), min, na.rm = TRUE) - colnames(tablageophys1)[1] <- "GEOPHYS_CAT" - tablageophys2 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], - by = list(geophys$GEOCLUST), max, na.rm = TRUE) - colnames(tablageophys2)[1] <- "GEOPHYS_CAT" - tablageophys3 <- aggregate(geophys[, 2:(length(geophys[1, ]) - 1)], - by = list(geophys$GEOCLUST), sd, na.rm = TRUE) - colnames(tablageophys3)[1] <- "GEOPHYS_CAT" - tablageophys <- merge(tablageo, tablageophys, - by = "GEOPHYS_CAT") - tablageophys <- merge(tablageophys, tablageophys1, - by = "GEOPHYS_CAT", suffixes = c(".media", ".min")) - tablageophys2 <- merge(tablageophys2, tablageophys3, - by = "GEOPHYS_CAT", suffixes = c(".max", ".sd")) - tablageophys <- merge(tablageophys, tablageophys2, - by = "GEOPHYS_CAT") - - write.table(tablageophys, - file = paste(resultados, "/Estadist_GEOPHYS_", - pais, ".txt", sep = ""), sep = "\t", - row.names = FALSE, qmethod = "double") - - tablaeda <- data.frame(table(edaph$EDACLUST)) - colnames(tablaeda)[1] <- "EDAPH_CAT" - tablaedaph <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], - by = list(edaph$EDACLUST), mean, na.rm = TRUE) - colnames(tablaedaph)[1] <- "EDAPH_CAT" - tablaedaph1 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], - by = list(edaph$EDACLUST), min, na.rm = TRUE) - colnames(tablaedaph1)[1] <- "EDAPH_CAT" - tablaedaph2 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], - by = list(edaph$EDACLUST), max, na.rm = TRUE) - colnames(tablaedaph2)[1] <- "EDAPH_CAT" - tablaedaph3 <- aggregate(edaph[, 2:(length(edaph[1, ]) - 1)], - by = list(edaph$EDACLUST), sd, na.rm = TRUE) - colnames(tablaedaph3)[1] <- "EDAPH_CAT" - tablaedaph <- 
merge(tablaeda, tablaedaph, - by = "EDAPH_CAT") - tablaedaph <- merge(tablaedaph, tablaedaph1, - by = "EDAPH_CAT", suffixes = c(".media", ".min")) - tablaedaph2 <- merge(tablaedaph2, tablaedaph3, - by = "EDAPH_CAT", suffixes = c(".max", ".sd")) - tablaedaph <- merge(tablaedaph, tablaedaph2, - by = "EDAPH_CAT") - - write.table(tablaedaph, - file = paste(resultados, "/Estadist_EDAPH_", - pais, ".txt", sep = ""), - sep = "\t", row.names = FALSE, qmethod = "double") - write.table(tabla, - file = paste(resultados, "/Tabla_ELC_celdas_", - pais, ".txt", sep = ""), sep = "\t", - row.names = FALSE, qmethod = "double") - - if (any(unique(tabla$BIOCLUST) == 0)) { - N_bioclust <- length(unique(tabla$BIOCLUST)) - 1 - } - if (all(unique(tabla$BIOCLUST) > 0)) { - N_bioclust <- length(unique(tabla$BIOCLUST)) - } - if (any(unique(tabla$GEOCLUST) == 0)) { - N_geoclust <- length(unique(tabla$GEOCLUST)) - 1 - } - if (all(unique(tabla$GEOCLUST) > 0)) { - N_geoclust <- length(unique(tabla$GEOCLUST)) - } - if (any(unique(tabla$EDACLUST) == 0)) { - N_edaclust <- length(unique(tabla$EDACLUST)) - 1 - } - if (all(unique(tabla$EDACLUST) > 0)) { - N_edaclust <- length(unique(tabla$EDACLUST)) - } - if (any(unique(tabla$ELC_CAT) == 0)) { - N_ELC_CAT <- length(unique(tabla$ELC_CAT)) - 1 - } - if (all(unique(tabla$ELC_CAT) > 0)) { - N_ELC_CAT <- length(unique(tabla$ELC_CAT)) - } - NCATS <- as.data.frame(cbind(N_ELC_CAT, N_bioclust, N_geoclust, N_edaclust)) - - # OBJETO SALIDA 4 - write.table(NCATS, - file = paste(resultados, "/numero_categorias_", - pais, ".txt", sep = ""), sep = "\t", - row.names = FALSE, qmethod = "double") - - ## Obtain descriptive statistics para cada categor?a ELc - nbioclim <- length(bioclimv) - ngeophys <- vector(mode = "numeric", length = 1) - for (i in 1:1) { - ngeophys <- ifelse(all(latitud, longitud), - length(geophysv) + 2, - ifelse(any(latitud, longitud), - length(geophysv) + 1, length(geophysv))) - } - nedaph <- length(edaphv) - tabla <- tabla[, c(ncol(tabla), - 
ncol(tabla) - 1, - 4:(3 + nbioclim), - (5 + nbioclim):(4 + nbioclim + ngeophys), - (6 + nbioclim + ngeophys):(ncol(tabla) - 3))] - media <- aggregate(tabla[, c(-1, -2)], - by = list(tabla$ELC_CAT), FUN = "mean") - colnames(media)[1] <- "ELC_CAT" - mediana <- aggregate(tabla[, c(-1, -2)], - by = list(tabla$ELC_CAT), FUN = "median") - colnames(mediana)[1] <- "ELC_CAT" - maximo <- aggregate(tabla[, c(-1, -2)], - by = list(tabla$ELC_CAT), FUN = "max") - colnames(maximo)[1] <- "ELC_CAT" - minimo <- aggregate(tabla[, c(-1, -2)], - by = list(tabla$ELC_CAT), FUN = "min") - colnames(minimo)[1] <- "ELC_CAT" - desvest <- aggregate(tabla[, c(-1, -2)], - by = list(tabla$ELC_CAT), FUN = "sd") - colnames(desvest)[1] <- "ELC_CAT" - - # Tabla de unificaci?n de estad?sticos - estad <- merge(media, mediana, by = "ELC_CAT", suffixes = c(".media", ".mediana")) - estad1 <- merge(maximo, minimo, by = "ELC_CAT", suffixes = c(".maximo", ".minimo")) - estad <- merge(estad, estad1, by = "ELC_CAT") - aaa <- "sd" - for (i in 2:length(desvest[1, ])) { - colnames(desvest)[i] <- paste(colnames(desvest)[i], aaa, sep = ".") - } - estad <- merge(estad, desvest, by = "ELC_CAT") - # Export descriptive statistics - write.table(estad, file = paste(resultados, "/Estadist_ELC_", pais, ".txt", sep = ""), sep = "\t", row.names = FALSE, qmethod = "double") - - # Export table de equivalencias Combinaci?n (Bio-Geo-Eda) y categoriesfinal map - colnames(elc)[2] <- "ELC_CAT" - write.table(estad, file = paste(resultados, "/Combi_ELC_", pais, ".txt", sep = ""), sep = "\t", row.names = FALSE, qmethod = "double") - } -} diff --git a/R_scripts/SHARED-Data_CAPFITOGEN.R b/R_scripts/SHARED-Data_CAPFITOGEN.R deleted file mode 100644 index 5055c86..0000000 --- a/R_scripts/SHARED-Data_CAPFITOGEN.R +++ /dev/null @@ -1,408 +0,0 @@ -#' ####################################################################### # -#' PROJECT: [BioDT CWR - CAPFITOGEN] -#' CONTENTS: -#' - GBIF Data Download Functionality -#' - Bioclimatic Variable 
Climatology creation for qsoil1 and qsoil2 combined -#' - Edaphic data download (HJ: not finished!) -#' - Geophycical data download (HJ: not finished!) -#' DEPENDENCIES: -#' - To do: -#' AUTHOR: [Erik Kusch, Heli Juottonen, Eva Lieungh] -#' Capfitogen credit: Parra-Quijano et al. 2021, -#' https://repositorio.unal.edu.co/handle/unal/85787 -#' ####################################################################### # - -# HJ: first sections of script are taken from ModGP, only small modifications -# EL: how can we avoid code duplication from the MoDGP SHARED-Data script? -# need to work on this to find a good solution.Because the code here mostly just defines functions but does not run them, I think its OK to merge with the main SHARED-Data.R instead of duplicating. - -# GBIF DOWNLOAD FUNCTION -------------------------------------------------- -# queries download from GBIF, handles and cleans data, returns SF MULTIPOINT object and GBIF download metadata -FUN.DownGBIF <- function( - ## species name as character for whose genus data is to be downloaded - species = NULL, - ## where to store the data - Dir = getwd(), - ## whether the download should be forced despite local data already existing - Force = TRUE, - ## specify to get whole GENUS of supplied species (ModGP), - ## or only the species (Capfitogen) - Mode = "Capfitogen", - ## an integer, 1 = sequential; always defaults to sequential when Mode == "Capfitogen" - parallel = 1 -){ - ## Preparing species name identifiers - input_species <- species - - ## Focusing on Genus-part of the name if Mode is set to ModGP - if(Mode == "ModGP"){ - species <- strsplit(input_species, " ")[[1]][1] - } - - ## Filename and data presence check - FNAME <- file.path(Dir, paste0(species, ".RData")) - if(!Force & file.exists(FNAME)){ - save_ls <- loadObj(FNAME) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. 
If you wish to override the present data, please specify Force = TRUE") - return(specs_ls) - } - - ## Function start - message("Starting GBIF data retrieval") - ## GBIF ID Query ---- - ## GBIF query - if(Mode == "ModGP"){ - message(paste("## Resolving", species, "at genus level")) - RankGBIF <- "genus" - } - if(Mode == "Capfitogen"){ - message(paste("## Resolving", species, "at species level")) - RankGBIF <- "species" - } - GBIF_match <- name_backbone(name = species, - rank = RankGBIF, kingdom = "plante") - - ## Extracting taxonkey - tax_ID <- ifelse(GBIF_match$rank != toupper(RankGBIF), NA, - GBIF_match$usageKey[GBIF_match$rank == toupper(RankGBIF)]) - - ## checking GBIF match success - if(is.na(tax_ID)){ - print(GBIF_match) - stop("The provided species name could not be resolved at GBIF. Please find the match at GBIF above.") - } - - ## GBIF Data Query ---- - message(paste("## Downloading", - occ_count(taxonKey = tax_ID, - hasCoordinate = TRUE, - occurrenceStatus = "PRESENT"), - "GBIF records")) - occ_down <- occ_download(pred("taxonKey", tax_ID), - pred("hasCoordinate", TRUE), - pred("occurrenceStatus", "PRESENT"), - format = "SIMPLE_CSV") - curlopts <- list(http_version = 2) # needed on Mac to avoid HTTP issues in next line (see here: https://github.com/ropensci/rgbif/issues/579) - occ_meta <- occ_download_wait(occ_down, status_ping = 30, - curlopts = list(), quiet = FALSE) # wait for download to finish - occ_get <- occ_download_get(occ_down, path = Dir) # download data - curlopts <- list(http_version = 1) # resetting this to not affect other functions - - ## Data Loading ---- - message("Loading GBIF Data into R") - occ_occ <- occ_download_import(occ_get) # import downloaded data - - ## Manipulating GBIF Data ---- - ### Resolving Common Issues ---- - message("Resolving Common Data Issues") - ## removing bases of record that may not be linked to coordinates properly - occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", "MATERIAL_CITATION"), ] 
- ## removing highly uncertain locations, i.e., anything more than 1km in uncertainty - occ_occ <- occ_occ[occ_occ$coordinateUncertaintyInMeters <= 1000, ] - ## removing rounded coordinates - occ_occ <- occ_occ[-grep(occ_occ$issue, pattern = "COORDINATE_ROUNDED"), ] - ## removing empty species rows - occ_occ <- occ_occ[occ_occ$species != "" & !is.na(occ_occ$species), ] - - ### Parallel Set-Up ---- - if(parallel == 1 | Mode == "Capfitogen"){parallel <- NULL} # no parallelisation - ### This needs to be commented back in when wanting to run code below directly - if(!is.null(parallel) && !RUNNING_ON_LUMI){ # parallelisation - message("Registering cluster for parallel processing") - print("Registering cluster") - parallel <- parallel::makeCluster(parallel) - on.exit(stopCluster(parallel)) - print("R Objects loading to cluster") - parallel::clusterExport(parallel, varlist = c( - "package_vec", "install.load.package", - "occ_occ" - ), envir = environment()) - print("R Packages loading on cluster") - clusterpacks <- clusterCall(parallel, function() sapply(package_vec, install.load.package)) - } - - ### Making SF for species ---- - message("Extracting species-level data into MULTIPOINT objects") - GBIF_specs <- unique(occ_occ$species) - - ## Making a list of spatialfeatures MULTIPOINT objects denoting unique locations of presence per species - specs_ls <- pblapply(GBIF_specs, - cl = parallel, - FUN = function(x){ - spec_df <- occ_occ[occ_occ$species == x, ] - spec_uniloca <- occ_occ[occ_occ$species == x, c("species", "decimalLatitude", "decimalLongitude")] - spec_df <- spec_df[!duplicated(spec_uniloca), - c("gbifID", "datasetKey", "occurrenceID", "species", "scientificName", "speciesKey", - "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters", - "eventDate", "basisOfRecord", "recordNumber", "issue") - ] - spec_df$presence <- 1 - st_as_sf(spec_df, coords = c("decimalLongitude", "decimalLatitude")) - }) - names(specs_ls) <- GBIF_specs - - ## Making list into 
single data frame when Capfitogen mode is toggled on. - - # HJ: section below to create a Capfitogen data frame not used - # species data included as the sf file created above - - if(Mode == "Capfitogen"){ - specs_ls <- specs_ls[[1]] - # ## create capfitogen data frame - # CapfitogenColumns <- c("INSTCODE", "ACCENUMB", "COLLNUMB", "COLLCODE", "COLLNAME", "COLLINSTADDRESS", "COLLMISSID", "GENUS", "SPECIES", "SPAUTHOR", "SUBTAXA", "SUBTAUTHOR", "CROPNAME", "ACCENAME", "ACQDATE", "ORIGCTY", "NAMECTY", "ADM1", "ADM2", "ADM3", "ADM4", "COLLSITE", "DECLATITUDE", "LATITUDE", "DECLONGITUDE", "LONGITUDE", "COORDUNCERT", "COORDDATUM", "GEOREFMETH", "ELEVATION", "COLLDATE", "BREDCODE", "BREDNAME", "SAMPSTAT", "ANCEST", "COLLSRC", "DONORCODE", "DONORNAME", "DONORNUMB", "OTHERNUMB", "DUPLSITE", "DUPLINSTNAME", "STORAGE", "MLSSTAT", "REMARKS") - # CapfitogenData <- data.frame(matrix(data = NA, nrow = nrow(specs_ls), ncol = length(CapfitogenColumns))) - # colnames(CapfitogenData) <- CapfitogenColumns - # ## Create unique rownames for the ACCENUMB - # CapfitogenData$ACCENUMB <- seq(from = 1, to = nrow(CapfitogenData), by = 1) - # ## Add in the species, latitude and longitude (nothing else at this point) - # CapfitogenData$SPECIES <- specs_ls$species - # CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] - # CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] - # specs_ls <- CapfitogenData - } - - ### Returning Object to Disk and Environment ---- - #save_ls <- st_write(specs_ls, Species_sf) - #save_ls <- st_write(specs_ls) - save_ls <- list(meta = occ_meta, - occs = specs_ls - #, - #json = JSON_ls - ) - - saveObj(specs_ls, file = FNAME) - unlink(occ_get) # removes .zip file - - # HJ: section below commented out because it wasn't in use in my tests - - ### JSON RO-CRATE creation ---- - # JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") - # - # JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - # JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- 
paste0("https://www.gbif.org/species/", tax_ID) # gbif ID - # JSON_ls$`@graph`[[2]]$creator$`@id` <- c(JSON_ls$`@graph`[[2]]$creator$`@id`, as.character(options("gbif_email"))) - # JSON_ls$`@graph`[[2]]$author$`@id` <- c(JSON_ls$`@graph`[[2]]$author$`@id`, as.character(options("gbif_email"))) - # JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() - # JSON_ls$`@graph`[[2]]$name <- paste("Cleaned GBIF occurrence records for", RankGBIF, species) - # JSON_ls$`@graph`[[2]]$keywords <- list("GBIF", "Occurrence", "Biodiversity", "Observation", Mode) - # JSON_ls$`@graph`[[2]]$description <- paste(Mode, "input data for", species) - # - # JSON_ls$`@graph`[[3]]$name <- basename(FNAME) - # JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) - # JSON_ls$`@graph`[[3]]$encodingFormat <- "application/RData" - # JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - # - # JSON_ls$`@graph`[[4]]$name <- c(as.character(options("gbif_user")), JSON_ls$`@graph`[[4]]$name) - # JSON_ls$`@graph`[[4]]$`@id` <- c(JSON_ls$`@graph`[[4]]$`@id`, as.character(options("gbif_email"))) - # JSON_ls$`@graph`[[4]]$`@type` <- c(JSON_ls$`@graph`[[4]]$`@type`, "Organisation") - # - # JSON_ls$`@graph`[[5]]$agent$`@id` <- c(JSON_ls$`@graph`[[5]]$agent$`@id`, as.character(options("gbif_email"))) - # JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://github.com/BioDT/uc-CWR" - # - # con <- file(file.path(Dir, paste0(species, ".json"))) - # writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) - # close(con) - - save_ls -} - -# Downloading in environmental variable data (NOT READY) -# 1. Bioclimatic: FUN.DownBV (below as in ModGP; not used in my tests, couldn't figure out how) -# 2. Edaphic: FUN.DownEV (missing, just a rough draft) -# 3. 
Geophysical: FUN.DownGV (missing, just a rough draft) - - -# BIOCLIMATIC VARIABLE DOWNLOAD -------------------------------------------- -# queries and downloads and computes bioclimatic variables at global extent from ERA5-Land, Water availability is based on soil moisture level 1 (0-7cm) and 2 (7-28cm) -FUN.DownBV <- function(T_Start = 1970, # what year to begin climatology calculation in T_End = 2000, # what year to end climatology calculation in - Dir = getwd(), # where to store the data output on disk - Force = FALSE # do not overwrite already present data -){ - FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) - - if(file.exists(FNAME)){ - BV_ras <- stack(FNAME) - names(BV_ras) <- paste0("BIO", 1:19) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") - return(BV_ras) - } - - Month_seq <- seq(as.Date(paste0(T_Start, "-01-01")), as.Date(paste0(T_End, "-12-31")), by = "month") - Month_seq <- strsplit(x = as.character(Month_seq), split = "-") - MonthsNeeded <- unlist(lapply(Month_seq, FUN = function(x){ - paste(x[1:2], collapse = "_") - })) - - ### Raw soil moisture level data ---- - #' We download raw soil moisture data for layers 1 (0-7cm) and 2 (7-28cm) separately. 
These are then summed up and used in the bioclimatic variable computation of KrigR - WaterPresent <- list.files(Dir, pattern = "volumetric_soil_water_layer_1") - AlreadyPresent <- length(unique(grep(paste(MonthsNeeded,collapse="|"), WaterPresent, value=TRUE))) - if(AlreadyPresent != length(Month_seq)){ - #### Downloading ---- - Qsoil1_ras <- download_ERA( - Variable = "volumetric_soil_water_layer_1", - DataSet = "era5-land", - DateStart = paste0(T_Start, "-01-01"), - DateStop = paste0(T_End, "-12-31"), - Dir = Dir, - FileName = "Qsoil1", - API_User = API_User, - API_Key = API_Key, - SingularDL = TRUE, - TimeOut = Inf - ) - - Qsoil2_ras <- download_ERA( - Variable = "volumetric_soil_water_layer_2", - DataSet = "era5-land", - DateStart = paste0(T_Start, "-01-01"), - DateStop = paste0(T_End, "-12-31"), - Dir = Dir, - FileName = "Qsoil2", - API_User = API_User, - API_Key = API_Key, - SingularDL = TRUE - ) - - #### Combining ---- - QSoilCombin_ras <- rast(Qsoil1_ras)+rast(Qsoil2_ras) - - #### Saving ---- - for(MonthSave_iter in 1:nlyr(QSoilCombin_ras)){ - FNAME <- file.path(Dir, paste0("volumetric_soil_water_layer_1-mean-", Month_seq[[MonthSave_iter]][1], "_", Month_seq[[MonthSave_iter]][2], "MonthlyBC.nc")) - terra::writeCDF(QSoilCombin_ras[[MonthSave_iter]], filename = FNAME, overwrite = TRUE) - } - - ### Deleting unnecessary files ---- - unlink(list.files(Dir, pattern = "Qsoil", full.names = TRUE)) - } - - ### Bioclimatic data ---- - if(file.exists(file.path(Dir, "Qsoil_BC.nc"))){ - BV_ras <- stack(file.path(Dir, "Qsoil_BC.nc")) - }else{ - BV_ras <- BioClim( - DataSet = "era5-land", - Water_Var = "volumetric_soil_water_layer_1", - Y_start = T_Start, - Y_end = T_End, - Dir = Dir, - Keep_Monthly = TRUE, - FileName = "Qsoil_BC.nc", - API_User = API_User, - API_Key = API_Key, - Cores = numberOfCores, - TimeOut = Inf, - SingularDL = FALSE - ) - } - - ### Masking ---- - Land_sp <- ne_countries(type = "countries", scale = "medium") - BV_ras <- crop(BV_ras, extent(Land_sp)) 
- BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) - BV_ras <- mask(BV_ras, BV_mask) - - ### Saving ---- - writeRaster(BV_ras, filename = FNAME, format = "CDF", overwrite = TRUE) - unlink(file.path(Dir, "Qsoil_BC.nc")) - names(BV_ras) <- paste0("BIO", 1:19) - - ### JSON RO-CRATE creation ---- - JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") - - JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" - JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() # tail(file.info(FNAME)$ctime) - JSON_ls$`@graph`[[2]]$name <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", "ECMWF", "Bioclimatic Variables", "Soil Moisture") - JSON_ls$`@graph`[[2]]$description <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - - JSON_ls$`@graph`[[3]]$name <- basename(FNAME) - JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) - JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - - JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://doi.org/10.1088/1748-9326/ac48b3" - - con <- file(file.path(Dir, paste0(tools::file_path_sans_ext(basename(FNAME)), ".json"))) - writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) - close(con) - - BV_ras -} - - -# HJ: new part: very rough draft for downloading edaphic and geophysical data -# missing all the important parts: proper data sources, functions -# where to set the variables for each category (bioclimatic, edaphic, geophysical)? 
- -# Edaphic data download - -# HJ: ONLY FOR TESTING PURPOSES: soildata package -# needed: data source and downloading commands for .nc files; where to set the selected variables - -FUN.DownEV <- function(arg1, arg2, arg3){ - - FNAME <- file.path(Dir.Data.Envir, "edaphic_data.nc") - - evarg <- c(arg1, arg2, arg3) - # evargs <- commandArgs(trailingOnly = TRUE) - #evarg <- c("soc", "silt") - edaph_ras <- soil_world(evarg, depth = 5, path = file.path(Dir.Data.Envir, "Edaphic")) - names(edaph_ras) <- evarg - - ### Masking ---- - # whole world - #Land_sp <- ne_countries(type = "countries", scale = "medium") - - # HJ: for testing/ to match previous Capfitogen tests: only Spain - # HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) - # please switch back to KrigR is wanted/needed - Land_sp <- ne_states("Spain") - edaph_ras <- crop(edaph_ras, terra::ext(Land_sp)) - edaph_ras <- terra::mask(edaph_ras, vect(Land_sp)) - - # BV_ras <- crop(BV_ras, extent(Land_sp)) - # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) - # BV_ras <- mask(BV_ras, BV_mask) - - ### Saving ---- - terra::writeCDF(edaph_ras, filename = FNAME, overwrite = TRUE) - unlink(file.path(Dir.Data.Envir, "Edaphic", "soil_world", "*.tif")) - - edaph_ras - -} - -# Geophysical data download - -# HJ: missing: data source, download function for .nc files, where to set the selected variables - -FUN.DownGV <- function(arg1, arg2){ - - FNAME <- file.path(Dir.Data.Envir, "geophys.nc") - - evarg <- c(arg1, arg2) - geophys_ras <- ??? 
file.path(Dir.Data.Envir, "Geophysical") - - # whole world - #Land_sp <- ne_countries(type = "countries", scale = "medium") - # HJ: for testing/ to match previous Capfitogen tests: only Spain - # HJ: this is an attempt to do the same thing with terra that was done with KrigR in ModGP (see below) - # please switch back to KrigR is wanted/needed - Land_sp <- ne_states("Spain") - geophys_ras <- crop(geophys_ras, terra::ext(Land_sp)) - geophys_ras <- terra::mask(geophys_ras, vect(Land_sp)) - - # BV_ras <- crop(BV_ras, extent(Land_sp)) - # BV_mask <- KrigR:::mask_Shape(base.map = BV_ras[[1]], Shape = Land_sp[,"name"]) - # BV_ras <- mask(BV_ras, BV_mask) - - ### Saving ---- - terra::writeCDF(geophys_ras, filename = FNAME, overwrite = TRUE) - unlink(file.path(Dir.Data.Envir, "Geophysical")) - - geophys_ras - -} - diff --git a/R_scripts/capfitogen.R b/R_scripts/capfitogen.R deleted file mode 100644 index b48f51c..0000000 --- a/R_scripts/capfitogen.R +++ /dev/null @@ -1,174 +0,0 @@ -# this script starts with a copy of @trossi's 'capfitogen' script and will serve -# as notes and tests for scripting. To be deleted when a full capfitogen pipeline -# is in place. -## Heli's handover notes -------------------------------------- -#' Capfitogen R script modifications for CWR BioDT project -#' Heli Juottonen, CSC (heli.juottonen@csc.fi) (please email if any questions!) -#' -#' Main script: capfitogen_master_061124.R -#' -#' sources the following scripts: -#' -#' 1. SHARED-Data.R -#' 1. downloading species data from GBIF: FUN.DownGBIF (from ModGP) -#' 2. downloading environmental data (.nc files): -#' FUN.DownBV, bioclimatic data (as in ModGP, modifications needed?) -#' FUN.DownEV, edaphic data (not ready, only a very rough draft) -#' FUN.DownGV, geophysical data (not ready, only a very rough draft) -#' -#' unclear: best way to obtain the data as .nc files? -#' unclear: where to define which specific variables downloaded for each category? 
-#' unclear: how and where and if to set the geographic area? -#' -#' outputs (=inputs for the next step): -#' sf file of species occurrence: Species_ls$occs -#' raster stacks: bioclim_ras, edaph_ras, geophys_ras -#' -#' 2. VarSelection.R -#' 1. selection of variables separately for each category (bioclimatic, edaphic, geophysical): -#' FUN.VarSelection -#' uses the vifcor approach as ModGP -#' -#' outputs (= inputs for the next step): -#' values of selected variables extracted from raster stacks: bioclim_ext, edaph_ext, geophys_ext -#' -#' 3. ELCMap.R -#' 1. clustering (kmeans/BIC): FUN.KmeansClust -#' run separately for each variable category (bioclimatic, edaphic, geophysical) -#' -#' outputs (= inputs for the next step): bioclim_cl, geophys_cl, edaph_cl -#' (coordinates, cluster membership, extracted environmental variable values) -#' -#' 2. creating maps (not ready, requires someone with proper spatial data R skills) -#' -#' -#------------------------------------------------- -#test downloading soil data from harmonized world soil database v2.0 (hwsd) -hwsd_path = file.path(getwd(), "Data", "Environment") -hwsd_zipfile = paste(hwsd_path, "/HWSD2.zip", sep = "") -url = "https://s3.eu-west-1.amazonaws.com/data.gaezdev.aws.fao.org/HWSD/HWSD2_RASTER.zip" -library(httr) -GET(url, write_disk(hwsd_zipfile)) -## 1 km (30 arc-second) resolution -unzip(hwsd_zipfile, - exdir = paste(hwsd_path, "/soil", sep = "")) -# test reading BIL file -hwsd_raster <- terra::rast("Data/Environment/soil/HWSD2.bil") -hwsd_raster -plot(hwsd_raster) -summary(hwsd_raster) -res(hswd_raster) -names(hwsd_raster[[1]]) -# aggregate to coarser resolution by a factor of 9 -# (bin 9x9 neighbouring pixels into one, and assign the bigger pixel the mean) -soil30 <- aggregate(hwsd_raster, fact = 9, fun = mean) -soil30 -plot(soil30) -# a plot is made, but of what? There is only one layer of values, and it's not obvoius to me what those values are... 
- -# testing with SoilGrids instead, looks there are more variables there -# see https://www.isric.org/explore/soilgrids/soilgrids-access -library(terra) -library(gdalUtilities) -#projection_string = '+proj=igh +lat_0=0 +lon_0=0 +datum=WGS84 +units=m +no_defs' # proj string for Homolosine projection, https://en.wikipedia.org/wiki/Goode_homolosine_projection - -soilGrids_url="/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" - - #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean - #' NB! Each global map occupies circa 5 GB! It takes a while to download. - #' bdod_0-5cm_mean.vrt # bdod=Bulk density of the fine earth fraction, cg/cm³ - #' cec_0-5cm_mean.vrt # cec = Cation Exchange Capacity of the soil, mmol(c)/kg - #' cfvo_0-5cm_mean.vrt # cfvo = Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - #' silt_0-5cm_mean.vrt # silt = Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg - #' clay_0-5cm_mean.vrt # clay = Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg - #' sand_0-5cm_mean.vrt # sand = Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg - #' nitrogen_0-5cm_mean.vrt # nitrogen = Total nitrogen (N) cg/kg - #' phh2o_0-5cm_mean.vrt # phh2o = Soil pH pHx10 - #' ocd_0-5cm_mean.vrt # ocd = Organic carbon density hg/m³ - #' ocs_0-30cm_mean.vrt # ocs = Organic carbon stocks t/ha - #' soc_0-5cm_mean.vrt # soc = Soil organic carbon content in the fine earth fraction dg/kg - #' - #' in addition, https://files.isric.org/soilgrids/latest/data/wrb/ - #' has maps of soil types, as estimated probability of occurrence per type. - #' MostProbable.vrt has the most probable soil type per gridcell. - #' - # gdal_translate() converts raster data between different formats. 
-soilGrids_data <- gdal_translate( - src_dataset = paste0(soilGrids_url,'ocs/ocs_0-30cm_mean.vrt'), - paste0(Dir.Data.Envir, - "/crop_roi_igh_r.tif"), - tr = c(2500,2500) # target resolution -) - -crop_roi_igh_r <- rast(paste0(Dir.Data.Envir, - "/crop_roi_igh_r.tif")) - -plot(crop_roi_igh_r) -crop_roi_igh_r -summary(crop_roi_igh_r) -# test downloading CAPFITOGEN scripts into R_Scripts - - -#------------------------------------------------- -# Main input file (pasaporte): -# LathyrusData-ForCapfitogen_27oct2023.txt (by Carrie) -# Filter only one species for testing: -# head -n 1 LathyrusData-ForCapfitogen_27oct2023.txt > LathyrusData-ForCapfitogen_27oct2023_niger_only.txt -# grep "Lathyrus niger" LathyrusData-ForCapfitogen_27oct2023.txt >> LathyrusData-ForCapfitogen_27oct2023_niger_only.txt - -# Global options -pasaporte_file <- "LathyrusData-ForCapfitogen_27oct2023_niger_only.txt" -country <- "World" -# resolution <- "Celdas 1x1 km aprox (30 arc-seg)" -# resolution <- "Celdas 5x5 km aprox (2.5 arc-min)" -resolution <- "celdas 20x20 km aprox (10 arc-min)" - -# Paths -results_dpath <- file.path(getwd(), "Resultados") -root_dpath <- file.path(getwd(), "CAPFITOGEN3") -param_dpath <- file.path(root_dpath, "scripts", "Parameters scripts (English)") -tools_dpath <- file.path(root_dpath, "scripts", "Tools Herramientas") - -dir.create(results_dpath) - -# We execute SelecVar and ELCMapas modules in order -# The structure of each module execution is: -# - execute the corresponding parameters file for default settings -# - override relevant settings (ruta etc.) using variables defined above -# - execute the correspoding analysis script (unless done already) -# Note! 
Scripts write to a common log file: CAPFITOGEN3/Error/process_info.txt - -#### SelecVar ############################# -message("SelecVar") -source(file.path(param_dpath, "Parameters_SelecVar_2021.R")) -file.copy(file.path(getwd(), pasaporte_file), file.path(root_dpath, "Pasaporte", pasaporte_file), overwrite=TRUE) -ruta <- root_dpath -pasaporte <- pasaporte_file -geoqual <- FALSE -pais <- country -resol1 <- resolution -resultados <- file.path(results_dpath, "SelecVar") -dir.create(resultados) -if (file.exists(file.path(resultados, "SelectedVariables_edaphic.xls"))) { - message("- skipping") -} else { - message("- executing") - source(file.path(tools_dpath, "SelectVar.R")) - # Prevent crosstalk with the next step - rm(geophys) -} - -#### ELCmapas ############################# -message("ELCmapas") -source(file.path(param_dpath, "Parameters_ELCmapas_2021.R")) -ruta <- root_dpath -pais <- country -resol1 <- resolution -resultados <- file.path(results_dpath, "ELCmapas") -dir.create(resultados) -if (file.exists(file.path(resultados, "Producto.RData"))) { - message("- skipping") -} else { - message("- executing") - source(file.path(tools_dpath, "ELCmapas.R")) -} From 4eaacb71d54b2d6ab98b027d279de8cdf5e0e1c3 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 25 Feb 2025 16:25:23 +0100 Subject: [PATCH 28/72] working Complementa tool without areas analysis --- capfitogen_master.R | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index f68d993..b4e39dc 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -340,7 +340,7 @@ celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by resol1 <- "celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) -areas <- TRUE # If areas=TRUE, a complementary analysis will 
be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. +areas <- FALSE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. WDPA <- TRUE #Only applies if areas=TRUE propio <- FALSE # =own, alternative user defined file instead of WDPA nombre <- "nameOfAlternativeShapefile" #Only applies if propio=TRUE, name of alternative shapefile @@ -370,8 +370,30 @@ message("running Capfitogen Complementa tool for conservation areas") setwd(Dir.Base) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) -#' stops at line 173 mapaelcf if loop +#' works if areas = FALSE ! 
setwd(Dir.Base) - -# visualise output \ No newline at end of file +### quick visualisation ---- +complementa_map <- rast( + file.path(Dir.Results.Complementa, + "AnalisisCeldas_CellAnalysis/Complementa_map.tif")) +plot(complementa_map) +complementa_map[is.nan(values(complementa_map))] <- NA +non_zero_mask <- mask(complementa_map, + !is.na(complementa_map)) +complementa_points <- as.points(non_zero_mask, na.rm = TRUE) +plot(complementa_points) + +library(maps) + +map( + 'world', + col = "grey", + fill = TRUE, + bg = "white", + lwd = 0.05, + mar = rep(0, 4), + border = 0, + ylim = c(-80, 80) +) +points(complementa_points) From c31739870de8689e7c50ed1e4d926e942614d844 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 26 Feb 2025 18:24:43 +0100 Subject: [PATCH 29/72] draft protected areas download --- .gitignore | 4 +- Data/Capfitogen/Placeholder.rtf | 8 --- capfitogen_master.R | 115 ++++++++++++++++++++++++-------- 3 files changed, 88 insertions(+), 39 deletions(-) delete mode 100644 Data/Capfitogen/Placeholder.rtf diff --git a/.gitignore b/.gitignore index 2da08c9..01f9310 100644 --- a/.gitignore +++ b/.gitignore @@ -78,8 +78,8 @@ results/ # Downloads R_scripts/ELCmapas.R -CAPFITOGEN3/ capfitogen-main.zip Capfitogen-main/ -scripts/ +Data/Capfitogen/ + hq diff --git a/Data/Capfitogen/Placeholder.rtf b/Data/Capfitogen/Placeholder.rtf deleted file mode 100644 index 3486b7c..0000000 --- a/Data/Capfitogen/Placeholder.rtf +++ /dev/null @@ -1,8 +0,0 @@ -{\rtf1\ansi\ansicpg1252\cocoartf2759 -\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -{\*\expandedcolortbl;;} -\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0 -\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 - -\f0\fs24 \cf0 Placeholder to index directory on GitHub} \ No newline at end of file diff --git a/capfitogen_master.R b/capfitogen_master.R index 
b4e39dc..d415d7d 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -158,7 +158,7 @@ geophysical_variables <- FUN.DownGV( ## Download CAPFITOGEN scripts ------------------------------------------------ # download and unzip CAPFITOGEN repository if (!file.exists("capfitogen-main.zip")) { - download.file(url = "https://github.com/HMauricioParra/Capfitogen/archive/refs/heads/main.zip", + download.file(url = "https://github.com/evalieungh/Capfitogen/archive/refs/heads/main.zip", destfile = "capfitogen-main.zip") unzip(zipfile = "capfitogen-main.zip") } @@ -187,22 +187,90 @@ write.table(Species_ls[["occs"]], pasaporte_file_name), sep = "\t",) -pasaportest <- read.table("Capfitogen-main/Pasaporte/Lathyrus_angulatus.txt", - header = TRUE) - ### Download protected areas ---- #' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za -#' download shapefiles for protected areas to overlay with Complementa tool, -#' from The World Database on Protected Areas -#' UNEP-WCMC and IUCN (2025), Protected Planet: The World Database on Protected -#' Areas (WDPA)[On-line], [February 2025], Cambridge, UK: UNEP-WCMC and IUCN. -#' Available at: https://doi.org/10.34892/6fwd-af11 -#' NB! not working, no direct download link found -wdpa_url = "https://data-gis.unep-wcmc.org/server/rest/services/ProtectedSites/The_World_Database_of_Protected_Areas/FeatureServer/1" +#' download shapefiles for protected areas to overlay with Complementa tool: +#' UNEP-WCMC and IUCN (2025), Protected Planet: +#' The World Database on Protected Areas (WDPA) [Online], February 2025, +#' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. 
+wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip" wdpa_destination = file.path(Dir.Data.Capfitogen, - "wdpa") -download.file(url = wdpa_url, - destfile = wdpa_destination) + "WDPA_Feb2025_Public_shp.zip") +# if the file isn't there already, download it +if (!file.exists(wdpa_destination)) { + message("downloading zipped WDPA shapefiles, ca 4GB") + # set long timeout to avoid interrupting download + options(timeout = 1000) + # download the zipped files + download.file(url = wdpa_url, + destfile = wdpa_destination, + cacheOK = FALSE) + # unzip files + message(paste("unzipping WDPA shapefiles to", Dir.Data.Capfitogen)) + unzip(zipfile = wdpa_destination, + exdir = Dir.Data.Capfitogen) + # unzip split shapefile downloads + message("unzipping shapefiles split in download") + wdpa_path <- file.path(Dir.Data.Capfitogen, "wdpa") + shapefile_names <- c( + "WDPA_Feb2025_Public_shp-points.cpg", + "WDPA_Feb2025_Public_shp-points.dbf", + "WDPA_Feb2025_Public_shp-points.prj", + "WDPA_Feb2025_Public_shp-points.shp", + "WDPA_Feb2025_Public_shp-points.shx", + "WDPA_Feb2025_Public_shp-polygons.cpg", + "WDPA_Feb2025_Public_shp-polygons.dbf", + "WDPA_Feb2025_Public_shp-polygons.prj", + "WDPA_Feb2025_Public_shp-polygons.shp", + "WDPA_Feb2025_Public_shp-polygons.shx") + shapefile_paths <- file.path(wdpa_path, + shapefile_names) + for (i in 0:2) { + # define name of the current directory to be unzipped + zipfilename <- + file.path(Dir.Data.Capfitogen, + paste0("WDPA_Feb2025_Public_shp_", i, ".zip")) + # unzip the directory containing shapefiles + unzip(zipfile = zipfilename, + exdir = wdpa_path) + # rename shapefiles with numbers to prevent overwriting them + new_shapefile_names <- file.path(wdpa_path, + paste0(i, "_", + shapefile_names)) + file.rename(from = shapefile_paths, + to = new_shapefile_names) + } + # delete unnecessary files + files_to_keep <- c(wdpa_path, + wdpa_destination, + file.path(Dir.Data.Capfitogen, + "WDPA_sources_Feb2025.csv")) 
+ files_to_delete <- + list.files(Dir.Data.Capfitogen, + full.names = TRUE)[list.files(Dir.Data.Capfitogen, + full.names = TRUE) %nin% files_to_keep] + file.remove(files_to_delete, + recursive = TRUE) +} + +# merge parts into one global shapefile +wdpa_polygon_shapefiles <- + substr(unique(sub("\\..*", "", + list.files(wdpa_path)[grep(pattern = "polygon", + x = all_wdpa_shapefiles)])), + 3, 34) +shapefile_list <- list() +for (i in 0:2) { + layer_name = paste0(i, "_", wdpa_polygon_shapefiles) + shapefile_list[[i + 1]] <- + read_sf(dsn = wdpa_path, layer = layer_name) +} + +wdpa <- do.call(rbind, shapefile_list) + +# save global wdpa +st_write(wdpa, + file.path(wdpa_path, "global_wdpa_polygons.shp")) ## Variable selection --------------------------------------------------------- # combine variables @@ -279,9 +347,6 @@ iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calcul # run the script message("Clustering and creating maps") -##' NB! Change made in capfitogen script: -##' replaced 'extract' with 'raster::extract' -##' (some other package masked it and caused an error) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) setwd(Dir.Base) @@ -340,9 +405,9 @@ celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by resol1 <- "celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) -areas <- FALSE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. 
-WDPA <- TRUE #Only applies if areas=TRUE -propio <- FALSE # =own, alternative user defined file instead of WDPA +areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. +WDPA <- FALSE #Only applies if areas=TRUE +propio <- TRUE # =own, alternative user defined file instead of WDPA nombre <- "nameOfAlternativeShapefile" #Only applies if propio=TRUE, name of alternative shapefile campo <- "objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. @@ -357,16 +422,8 @@ datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the data0elc <- FALSE #Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) } -message("running Capfitogen Complementa tool for conservation areas") - -#' NB! Manually copied the script into the folder, as it is missing on GH... -#' Changed in the script: -#' - replaced 'extract' with 'raster::extract' -#' - line ~576 tabla_especies<-puntosorigshp@data -#' is a problem. 
"no applicable method for `@` applied to an object of class "sf"" -#' replaced it with as.data.frame(puntosorigshp) -#' # run the script +message("running Capfitogen Complementa tool for conservation areas") setwd(Dir.Base) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) From 34fe86768acd3a779d23c36d9bdfc706f0a6366e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 26 Feb 2025 19:27:28 +0100 Subject: [PATCH 30/72] update annotation --- capfitogen_master.R | 57 ++++++++++++++++++++++++--------------------- 1 file changed, 31 insertions(+), 26 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index d415d7d..75f67ac 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -105,15 +105,18 @@ Species_ls <- FUN.DownGBIF( ##' Temporal coverage: January 1950 to present ? https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview message("Downloading new or loading existing 19 BioClim bioclimatic variables") bioclim_variables <- FUN.DownBV( - T_Start = 1999, # what year to begin climatology calculation in - T_End = 1999, # what year to end climatology calculation in + T_Start = 1999, # what year to begin climatology calculation? + T_End = 1999, # what year to end climatology calculation? Dir = Dir.Data.Envir, # where to store the data output on disk Force = FALSE # do not overwrite already present data ) -bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) +# or use an existing data set (from Erik?) for testing... 
+# bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) + +# make sure the BioClim data have the correct names BioClim_names <- c( - ## BioClim variable names, see https://www.worldclim.org/data/bioclim.html + # see https://www.worldclim.org/data/bioclim.html "BIO1_Annual_Mean_Temperature", "BIO2_Mean_Diurnal_Range", "BIO3_Isothermality", @@ -174,8 +177,9 @@ puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) ### create 'pasaporte' ---- -#' pasaporte file uses Darwincore names? -#' So it should be OK to use the occurrences from the GBIF download directly. +#' Capfitogen uses a file named "pasaporte" with species/taxa occurrences. +#' The file uses Darwincore names, so it should be OK to use the occurrences +#' from the GBIF download directly. #' Place it in the right folder so Capfitogen can find it: pasaporte_file_name = paste0(sub(pattern = " ", replacement = "_", @@ -251,27 +255,28 @@ if (!file.exists(wdpa_destination)) { full.names = TRUE) %nin% files_to_keep] file.remove(files_to_delete, recursive = TRUE) + + # merge parts into one global shapefile + wdpa_polygon_shapefiles <- + substr(unique(sub("\\..*", "", + list.files(wdpa_path)[grep(pattern = "polygon", + x = all_wdpa_shapefiles)])), + 3, 34) + shapefile_list <- list() + for (i in 0:2) { + layer_name = paste0(i, "_", wdpa_polygon_shapefiles) + shapefile_list[[i + 1]] <- + read_sf(dsn = wdpa_path, layer = layer_name) + } + message("combining parts of the WDPA shapefile. This can take a while ---") + wdpa <- do.call(rbind, shapefile_list) + + # save complete wdpa + message("Complete WDPA successfully combined. 
Saving it as global_wdpa_polygons.shp") + st_write(wdpa, + file.path(wdpa_path, "global_wdpa_polygons.shp")) } -# merge parts into one global shapefile -wdpa_polygon_shapefiles <- - substr(unique(sub("\\..*", "", - list.files(wdpa_path)[grep(pattern = "polygon", - x = all_wdpa_shapefiles)])), - 3, 34) -shapefile_list <- list() -for (i in 0:2) { - layer_name = paste0(i, "_", wdpa_polygon_shapefiles) - shapefile_list[[i + 1]] <- - read_sf(dsn = wdpa_path, layer = layer_name) -} - -wdpa <- do.call(rbind, shapefile_list) - -# save global wdpa -st_write(wdpa, - file.path(wdpa_path, "global_wdpa_polygons.shp")) - ## Variable selection --------------------------------------------------------- # combine variables all_predictors <- c(bioclim_variables, @@ -408,7 +413,7 @@ nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from m areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. WDPA <- FALSE #Only applies if areas=TRUE propio <- TRUE # =own, alternative user defined file instead of WDPA -nombre <- "nameOfAlternativeShapefile" #Only applies if propio=TRUE, name of alternative shapefile +nombre <- "global_wdpa_polygons.shp" #Only applies if propio=TRUE, name of alternative shapefile campo <- "objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. 
The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. coveran <- TRUE # if coveran=TRUE a coverage analysis will be generated for the network of protected areas and a folder called CoverageAnalysis should appear in the results within the resultados para areas folder From 72060b070260668c39ea075f29a73213d5fbc2ba Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 27 Feb 2025 09:09:34 +0100 Subject: [PATCH 31/72] fix paths wdpa --- R_scripts/SHARED-Data.R | 4 ---- capfitogen_master.R | 18 ++++++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 0696fd3..933100a 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -700,10 +700,6 @@ FUN.DownGV <- } } - - - - # WGS84 = EPSG:4326 ## Download digital elevation model (DEM) from ##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled diff --git a/capfitogen_master.R b/capfitogen_master.R index 75f67ac..afbdbc7 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -198,7 +198,7 @@ write.table(Species_ls[["occs"]], #' The World Database on Protected Areas (WDPA) [Online], February 2025, #' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. 
wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip" -wdpa_destination = file.path(Dir.Data.Capfitogen, +wdpa_destination = file.path(Dir.Capfitogen.WDPA, "WDPA_Feb2025_Public_shp.zip") # if the file isn't there already, download it if (!file.exists(wdpa_destination)) { @@ -210,12 +210,12 @@ if (!file.exists(wdpa_destination)) { destfile = wdpa_destination, cacheOK = FALSE) # unzip files - message(paste("unzipping WDPA shapefiles to", Dir.Data.Capfitogen)) + message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) unzip(zipfile = wdpa_destination, - exdir = Dir.Data.Capfitogen) + exdir = Dir.Capfitogen.WDPA) # unzip split shapefile downloads message("unzipping shapefiles split in download") - wdpa_path <- file.path(Dir.Data.Capfitogen, "wdpa") + wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") shapefile_names <- c( "WDPA_Feb2025_Public_shp-points.cpg", "WDPA_Feb2025_Public_shp-points.dbf", @@ -247,23 +247,25 @@ if (!file.exists(wdpa_destination)) { # delete unnecessary files files_to_keep <- c(wdpa_path, wdpa_destination, - file.path(Dir.Data.Capfitogen, + file.path(Dir.Capfitogen.WDPA, "WDPA_sources_Feb2025.csv")) files_to_delete <- - list.files(Dir.Data.Capfitogen, - full.names = TRUE)[list.files(Dir.Data.Capfitogen, + list.files(Dir.Capfitogen.WDPA, + full.names = TRUE)[list.files(Dir.Capfitogen.WDPA, full.names = TRUE) %nin% files_to_keep] file.remove(files_to_delete, recursive = TRUE) # merge parts into one global shapefile - wdpa_polygon_shapefiles <- + wdpa_polygon_shapefiles <- + # list polygon shapefiles in WDPA directory substr(unique(sub("\\..*", "", list.files(wdpa_path)[grep(pattern = "polygon", x = all_wdpa_shapefiles)])), 3, 34) shapefile_list <- list() for (i in 0:2) { + # read in all the polygon shapefile layers layer_name = paste0(i, "_", wdpa_polygon_shapefiles) shapefile_list[[i + 1]] <- read_sf(dsn = wdpa_path, layer = layer_name) From 132a2c4d8ccfaa270168c5994a56c19f6f6d4b0a Mon Sep 17 00:00:00 
2001 From: Michal Torma Date: Thu, 27 Feb 2025 15:41:09 +0200 Subject: [PATCH 32/72] Add Slurm submission script for CapFitogen analysis (draft) --- submit_capfitogen.sh | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 submit_capfitogen.sh diff --git a/submit_capfitogen.sh b/submit_capfitogen.sh new file mode 100644 index 0000000..de76951 --- /dev/null +++ b/submit_capfitogen.sh @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J CapFitogen +#SBATCH -o capfitogen-%j.out +#SBATCH --account=project_465000915 +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --time=48:00:00 +#SBATCH --partition=small --mem=64G +##SBATCH --partition=standard --exclusive --mem=0 +##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 + +SPECIES="${1:-Lathyrus}" + +# Workaround for memory issues in terra +# Max memory for terra +R_TERRA_MAX_RAM_MB=$((100 * 1024)) +# Limit max memory further if available memory is less than above +if [[ ${SLURM_MEM_PER_NODE:-0} > 0 ]]; then + R_TERRA_MAX_RAM_MB=$(( $R_TERRA_MAX_RAM_MB < $SLURM_MEM_PER_NODE ? $R_TERRA_MAX_RAM_MB : $SLURM_MEM_PER_NODE )) +fi +export R_TERRA_MAX_RAM_MB +# End of workaround + +singularity run --bind $PWD cwr_0.5.3.sif "capfitogen_master.R" From 5e45754315f9e51cb06196f148101c161dfb13f9 Mon Sep 17 00:00:00 2001 From: Michal Torma Date: Thu, 27 Feb 2025 15:41:22 +0200 Subject: [PATCH 33/72] Add fallback mechanism for BioClim data loading --- capfitogen_master.R | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index afbdbc7..acd9083 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -104,12 +104,20 @@ Species_ls <- FUN.DownGBIF( ##' Will this download Global Multi-resolution Terrain Elevation Data (GMTED2010) as well? ##' Temporal coverage: January 1950 to present ? 
https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview message("Downloading new or loading existing 19 BioClim bioclimatic variables") -bioclim_variables <- FUN.DownBV( - T_Start = 1999, # what year to begin climatology calculation? - T_End = 1999, # what year to end climatology calculation? - Dir = Dir.Data.Envir, # where to store the data output on disk - Force = FALSE # do not overwrite already present data + +# Check for existing BioClim data file +existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") +if (file.exists(existing_bioclim_file)) { + message("Using existing BioClim data") + bioclim_variables <- terra::rast(existing_bioclim_file) +} else { + bioclim_variables <- FUN.DownBV( + T_Start = 1999, # what year to begin climatology calculation? + T_End = 1999, # what year to end climatology calculation? + Dir = Dir.Data.Envir, # where to store the data output on disk + Force = FALSE # do not overwrite already present data ) +} # or use an existing data set (from Erik?) for testing... 
# bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) From 62697816aa4827479c769b762c96d373fbca6c0f Mon Sep 17 00:00:00 2001 From: Michal Torma Date: Thu, 27 Feb 2025 15:42:21 +0200 Subject: [PATCH 34/72] Clean up ModGP-commonlines.R script - Remove 'modeltools' from package list - Add missing closing brace to install.load.package function - Add end-of-file comment --- R_scripts/ModGP-commonlines.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 62f64fc..663801a 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -7,7 +7,7 @@ install.load.package <- function(x) { if (!require(x, character.only = TRUE)) install.packages(x, repos='http://cran.us.r-project.org') require(x, character.only = TRUE) - +} ### CRAN PACKAGES ---- package_vec <- c( 'automap', # automatic interpolation (for KrigR) @@ -49,7 +49,6 @@ package_vec <- c( # Capfitogen ECLmapas packages # HJ: added here from Capfitogen ECLmapas script. 
To do: remove unnecessary ones - 'modeltools', 'flexmix', 'fpc', 'vegan', @@ -131,3 +130,5 @@ CreateDir <- sapply(Dirs, function(x){ x <- eval(parse(text=x)) if(!dir.exists(x)) dir.create(x)}) rm(Dirs) + +# End of file From 71dd0a4f501a6caeca484c262c2bebbdb08944c5 Mon Sep 17 00:00:00 2001 From: Michal Torma Date: Thu, 27 Feb 2025 15:52:13 +0200 Subject: [PATCH 35/72] Add Capfitogen submodule - Initialize .gitmodules configuration - Link to Capfitogen GitHub repository - Set specific commit reference --- .gitmodules | 3 +++ Capfitogen | 1 + 2 files changed, 4 insertions(+) create mode 100644 .gitmodules create mode 160000 Capfitogen diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..597542c --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "Capfitogen"] + path = Capfitogen + url = https://github.com/evalieungh/Capfitogen.git diff --git a/Capfitogen b/Capfitogen new file mode 160000 index 0000000..205ec9a --- /dev/null +++ b/Capfitogen @@ -0,0 +1 @@ +Subproject commit 205ec9a9f483fde0190d849eb14211ca4a3fa116 From e9b597f9cf621094bf4ca3542fe65f74d7039a79 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 27 Feb 2025 15:11:09 +0100 Subject: [PATCH 36/72] annotation updates --- capfitogen_master.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/capfitogen_master.R b/capfitogen_master.R index afbdbc7..a3f0003 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -192,6 +192,8 @@ write.table(Species_ls[["occs"]], sep = "\t",) ### Download protected areas ---- +#' NB! This download is large and combining big shapefiles requires +#' more computing power than should be done on a normal computer. 
#' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za #' download shapefiles for protected areas to overlay with Complementa tool: #' UNEP-WCMC and IUCN (2025), Protected Planet: From 46e1b161f54441b34e42412fc4e15c4ca0c71ec9 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 27 Feb 2025 15:27:49 +0100 Subject: [PATCH 37/72] comment out packages --- R_scripts/ModGP-commonlines.R | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 663801a..0b777c1 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -36,16 +36,15 @@ package_vec <- c( 'iterators', 'gdalUtilities', # to download from SoilGrids (FUN.DownEV) - # Capfitogen SelectVar packages - # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones - 'dismo', - 'cluster', - 'ade4', - 'labdsv', - 'mclust', - 'clustvarsel', - #'randomForest', # ---------------- replace with ranger? - 'ranger', + # # Capfitogen SelectVar packages + # # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones + # 'dismo', + # 'cluster', + # 'ade4', + # 'labdsv', + # 'mclust', + # 'clustvarsel', + # 'ranger', # Capfitogen ECLmapas packages # HJ: added here from Capfitogen ECLmapas script. 
To do: remove unnecessary ones From cd097979cc0d46ae89310bd133870235ff9f55a3 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 27 Feb 2025 16:14:44 +0100 Subject: [PATCH 38/72] delete Capfitogen repo download --- capfitogen_master.R | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index f8aeba5..328cde9 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -166,18 +166,6 @@ geophysical_variables <- FUN.DownGV( ) # CAPFITOGEN pipeline ========================================================= -## Download CAPFITOGEN scripts ------------------------------------------------ -# download and unzip CAPFITOGEN repository -if (!file.exists("capfitogen-main.zip")) { - download.file(url = "https://github.com/evalieungh/Capfitogen/archive/refs/heads/main.zip", - destfile = "capfitogen-main.zip") - unzip(zipfile = "capfitogen-main.zip") -} - -if (!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { - file.create(file.path(Dir.Results.Complementa.Error,"process_info.txt")) -} - ### Format GBIF data ---- # need a data frame named 'puntos' = points with occurrence points puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), @@ -254,6 +242,7 @@ if (!file.exists(wdpa_destination)) { file.rename(from = shapefile_paths, to = new_shapefile_names) } + # delete unnecessary files files_to_keep <- c(wdpa_path, wdpa_destination, @@ -409,6 +398,11 @@ for (file_path in elc_tif_outputs) { } ## Overlaying conservation maps "Complementa" --------------------------------- +#' create template file for logging script processes +if (!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { + file.create(file.path(Dir.Results.Complementa.Error,"process_info.txt")) +} + ### parameters for Complementa ---- { resultados <- Dir.Results.Complementa From 92808ed637ff9e844caa9f3a846ae0d22481666e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 
Mar 2025 12:29:38 +0100 Subject: [PATCH 39/72] move wdpa download to SHARED-Data as function --- .gitignore | 3 +- README.md | 2 + R_scripts/ModGP-commonlines.R | 3 +- R_scripts/SHARED-Data.R | 191 ++++++++++++++++++++++++---------- capfitogen_master.R | 126 +++++----------------- 5 files changed, 167 insertions(+), 158 deletions(-) diff --git a/.gitignore b/.gitignore index 01f9310..236e82d 100644 --- a/.gitignore +++ b/.gitignore @@ -78,8 +78,7 @@ results/ # Downloads R_scripts/ELCmapas.R -capfitogen-main.zip -Capfitogen-main/ +Capfitogen/* Data/Capfitogen/ hq diff --git a/README.md b/README.md index ee7d348..b869bbe 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,8 @@ As an addition to ModGP, you can run two of Capfitogen's most useful tools: ELC maps and Complementa maps to visualise overlap with protected areas. +- After cloning this repository, you need to clone Capfitogen (a submodule) as well with `git submodule update --init`. + - To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. 
- To run on LUMI: obtain interactive session: diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 0b777c1..779fbfe 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -18,6 +18,7 @@ package_vec <- c( 'ggpubr', # t-test comparison in ggplot 'gridExtra', # ggplot saving in PDF 'ncdf4', # handling NetCDF files + 'maps', # background maps 'parallel', # parallel runs 'pbapply', # parallel runs with estimator bar 'raster', # spatial data ----------------------- should be replaced by terra @@ -115,7 +116,7 @@ Dir.Exports <- file.path(Dir.Base, "Exports") Dir.Exports.ModGP <- file.path(Dir.Exports, "ModGP") Dir.Exports.Capfitogen <- file.path(Dir.Exports, "Capfitogen") Dir.R_scripts <- file.path(Dir.Base, "R_scripts") -Dir.Capfitogen <- file.path(Dir.Base, "Capfitogen-main") +Dir.Capfitogen <- file.path(Dir.Base, "Capfitogen") Dir.Capfitogen.WDPA <- file.path(Dir.Capfitogen, "wdpa") Dir.Capfitogen.ELCMap <- file.path(Dir.Capfitogen, "ELCmapas") Dir.Capfitogen.ELCMap.Error <- file.path(Dir.Capfitogen.ELCMap, "Error") diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index f6c3d13..4b6afca 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -481,16 +481,16 @@ FUN.DownEV <- SoilGrids_variables_in <- c("bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ - "cec/cec_0-5cm_mean")#, # Cation Exchange Capacity of the soil, mmol(c)/kg - #"cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - #"silt/silt_0-5cm_mean")#, # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg - #"clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg - #"sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg - #"nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg - #"phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 - 
#"ocd/ocd_0-5cm_mean",# Organic carbon density hg/m³ - #"ocs/ocs_0-30cm_mean",# Organic carbon stocks t/ha - #"soc/soc_0-5cm_mean")# Soil organic carbon content in the fine earth fraction dg/kg + "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg + "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg + "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 + "ocd/ocd_0-5cm_mean", # Organic carbon density hg/m³ + "ocs/ocs_0-30cm_mean", # Organic carbon stocks t/ha + "soc/soc_0-5cm_mean") # Soil organic carbon content in the fine earth fraction dg/kg SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) @@ -602,6 +602,8 @@ FUN.DownEV <- } # GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- +#' define a function to load existing data or download them to +#' the given directory. 
FUN.DownGV <- function(Dir = getwd(),# where to store the data output on disk Force = FALSE,# do not overwrite already present data, @@ -666,7 +668,7 @@ FUN.DownGV <- wind <- rast(paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) names(wind) <- "mean_wind_speed_of_windiest_month" - ## resample ------ + ## Resample ------ ## if provided, resample to match another raster object's origin and resolution if (!missing(resample_to_match)) { message(paste0("resampling raster to match ", names(resample_to_match))) @@ -686,13 +688,12 @@ FUN.DownGV <- wind <- terra::resample(wind, resample_to_match) message("wind successfully resampled") - } ### combine rasters geophysical_rasters <- c(dem, wind) - ### Saving ---- + ## Saving ---- message("saving as NetCDF") terra::writeCDF(geophysical_rasters, filename = FNAME, @@ -709,43 +710,127 @@ FUN.DownGV <- ##' (CIAT), available from http://srtm.csi.cgiar.org. #dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") -# DRAFT: Google Earth Engine downloads. ------------------------------------- -#' Almost working, but missing user/project credentials and login. -#' See https://developers.google.com/earth-engine/guides/auth -#' ee.Authenticate() -#' ee.Initialize(project='my-project') -# -# install.packages("reticulate") # python environment - https://rstudio.github.io/reticulate/ -# install.packages("rgeedim") # search and download Google Earth Engine imagery with Python -# -# library(reticulate) -# -# virtualenv_create(envname = "uc_CWR", # saved under /Documents/.virtualenvs/uc_CWR -# packages = c("numpy","geedim"), -# python = "C:/Program Files/Python3.10/python.exe" -# ) -# -# virtualenv_list() -# -# use_virtualenv("uc_CWR") -# -# library(rgeedim) -# -# names(geedim()$enums) -# -# -# ## Download CHILI: Continuous Heat-Insolation Load Index -# ##' Theobald, D. M., Harrison-Atlas, D., Monahan, W. B., & Albano, C. M. -# ##' (2015). 
Ecologically-relevant maps of landforms and physiographic -# ##' diversity for climate adaptation planning. PloS one, 10(12), e0143619 -# -# chili_img_id <- gd_image_from_id('CSP/ERGo/1_0/Global/ALOS_CHILI') -# -# chili <- -# gd_download(chili_img_id, -# filename = 'chili.tif', -# resampling = "bilinear", -# scale = 2500, # scale=10: request ~10m resolution -# overwrite = TRUE, -# silent = FALSE -# ) + +# WORLD DATABASE ON PROTECTED AREAS --------------------------------------- +#' UNEP-WCMC and IUCN (2025), Protected Planet: +#' The World Database on Protected Areas (WDPA) [Online], February 2025, +#' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. +#' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za +FUN.DownWDPA <- function( + wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", + wdpa_destination = file.path(Dir.Capfitogen.WDPA, + "WDPA_Feb2025_Public_shp.zip"), + Force = FALSE) { + # set a path to wdpa shapefiles + wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") + + # define the file name of global wdpa shapefile to be created + FNAME <- file.path(wdpa_path, "global_wdpa_polygons.shp") + + # check if the final wdpa file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + message(paste0("A global wdpa file with polygons exists already: ", FNAME)) + } else { + # download if Force = TRUE or the file isn't already there + message("downloading zipped WDPA shapefiles, ca 4GB") + # set long timeout to avoid interrupting download + options(timeout = 1000) + # download the zipped files + download.file(url = wdpa_url, + destfile = wdpa_destination, + cacheOK = FALSE) + # unzip files + message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) + unzip(zipfile = wdpa_destination, + exdir = Dir.Capfitogen.WDPA) + + # unzip split shapefile downloads + message("unzipping shapefiles split in 
download") + + shapefile_names <- c( + "WDPA_Feb2025_Public_shp-points.cpg", + "WDPA_Feb2025_Public_shp-points.dbf", + "WDPA_Feb2025_Public_shp-points.prj", + "WDPA_Feb2025_Public_shp-points.shp", + "WDPA_Feb2025_Public_shp-points.shx", + "WDPA_Feb2025_Public_shp-polygons.cpg", + "WDPA_Feb2025_Public_shp-polygons.dbf", + "WDPA_Feb2025_Public_shp-polygons.prj", + "WDPA_Feb2025_Public_shp-polygons.shp", + "WDPA_Feb2025_Public_shp-polygons.shx" + ) + + shapefile_paths <- file.path(wdpa_path, + shapefile_names) + + # loop over zip directories with parts of the global data (numbered 0, 1, 2) + for (i in 0:2) { + # define name of the current directory to be unzipped + zipfilename <- + file.path(Dir.Capfitogen.WDPA, + paste0("WDPA_Feb2025_Public_shp_", i, ".zip")) + + # unzip the directory containing shapefiles + unzip(zipfile = zipfilename, + exdir = wdpa_path) + message(paste0("unzipped ", zipfilename, + "\nto ", wdpa_path)) + + # rename shapefiles with numbers to prevent overwriting them + new_shapefile_names <- file.path(wdpa_path, + paste0(i, "_", + shapefile_names)) + file.rename(from = shapefile_paths, + to = new_shapefile_names) + } + + # delete unnecessary files + message("deleting redundant files (translations etc.)") + files_to_keep <- c( + wdpa_path, + wdpa_destination, + file.path(Dir.Capfitogen.WDPA, + "WDPA_sources_Feb2025.csv")) + + files_to_delete <- + list.files(Dir.Capfitogen.WDPA, + full.names = TRUE)[list.files(Dir.Capfitogen.WDPA, + full.names = TRUE) %nin% files_to_keep] + file.remove(files_to_delete, + recursive = TRUE) + + # prepare list of shapefiles to be combined + wdpa_polygon_shapefiles <- + # list polygon shapefiles in WDPA directory + substr(unique(sub("\\..*", "", + list.files(wdpa_path)[grep(pattern = "polygon", + x = shapefile_names)])), + 3, 34) + + shapefile_list <- list() + + for (i in 0:2) { + # read in all the polygon shapefile layers + layer_name = paste0(i, "_", wdpa_polygon_shapefiles) + shapefile_list[[i + 1]] <- + read_sf(dsn = 
wdpa_path, layer = layer_name) + } + + # merge parts into one global shapefile + message("combining parts of the WDPA shapefile. This can take a while ---") + wdpa <- do.call(rbind, shapefile_list) + + # save complete wdpa + message("Complete WDPA successfully combined.") + st_write(wdpa, + FNAME) + #' ERROR + + #' Warning messages: + #' 1: In CPL_write_ogr(obj, dsn, layer, driver, as.character(dataset_options), ... : + #' GDAL Message 1: One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1. This warning will not be emitted anymore. + #' 2: In CPL_write_ogr(obj, dsn, layer, driver, as.character(dataset_options), ... : + #' GDAL Message 1: Value 555510160 of field WDPAID of feature 114424 not successfully written. Possibly due to too larger number with respect to field width + message("WDPA saved as global_wdpa_polygons.shp") + } +} + diff --git a/capfitogen_master.R b/capfitogen_master.R index 328cde9..ca7131d 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -11,9 +11,10 @@ #' - 'Complementa' #' - Visualisation of outputs #' DEPENDENCIES: +#' - Capfitogen submodule (from https://github.com/evalieungh/Capfitogen) #' - R_Scripts directory containing: #' - "MoDGP-commonlines.R" -#' - "SHARED-APICredentials.R" -- NB! internal to project members, ask for access +#' - "SHARED-APICredentials.R" -- NB! internal to project, ask for access #' - "SHARED-Data.R" #' AUTHORS: [Eva Lieungh, Erik Kusch, Heli Juottonen, Desalegn Chala] #' Capfitogen credit: Parra-Quijano et al. 2021, @@ -101,8 +102,10 @@ Species_ls <- FUN.DownGBIF( ##' 19 BioClim variables ##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) ##' is each file of each variable >20GB? -##' Will this download Global Multi-resolution Terrain Elevation Data (GMTED2010) as well? -##' Temporal coverage: January 1950 to present ? 
https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview +##' Will this download Global Multi-resolution Terrain Elevation Data +##' (GMTED2010) as well? +##' Temporal coverage: January 1950 to present ? +##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview message("Downloading new or loading existing 19 BioClim bioclimatic variables") # Check for existing BioClim data file @@ -183,100 +186,20 @@ pasaporte_file_name = paste0(sub(pattern = " ", ".txt") write.table(Species_ls[["occs"]], - file.path("Capfitogen-main/Pasaporte", + file.path("Capfitogen/Pasaporte", pasaporte_file_name), sep = "\t",) ### Download protected areas ---- -#' NB! This download is large and combining big shapefiles requires -#' more computing power than should be done on a normal computer. -#' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za -#' download shapefiles for protected areas to overlay with Complementa tool: -#' UNEP-WCMC and IUCN (2025), Protected Planet: -#' The World Database on Protected Areas (WDPA) [Online], February 2025, -#' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. 
-wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip" -wdpa_destination = file.path(Dir.Capfitogen.WDPA, - "WDPA_Feb2025_Public_shp.zip") -# if the file isn't there already, download it -if (!file.exists(wdpa_destination)) { - message("downloading zipped WDPA shapefiles, ca 4GB") - # set long timeout to avoid interrupting download - options(timeout = 1000) - # download the zipped files - download.file(url = wdpa_url, - destfile = wdpa_destination, - cacheOK = FALSE) - # unzip files - message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) - unzip(zipfile = wdpa_destination, - exdir = Dir.Capfitogen.WDPA) - # unzip split shapefile downloads - message("unzipping shapefiles split in download") - wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") - shapefile_names <- c( - "WDPA_Feb2025_Public_shp-points.cpg", - "WDPA_Feb2025_Public_shp-points.dbf", - "WDPA_Feb2025_Public_shp-points.prj", - "WDPA_Feb2025_Public_shp-points.shp", - "WDPA_Feb2025_Public_shp-points.shx", - "WDPA_Feb2025_Public_shp-polygons.cpg", - "WDPA_Feb2025_Public_shp-polygons.dbf", - "WDPA_Feb2025_Public_shp-polygons.prj", - "WDPA_Feb2025_Public_shp-polygons.shp", - "WDPA_Feb2025_Public_shp-polygons.shx") - shapefile_paths <- file.path(wdpa_path, - shapefile_names) - for (i in 0:2) { - # define name of the current directory to be unzipped - zipfilename <- - file.path(Dir.Data.Capfitogen, - paste0("WDPA_Feb2025_Public_shp_", i, ".zip")) - # unzip the directory containing shapefiles - unzip(zipfile = zipfilename, - exdir = wdpa_path) - # rename shapefiles with numbers to prevent overwriting them - new_shapefile_names <- file.path(wdpa_path, - paste0(i, "_", - shapefile_names)) - file.rename(from = shapefile_paths, - to = new_shapefile_names) - } - - # delete unnecessary files - files_to_keep <- c(wdpa_path, - wdpa_destination, - file.path(Dir.Capfitogen.WDPA, - "WDPA_sources_Feb2025.csv")) - files_to_delete <- - list.files(Dir.Capfitogen.WDPA, - full.names = 
TRUE)[list.files(Dir.Capfitogen.WDPA, - full.names = TRUE) %nin% files_to_keep] - file.remove(files_to_delete, - recursive = TRUE) - - # merge parts into one global shapefile - wdpa_polygon_shapefiles <- - # list polygon shapefiles in WDPA directory - substr(unique(sub("\\..*", "", - list.files(wdpa_path)[grep(pattern = "polygon", - x = all_wdpa_shapefiles)])), - 3, 34) - shapefile_list <- list() - for (i in 0:2) { - # read in all the polygon shapefile layers - layer_name = paste0(i, "_", wdpa_polygon_shapefiles) - shapefile_list[[i + 1]] <- - read_sf(dsn = wdpa_path, layer = layer_name) - } - message("combining parts of the WDPA shapefile. This can take a while ---") - wdpa <- do.call(rbind, shapefile_list) - - # save complete wdpa - message("Complete WDPA successfully combined. Saving it as global_wdpa_polygons.shp") - st_write(wdpa, - file.path(wdpa_path, "global_wdpa_polygons.shp")) -} +#' download shapefiles for protected areas to overlay with Complementa tool +FUN.DownWDPA( + # download from url: + wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", + # save the downloaded zipfile as: + wdpa_destination = file.path(Dir.Capfitogen.WDPA, + "WDPA_Feb2025_Public_shp.zip"), + # do not overwrite existing data + Force = FALSE) ## Variable selection --------------------------------------------------------- # combine variables @@ -306,20 +229,21 @@ predictors <- raster::stack(predictors) # save variables in CAPFITOGEN folder if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { - dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9")) + dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"), + recursive = TRUE) dir.create(file.path(Dir.Capfitogen, "rdatamaps/world/9x9"), recursive = TRUE) } saveRDS(predictors, - "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") + "Capfitogen/rdatapoints/world/9x9/base9x9.RData") save(predictors, - file = "Capfitogen-main/rdatapoints/world/9x9/base9x9.RData") + 
file = "Capfitogen/rdatapoints/world/9x9/base9x9.RData") predictor_names <- names(predictors) for (i in 1:dim(predictors)[3]) { - file_name_path = file.path("Capfitogen-main/rdatamaps/world/9x9", + file_name_path = file.path("Capfitogen/rdatamaps/world/9x9", paste0(names(predictors[[i]]),".tif")) writeRaster(predictors[[i]], file_name_path, @@ -330,7 +254,7 @@ for (i in 1:dim(predictors)[3]) { ### Parameters for ELC maps ---- { ruta <- Dir.Capfitogen # path to capfitogen scripts -resultados <- Dir.Results.ELCMap # directory to place results +resultados <- Dir.Capfitogen.ELCMap # directory to place results pasaporte <- pasaporte_file_name # species occurrence data pais <- "World" # global extent - big modifications will be necessary to use different extent @@ -354,7 +278,7 @@ iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calcul # run the script message("Clustering and creating maps") source(file.path(Dir.Capfitogen, - "/scripts/Tools Herramientas/ELCmapas_BioDT.R")) + "scripts/Tools Herramientas/ELCmapas.R")) setwd(Dir.Base) ### quick visualisation of ELC maps ---- @@ -438,7 +362,7 @@ message("running Capfitogen Complementa tool for conservation areas") setwd(Dir.Base) source(file.path(Dir.Capfitogen, "/scripts/Tools Herramientas/Complementa.R")) -#' works if areas = FALSE ! 
+ setwd(Dir.Base) ### quick visualisation ---- @@ -452,8 +376,6 @@ non_zero_mask <- mask(complementa_map, complementa_points <- as.points(non_zero_mask, na.rm = TRUE) plot(complementa_points) -library(maps) - map( 'world', col = "grey", From 52455c65e37e6b713fa805b657b5c2269c453338 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Mar 2025 13:34:58 +0100 Subject: [PATCH 40/72] move plotting to visualisation script --- R_scripts/Capfitogen_visualisation.R | 97 ++++++++++++++++++++++++++++ capfitogen_master.R | 66 +------------------ 2 files changed, 100 insertions(+), 63 deletions(-) create mode 100644 R_scripts/Capfitogen_visualisation.R diff --git a/R_scripts/Capfitogen_visualisation.R b/R_scripts/Capfitogen_visualisation.R new file mode 100644 index 0000000..6ecb6b3 --- /dev/null +++ b/R_scripts/Capfitogen_visualisation.R @@ -0,0 +1,97 @@ +############################################################## +#' Visualisation of Capfitogen inputs and output +#' CONTENTS: +#' - Visualisation of input data +#' - World Database on Protected Areas (WDPA) +#' - Visualisation of outputs +#' - ELC maps +#' - +#' DEPENDENCIES: +#' - capfitogen_master.R (to download and create data) +#' - ModGP-commonlines.R (packages, paths) +#' AUTHORS: [Eva Lieungh] +#' ########################################################### + +# Load dependencies ------------------------------------------ +# Define directories in relation to project directory +Dir.Base <- getwd() +Dir.Scripts <- file.path(Dir.Base, "R_scripts") + +# source packages, directories, simple functions (...) 
+source(file.path(Dir.Scripts, "ModGP-commonlines.R")) + +# VISUALISE INPUTS =========================================== + +# WDPA ------------------------------------------------------- + + + + + +# VISUALISE OUTPUTS ========================================== + +# ELC maps --------------------------------------------------- +## quick visualisation ---- +# List all the .tif files in the directory +elc_tif_outputs <- list.files(path = Dir.Results.ELCMap, + pattern = "\\.tif$", + full.names = TRUE) + +# Loop over each .tif file +for (file_path in elc_tif_outputs) { + # Read the raster file + map_i <- rast(file_path) + + # Replace NaN with NA (if they exist) + map_i[is.nan(values(map_i))] <- NA + + # Create a mask to highlight non-zero areas + non_zero_mask <- mask(map_i, !is.na(map_i)) + + # Convert to points to find non-zero values' extent + points <- as.points(non_zero_mask, na.rm = TRUE) + + # If there are any valid points, proceed with cropping + if (!is.null(points) && nrow(points) > 0) { + # Calculate extent directly from the non-empty points + coordinates <- terra::geom(points)[, c("x", "y")] + xmin = min(coordinates[,"x"]) + xmax = max(coordinates[,"x"]) + ymin = min(coordinates[,"y"]) + ymax = max(coordinates[,"y"]) + non_zero_extent <- ext(xmin, xmax, ymin, ymax) + + # Crop the raster using this extent + cropped_map <- crop(map_i, non_zero_extent) + + # Plot the cropped raster + plot(cropped_map, main = basename(file_path)) + } else { + plot(map_i, main = paste(basename(file_path), "(No non-zero values)")) + } +} + + +# Complementa --------------------------------------------------------- + +complementa_map <- rast( + file.path(Dir.Results.Complementa, + "AnalisisCeldas_CellAnalysis/Complementa_map.tif")) +plot(complementa_map) +complementa_map[is.nan(values(complementa_map))] <- NA +non_zero_mask <- mask(complementa_map, + !is.na(complementa_map)) +complementa_points <- as.points(non_zero_mask, na.rm = TRUE) +plot(complementa_points) + +map( + 'world', + 
col = "grey", + fill = TRUE, + bg = "white", + lwd = 0.05, + mar = rep(0, 4), + border = 0, + ylim = c(-80, 80) +) +points(complementa_points) \ No newline at end of file diff --git a/capfitogen_master.R b/capfitogen_master.R index ca7131d..5669039 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -191,7 +191,9 @@ write.table(Species_ls[["occs"]], sep = "\t",) ### Download protected areas ---- -#' download shapefiles for protected areas to overlay with Complementa tool +#' download shapefiles for protected areas to overlay with Complementa tool. +#' The FUN.DownWDPA function will save the file to a folder, but not load it +#' into RStudio as an object. FUN.DownWDPA( # download from url: wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", @@ -281,46 +283,6 @@ source(file.path(Dir.Capfitogen, "scripts/Tools Herramientas/ELCmapas.R")) setwd(Dir.Base) -### quick visualisation of ELC maps ---- -# List all the .tif files in the directory -elc_tif_outputs <- list.files(path = Dir.Results.ELCMap, - pattern = "\\.tif$", - full.names = TRUE) - -# Loop over each .tif file -for (file_path in elc_tif_outputs) { - # Read the raster file - map_i <- rast(file_path) - - # Replace NaN with NA (if they exist) - map_i[is.nan(values(map_i))] <- NA - - # Create a mask to highlight non-zero areas - non_zero_mask <- mask(map_i, !is.na(map_i)) - - # Convert to points to find non-zero values' extent - points <- as.points(non_zero_mask, na.rm = TRUE) - - # If there are any valid points, proceed with cropping - if (!is.null(points) && nrow(points) > 0) { - # Calculate extent directly from the non-empty points - coordinates <- terra::geom(points)[, c("x", "y")] - xmin = min(coordinates[,"x"]) - xmax = max(coordinates[,"x"]) - ymin = min(coordinates[,"y"]) - ymax = max(coordinates[,"y"]) - non_zero_extent <- ext(xmin, xmax, ymin, ymax) - - # Crop the raster using this extent - cropped_map <- crop(map_i, non_zero_extent) - - # Plot the cropped 
raster - plot(cropped_map, main = basename(file_path)) - } else { - plot(map_i, main = paste(basename(file_path), "(No non-zero values)")) - } -} - ## Overlaying conservation maps "Complementa" --------------------------------- #' create template file for logging script processes if (!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { @@ -365,25 +327,3 @@ source(file.path(Dir.Capfitogen, setwd(Dir.Base) -### quick visualisation ---- -complementa_map <- rast( - file.path(Dir.Results.Complementa, - "AnalisisCeldas_CellAnalysis/Complementa_map.tif")) -plot(complementa_map) -complementa_map[is.nan(values(complementa_map))] <- NA -non_zero_mask <- mask(complementa_map, - !is.na(complementa_map)) -complementa_points <- as.points(non_zero_mask, na.rm = TRUE) -plot(complementa_points) - -map( - 'world', - col = "grey", - fill = TRUE, - bg = "white", - lwd = 0.05, - mar = rep(0, 4), - border = 0, - ylim = c(-80, 80) -) -points(complementa_points) From f352fee3b4c905ad3a3554babcfe0e685f45d6a8 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 21 Mar 2025 13:50:12 +0100 Subject: [PATCH 41/72] fix saving wdpa as gpkg --- R_scripts/SHARED-Data.R | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 4b6afca..1e6f07a 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -5,7 +5,7 @@ #' - Bioclimatic Variable Climatology creation for qsoil1 and qsoil2 combined #' DEPENDENCIES: #' - None -#' AUTHOR: [Erik Kusch] +#' AUTHORS: [Erik Kusch, Eva Lieungh] #' ####################################################################### # # GBIF DOWNLOAD FUNCTION -------------------------------------------------- @@ -819,18 +819,27 @@ FUN.DownWDPA <- function( # merge parts into one global shapefile message("combining parts of the WDPA shapefile. 
This can take a while ---") wdpa <- do.call(rbind, shapefile_list) + message("Complete WDPA successfully combined.") + # wdpa$WDPAID <- as.character(wdpa$WDPAID) + # wdpa$text_field <- iconv(wdpa$text_field, to = "ASCII//TRANSLIT") + # save complete wdpa - message("Complete WDPA successfully combined.") - st_write(wdpa, - FNAME) - #' ERROR + - #' Warning messages: - #' 1: In CPL_write_ogr(obj, dsn, layer, driver, as.character(dataset_options), ... : - #' GDAL Message 1: One or several characters couldn't be converted correctly from UTF-8 to ISO-8859-1. This warning will not be emitted anymore. - #' 2: In CPL_write_ogr(obj, dsn, layer, driver, as.character(dataset_options), ... : - #' GDAL Message 1: Value 555510160 of field WDPAID of feature 114424 not successfully written. Possibly due to too larger number with respect to field width - message("WDPA saved as global_wdpa_polygons.shp") + message("save as GeoPackage") + st_write(wdpa, file.path(wdpa_path, "global_wdpa_polygons.gpkg")) + message(paste0("global WDPA saved as: ", + file.path(wdpa_path, "global_wdpa_polygons.gpkg"))) + + message("save as shapefile") + #st_write(wdpa, FNAME) + st_write( + wdpa, + "global_wdpa_polygons.shp", + layer_options = "ENCODING=UTF-8", + field_type = c(WDPAID = "Character") + ) + message("global WDPA saved as global_wdpa_polygons.shp") } } +# end From 47fec08a34b5445dc6e6ac17e6d04d2b6862eef0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 2 Apr 2025 08:56:05 +0200 Subject: [PATCH 42/72] update scripts --- R_scripts/ModGP-commonlines.R | 2 +- R_scripts/SHARED-Data.R | 23 +++--- capfitogen_master.R | 145 ++++++++++++++++++++++++++++------ 3 files changed, 134 insertions(+), 36 deletions(-) diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 779fbfe..eddae1d 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -119,7 +119,7 @@ Dir.R_scripts <- file.path(Dir.Base, "R_scripts") Dir.Capfitogen <- file.path(Dir.Base, 
"Capfitogen") Dir.Capfitogen.WDPA <- file.path(Dir.Capfitogen, "wdpa") Dir.Capfitogen.ELCMap <- file.path(Dir.Capfitogen, "ELCmapas") -Dir.Capfitogen.ELCMap.Error <- file.path(Dir.Capfitogen.ELCMap, "Error") +Dir.Capfitogen.Error <- file.path(Dir.Capfitogen, "Error") Dir.Results <- file.path(Dir.Base, "results") Dir.Results.Complementa <- file.path(Dir.Results, "Complementa") Dir.Results.Complementa.Error <- file.path(Dir.Results.Complementa, "Error") diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 1e6f07a..2d1ad6b 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -641,7 +641,7 @@ FUN.DownGV <- unlink(temp) } dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) - names(dem) <- "elevation" + names(dem) <- "Elevacion" ## Download wind speed ------ ##' WorldClim 2 @@ -725,20 +725,23 @@ FUN.DownWDPA <- function( wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") # define the file name of global wdpa shapefile to be created - FNAME <- file.path(wdpa_path, "global_wdpa_polygons.shp") + FNAME <- file.path(wdpa_path, "global_wdpa_polygons.gpkg") # check if the final wdpa file already exists and whether to overwrite if (!Force & file.exists(FNAME)) { message(paste0("A global wdpa file with polygons exists already: ", FNAME)) } else { - # download if Force = TRUE or the file isn't already there - message("downloading zipped WDPA shapefiles, ca 4GB") - # set long timeout to avoid interrupting download - options(timeout = 1000) - # download the zipped files - download.file(url = wdpa_url, - destfile = wdpa_destination, - cacheOK = FALSE) + if (!file.exists(wdpa_destination)) { + # download if Force = TRUE or the file isn't already there + message("downloading zipped WDPA shapefiles, ca 4GB") + # set long timeout to avoid interrupting download + options(timeout = 1000) + # download the zipped files + download.file(url = wdpa_url, + destfile = wdpa_destination, + cacheOK = FALSE) + } + # unzip files message(paste("unzipping WDPA shapefiles 
to", Dir.Capfitogen.WDPA)) unzip(zipfile = wdpa_destination, diff --git a/capfitogen_master.R b/capfitogen_master.R index 5669039..1196095 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -82,6 +82,11 @@ source(file.path(Dir.Scripts, "ModGP-commonlines.R")) } # DATA ==================================================================== +message(paste("-----------------------------", + " starting data download/load ", + "-----------------------------", + sep = "\n")) + ## Run SHARED-Data script ------------------------------------------------- ## defines FUN.DownGBIF(), FUN.DownBV(), FUN.DownEV() source(file.path(Dir.R_scripts, "SHARED-Data.R")) @@ -122,9 +127,6 @@ if (file.exists(existing_bioclim_file)) { ) } -# or use an existing data set (from Erik?) for testing... -# bioclim_variables <- terra::rast(file.path(Dir.Data.Envir, "BV_1985-2015.nc")) - # make sure the BioClim data have the correct names BioClim_names <- c( # see https://www.worldclim.org/data/bioclim.html @@ -168,7 +170,25 @@ geophysical_variables <- FUN.DownGV( resample_to_match = bioclim_variables[[1]] ) +## Protected areas database ----------------------------------------------- +#' download shapefiles for protected areas to overlay with Complementa tool. +#' The FUN.DownWDPA function will save the file to a folder, but not load it +#' into RStudio as an object. 
+FUN.DownWDPA( + # download from url: + wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", + # save the downloaded zipfile as: + wdpa_destination = file.path(Dir.Capfitogen.WDPA, + "WDPA_Feb2025_Public_shp.zip"), + # do not overwrite existing data + Force = FALSE) + # CAPFITOGEN pipeline ========================================================= +message(paste("------------------------------", + " starting Capfitogen pipeline ", + "------------------------------", + sep = "\n")) + ### Format GBIF data ---- # need a data frame named 'puntos' = points with occurrence points puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), @@ -176,6 +196,7 @@ puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) ### create 'pasaporte' ---- +message("create Pasaporte file") #' Capfitogen uses a file named "pasaporte" with species/taxa occurrences. #' The file uses Darwincore names, so it should be OK to use the occurrences #' from the GBIF download directly. @@ -190,20 +211,8 @@ write.table(Species_ls[["occs"]], pasaporte_file_name), sep = "\t",) -### Download protected areas ---- -#' download shapefiles for protected areas to overlay with Complementa tool. -#' The FUN.DownWDPA function will save the file to a folder, but not load it -#' into RStudio as an object. 
-FUN.DownWDPA( - # download from url: - wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", - # save the downloaded zipfile as: - wdpa_destination = file.path(Dir.Capfitogen.WDPA, - "WDPA_Feb2025_Public_shp.zip"), - # do not overwrite existing data - Force = FALSE) - ## Variable selection --------------------------------------------------------- +message("running variable selection") # combine variables all_predictors <- c(bioclim_variables, #edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than requested @@ -229,6 +238,17 @@ print(variables_to_keep) predictors <- all_predictors[[(variables_to_keep)]] predictors <- raster::stack(predictors) +# get bioclimatic variable names that matches Capfitogen's format +predictor_names <- names(predictors) +bioclim_predictor_names <- predictor_names[grep("BIO", predictor_names)] +bioclim_predictor_codes <- sub("_.*", "", tolower(bioclim_predictor_names)) +bioclim_predictor_codes <- sub("o", "o_", bioclim_predictor_codes) +capfitogen_bioclim_names <- read.delim("Capfitogen/bioclim.txt", + fileEncoding = "latin1") +bioclim_subset <- + capfitogen_bioclim_names[capfitogen_bioclim_names$VARCODE %in% bioclim_predictor_codes, ] +bioclim_predictor_names_capfitogen <- bioclim_subset$VARDESCR + # save variables in CAPFITOGEN folder if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"), @@ -242,40 +262,112 @@ saveRDS(predictors, save(predictors, file = "Capfitogen/rdatapoints/world/9x9/base9x9.RData") -predictor_names <- names(predictors) - +# names(predictors[[1:length(bioclim_predictor_names)]]) <- bioclim_predictor_codes +predictor_names_for_saving <- + c(bioclim_predictor_codes, + predictor_names[grep("BIO", predictor_names, invert = TRUE)] + ) + for (i in 1:dim(predictors)[3]) { file_name_path = 
file.path("Capfitogen/rdatamaps/world/9x9", - paste0(names(predictors[[i]]),".tif")) + paste0(predictor_names_for_saving[i],".tif")) writeRaster(predictors[[i]], file_name_path, overwrite = TRUE) } +## Modify Capfitogen possible values ------------------------------------------ + +# add a line with our data resolution to list of possible values +load(file.path(Dir.Capfitogen, "resol.RData")) +if (nrow(resol[resol$resol == "9x9",]) < 1) { + line = paste("\"celdas 9x9 km aprox (4.5 arc-min)\"", "\"9x9\"", 0.075, + sep = "\t") + write(line, + file = file.path(Dir.Capfitogen, "resol.txt"), + append = TRUE) + + resol <- rbind( + resol, + data.frame( + resolucion = "celdas 9x9 km aprox (4.5 arc-min)", + resol = "9x9", + resoldec = 0.075 + ) + ) + save(resol, file = file.path(Dir.Capfitogen, "resol.RData")) +} +rm(resol) + +# create template file for logging script processes +if (!file.exists(file.path(Dir.Capfitogen.Error,"process_info.txt"))) { + file.create(file.path(Dir.Capfitogen.Error,"process_info.txt")) +} + +# add geophysical variables to list of possible variables +load(file.path(Dir.Capfitogen, "geophys.RData")) +if (nrow(geophys[geophys$VARCODE == "wind_max", ]) < 1) { + geophys <- rbind( + geophys, + data.frame( + VARID = 145, + VARCODE = "wind_max", + VARDESCR_EN = "mean wind speed of windiest month (annual max of monthly means)", + VARDESCR = "mean_wind_speed_of_windiest_month", + VARUNIDAD = "ms-1", + VARFUENTE = "Derivada de Worldclim", + VARMODULO = "Geofisico/Geophysic", + FUENTELINK = "http://worldclim.org" + ) + ) + save(geophys, file = file.path(Dir.Capfitogen, "geophys.RData")) + + # rename geophysical variable files ---- NB! will break when edaphic vars are added... 
+ number_of_geophys_variables <- length(predictor_names[grep("BIO", + predictor_names, + invert = TRUE)]) + for (i in 1:number_of_geophys_variables) { + from_name = predictor_names[grep("BIO", + predictor_names, + invert = TRUE)][i] + + to_name = geophys[geophys$VARDESCR == from_name, "VARCODE"][1] + file.rename( + from = file.path( + "Capfitogen/rdatamaps/world/9x9", + paste0(from_name, ".tif") + ), + to = file.path("Capfitogen/rdatamaps/world/9x9", + paste0(to_name, ".tif")) + ) + } +} + +rm(geophys) + ## Clustering and map creation: ELCmapas --------------------------------------- +message("setting parameters and running ELC map script (ecogeographic land characterization)") ### Parameters for ELC maps ---- -{ ruta <- Dir.Capfitogen # path to capfitogen scripts resultados <- Dir.Capfitogen.ELCMap # directory to place results pasaporte <- pasaporte_file_name # species occurrence data -pais <- "World" # global extent - big modifications will be necessary to use different extent +pais <- "world" # global extent - big modifications will be necessary to use different extent geoqual <- FALSE totalqual<-30 # Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 duplicat <- TRUE # duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted distdup <- 1 # distance threshold in km to remove duplicates from same population -resol1 <- "9x9" # resolution, change to 9x9 +resol1 <- "celdas 9x9 km aprox (4.5 arc-min)" # resolution latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE -bioclimv <- predictor_names[grep("BIO", predictor_names)] # +bioclimv <- bioclim_predictor_names_capfitogen # edaphv <- names(geophysical_variables)#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) geophysv <- names(geophysical_variables) # geophysical variables maxg <- 20 # maximum number of clusters per component metodo <- "kmeansbic" # clustering 
algorithm type. Options: medoides, elbow, calinski, ssi, bic iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. -} # run the script message("Clustering and creating maps") @@ -316,7 +408,7 @@ datanatax <- FALSE # whether the NA values in genus, species or subtaxa will be mapaelcf <- TRUE # Note: Will an ELC map from a previous execution of the ELCmapas tool be used as an additional factor for classifying the taxonomic ranks for the complementarity analysis? mapaelc <- "mapa_elc_world.grd" #Only applies if mapaelcf=TRUE, mapaelc must contain the name of the ELC map obtained by previously using the ELCmapas tool (.grd and .gri files that must always be in the CAPFITOGEN3/ELCmapas folder) datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in NA zones on the ELC map will be taken into account or not (FALSE) -data0elc <- FALSE #Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) +data0elc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) } # run the script @@ -327,3 +419,6 @@ source(file.path(Dir.Capfitogen, setwd(Dir.Base) +message(" - - - end of capfitogen script - - - ") + +# end From 4d707fdba2c3ba038db5ec9207fb2f13850cf7b6 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 10 Apr 2025 17:22:22 +0200 Subject: [PATCH 43/72] draft capfitogen default data download --- R_scripts/SHARED-Data.R | 91 ++++++++++++++++++++++++++++++++++++++++- capfitogen_master.R | 22 ++++++---- 2 files changed, 104 insertions(+), 9 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 2d1ad6b..d6782c7 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -445,8 +445,97 @@ FUN.DownBV <- function( BV_ras } + +# CAPFITOGEN DATA 
DOWNLOAD ---------------------------------------------------- +# Download the standard data from CAPFITOGEN for the globe. +FUN.DownCAPFITOGEN <- + function(Dir = getwd(), + Force = FALSE, + resample_to_match = FALSE) { + + # define a file name + FNAME <- file.path(Dir, "capfitogen.nc") + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + capfitogen_rasters <- rast(FNAME) + message( + paste(FNAME, "exists already. It has been loaded from the disk. + If you wish to override the present data, please specify Force = TRUE") + ) + return(capfitogen_rasters) + } + + # if Force=TRUE or the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + ## download data from Capfitogen Google drive ---- + message("Start downloading 10x10 data from Capfitogen google drive:") + googledrive_data_files <- read.csv( + file.path(Dir.Data.Capfitogen, "capfitogen_world_data_googledrive_links.csv")) + + dir.create(path = paste0(Dir, "/capfitogen"), + showWarnings = FALSE) + + for (i in 1:nrow(googledrive_data_files)) { + capfitogen_data_url = paste0( + "https://drive.google.com/uc?export=download&id=", + googledrive_data_files$id[i]) + file_name = googledrive_data_files$name[i] + download.file(url = capfitogen_data_url, + destfile = paste0(Dir, "/capfitogen/", + googledrive_data_files$name[i])) + } + + # list the downloaded files + file_list <- list.files(paste0(Dir.Data.Envir, + "/capfitogen")) + # read in as a list of rasters + rasters <- list() + for (i in 1:length(file_list)) { + file_path_i <- file.path(Dir.Data.Envir,"capfitogen", + file_list[i]) + rasters[[i]] <- rast(file_path_i) + } + # rename rasters + names(rasters) <- googledrive_data_files$name + + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project 
downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + + for (i in 1:length(rasters)) { + terra::crs(rasters[[i]]) <- projection_to_match + } + + ## resample + # Error: [resample] warp failure + # In addition: Warning messages: + # 1: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif:Using code not yet in table (GDAL error 1) + # 2: TIFFReadEncodedStrip() failed. (GDAL error 1) + # 3: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed. (GDAL error 1) + for (i in 1:length(rasters)) { + rasters[[i]] <- terra::resample(rasters[[i]], + resample_to_match) + } + + # combine the individual rasters into a NetCDF file + message(paste0("saving as netCDF:", FNAME)) + terra::writeCDF(rasters, + filename = FNAME, + overwrite = FALSE) + rasters + + } + } + # EDAPHIC DATA DOWNLOAD ------------------------------------------------------- -# INCOMPLETE! Works for some variables, but the data set is incomplete. +# NB! Works for some variables, but the data set is incomplete. +# This function should be modified to add or replace data for capfitogen. 
FUN.DownEV <- function(Dir = getwd(), # where to store the data output on disk target_resolution = c(250, 250), diff --git a/capfitogen_master.R b/capfitogen_master.R index 1196095..9fd8b1a 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -82,9 +82,9 @@ source(file.path(Dir.Scripts, "ModGP-commonlines.R")) } # DATA ==================================================================== -message(paste("-----------------------------", - " starting data download/load ", - "-----------------------------", +message(paste("------------------------------", + " starting data download/load ", + "------------------------------", sep = "\n")) ## Run SHARED-Data script ------------------------------------------------- @@ -151,6 +151,14 @@ BioClim_names <- c( "BIO19_Precipitation_of_Coldest_Quarter") names(bioclim_variables) <- BioClim_names +### CAPFITOGEN data ------ +# download the default data from CAPFITOGEN. +FUN.DownCAPFITOGEN( + Dir = Dir.Data.Envir, + Force = FALSE, + resample_to_match = bioclim_variables[[1]] +) + ### Edaphic data ------ ## NB! each file at 250x250m is ~20GB... 
message("Downloading new or loading existing edaphic/soil variables") @@ -292,9 +300,8 @@ if (nrow(resol[resol$resol == "9x9",]) < 1) { data.frame( resolucion = "celdas 9x9 km aprox (4.5 arc-min)", resol = "9x9", - resoldec = 0.075 - ) - ) + resoldec = 0.075)) + save(resol, file = file.path(Dir.Capfitogen, "resol.RData")) } rm(resol) @@ -335,8 +342,7 @@ if (nrow(geophys[geophys$VARCODE == "wind_max", ]) < 1) { file.rename( from = file.path( "Capfitogen/rdatamaps/world/9x9", - paste0(from_name, ".tif") - ), + paste0(from_name, ".tif")), to = file.path("Capfitogen/rdatamaps/world/9x9", paste0(to_name, ".tif")) ) From dbe7efe9b00dd0d90c15cff3e1d6853ecd4ab64f Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 11 Apr 2025 14:17:13 +0200 Subject: [PATCH 44/72] update script --- R_scripts/SHARED-Data.R | 160 +++++++++++++++++++--------------------- 1 file changed, 77 insertions(+), 83 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index d6782c7..9cc667d 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -448,88 +448,95 @@ FUN.DownBV <- function( # CAPFITOGEN DATA DOWNLOAD ---------------------------------------------------- # Download the standard data from CAPFITOGEN for the globe. -FUN.DownCAPFITOGEN <- +FUN.DownCAPFITOGEN <- function(Dir = getwd(), Force = FALSE, resample_to_match = FALSE) { - # define a file name - FNAME <- file.path(Dir, "capfitogen.nc") + FNAME <- file.path("Data/Environment/capfitogen.nc") # check if file already exists and whether to overwrite if (!Force & file.exists(FNAME)) { capfitogen_rasters <- rast(FNAME) message( - paste(FNAME, "exists already. It has been loaded from the disk. - If you wish to override the present data, please specify Force = TRUE") + paste( + FNAME, + "exists already. It has been loaded from the disk. 
+ If you wish to override the present data, please specify Force = TRUE" ) + ) return(capfitogen_rasters) } # if Force=TRUE or the file doesn't already exist: if (Force | !file.exists(FNAME)) { ## download data from Capfitogen Google drive ---- - message("Start downloading 10x10 data from Capfitogen google drive:") + message("Start downloading 10x10 data from Capfitogen google drive.") googledrive_data_files <- read.csv( - file.path(Dir.Data.Capfitogen, "capfitogen_world_data_googledrive_links.csv")) + file.path( + Dir.Data.Capfitogen, + "capfitogen_world_data_googledrive_links.csv" + ) + ) + + if (!missing(resample_to_match)) { + message(paste0("Resampling rasters to match ", names(resample_to_match))) + } dir.create(path = paste0(Dir, "/capfitogen"), showWarnings = FALSE) - + # download each file separately by google id for (i in 1:nrow(googledrive_data_files)) { capfitogen_data_url = paste0( "https://drive.google.com/uc?export=download&id=", - googledrive_data_files$id[i]) + googledrive_data_files$id[i] + ) file_name = googledrive_data_files$name[i] - download.file(url = capfitogen_data_url, - destfile = paste0(Dir, "/capfitogen/", - googledrive_data_files$name[i])) + download.file( + url = capfitogen_data_url, + destfile = paste0(Dir, "/capfitogen/", googledrive_data_files$name[i]) + ) } # list the downloaded files - file_list <- list.files(paste0(Dir.Data.Envir, - "/capfitogen")) - # read in as a list of rasters - rasters <- list() - for (i in 1:length(file_list)) { - file_path_i <- file.path(Dir.Data.Envir,"capfitogen", - file_list[i]) - rasters[[i]] <- rast(file_path_i) - } - # rename rasters - names(rasters) <- googledrive_data_files$name + file_list <- list.files(paste0(Dir.Data.Envir, "/capfitogen")) - ### resample ---- - ## if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { - message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- 
rast(resample_to_match) - - ## project downloaded rasters to match resample_to_match file - projection_to_match <- terra::crs(resample_to_match) - - for (i in 1:length(rasters)) { - terra::crs(rasters[[i]]) <- projection_to_match - } + # read in and format rasters one by one from file name + rasters <- NULL + for (i in 1:length(file_list)) { + file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) + raster_i <- rast(file_path_i) + # rename raster + names(raster_i) <- googledrive_data_files$name[i] - ## resample - # Error: [resample] warp failure - # In addition: Warning messages: - # 1: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif:Using code not yet in table (GDAL error 1) - # 2: TIFFReadEncodedStrip() failed. (GDAL error 1) - # 3: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed. (GDAL error 1) - for (i in 1:length(rasters)) { - rasters[[i]] <- terra::resample(rasters[[i]], - resample_to_match) + # resample + # if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + + terra::crs(raster_i) <- projection_to_match + + raster_i <- terra::resample(raster_i, resample_to_match) + } - - # combine the individual rasters into a NetCDF file + } + + ## resample + # Error: [resample] warp failure + # In addition: Warning messages: + # 1: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif:Using code not yet in table (GDAL error 1) + # 2: TIFFReadEncodedStrip() failed. 
(GDAL error 1) + # 3: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed. (GDAL error 1) + + rasters <- c(rasters, raster_i) + + # save rasters as a NetCDF file message(paste0("saving as netCDF:", FNAME)) - terra::writeCDF(rasters, - filename = FNAME, - overwrite = FALSE) + terra::writeCDF(rasters, filename = FNAME, overwrite = FALSE) rasters - } } @@ -805,11 +812,9 @@ FUN.DownGV <- #' The World Database on Protected Areas (WDPA) [Online], February 2025, #' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. #' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za -FUN.DownWDPA <- function( - wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", - wdpa_destination = file.path(Dir.Capfitogen.WDPA, - "WDPA_Feb2025_Public_shp.zip"), - Force = FALSE) { +FUN.DownWDPA <- function(wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", + wdpa_destination = file.path(Dir.Capfitogen.WDPA, "WDPA_Feb2025_Public_shp.zip"), + Force = FALSE) { # set a path to wdpa shapefiles wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") @@ -830,11 +835,10 @@ FUN.DownWDPA <- function( destfile = wdpa_destination, cacheOK = FALSE) } - + # unzip files message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) - unzip(zipfile = wdpa_destination, - exdir = Dir.Capfitogen.WDPA) + unzip(zipfile = wdpa_destination, exdir = Dir.Capfitogen.WDPA) # unzip split shapefile downloads message("unzipping shapefiles split in download") @@ -852,8 +856,7 @@ FUN.DownWDPA <- function( "WDPA_Feb2025_Public_shp-polygons.shx" ) - shapefile_paths <- file.path(wdpa_path, - shapefile_names) + shapefile_paths <- file.path(wdpa_path, shapefile_names) # loop over zip directories 
with parts of the global data (numbered 0, 1, 2) for (i in 0:2) { @@ -863,17 +866,12 @@ FUN.DownWDPA <- function( paste0("WDPA_Feb2025_Public_shp_", i, ".zip")) # unzip the directory containing shapefiles - unzip(zipfile = zipfilename, - exdir = wdpa_path) - message(paste0("unzipped ", zipfilename, - "\nto ", wdpa_path)) + unzip(zipfile = zipfilename, exdir = wdpa_path) + message(paste0("unzipped ", zipfilename, "\nto ", wdpa_path)) # rename shapefiles with numbers to prevent overwriting them - new_shapefile_names <- file.path(wdpa_path, - paste0(i, "_", - shapefile_names)) - file.rename(from = shapefile_paths, - to = new_shapefile_names) + new_shapefile_names <- file.path(wdpa_path, paste0(i, "_", shapefile_names)) + file.rename(from = shapefile_paths, to = new_shapefile_names) } # delete unnecessary files @@ -881,23 +879,17 @@ FUN.DownWDPA <- function( files_to_keep <- c( wdpa_path, wdpa_destination, - file.path(Dir.Capfitogen.WDPA, - "WDPA_sources_Feb2025.csv")) + file.path(Dir.Capfitogen.WDPA, "WDPA_sources_Feb2025.csv") + ) files_to_delete <- - list.files(Dir.Capfitogen.WDPA, - full.names = TRUE)[list.files(Dir.Capfitogen.WDPA, - full.names = TRUE) %nin% files_to_keep] - file.remove(files_to_delete, - recursive = TRUE) + list.files(Dir.Capfitogen.WDPA, full.names = TRUE)[list.files(Dir.Capfitogen.WDPA, full.names = TRUE) %nin% files_to_keep] + file.remove(files_to_delete, recursive = TRUE) # prepare list of shapefiles to be combined wdpa_polygon_shapefiles <- # list polygon shapefiles in WDPA directory - substr(unique(sub("\\..*", "", - list.files(wdpa_path)[grep(pattern = "polygon", - x = shapefile_names)])), - 3, 34) + substr(unique(sub("\\..*", "", list.files(wdpa_path)[grep(pattern = "polygon", x = shapefile_names)])), 3, 34) shapefile_list <- list() @@ -915,12 +907,14 @@ FUN.DownWDPA <- function( # wdpa$WDPAID <- as.character(wdpa$WDPAID) # wdpa$text_field <- iconv(wdpa$text_field, to = "ASCII//TRANSLIT") - + # save complete wdpa message("save as 
GeoPackage") st_write(wdpa, file.path(wdpa_path, "global_wdpa_polygons.gpkg")) - message(paste0("global WDPA saved as: ", - file.path(wdpa_path, "global_wdpa_polygons.gpkg"))) + message(paste0( + "global WDPA saved as: ", + file.path(wdpa_path, "global_wdpa_polygons.gpkg") + )) message("save as shapefile") #st_write(wdpa, FNAME) From 9b8df07d8b7d53e7adeb347603d3be9dfb4445f4 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 11 Apr 2025 18:25:20 +0200 Subject: [PATCH 45/72] add capfitogen download links --- .gitignore | 3 +- ...apfitogen_world_data_googledrive_links.csv | 51 +++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) create mode 100644 Data/Capfitogen/capfitogen_world_data_googledrive_links.csv diff --git a/.gitignore b/.gitignore index 236e82d..2e7bc95 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,7 @@ rsconnect/ *.zip !Data/*/GlobalAreaCRS.RData Data/Environment/soil/ +!**/capfitogen_world_data_googledrive_links.csv Logo/* @@ -77,8 +78,6 @@ results/ELCmap/ results/ # Downloads -R_scripts/ELCmapas.R Capfitogen/* -Data/Capfitogen/ hq diff --git a/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv b/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv new file mode 100644 index 0000000..24acc7f --- /dev/null +++ b/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv @@ -0,0 +1,51 @@ +id,url,name +158aVBGAigZMKlT1ZEVkazC3fVaNYub37,https://drive.google.com/file/d/158aVBGAigZMKlT1ZEVkazC3fVaNYub37/view?usp=drive_link,alt.tif +1hYORb-5f8Exr-HETJGY0y5EgXltTu2OS,https://drive.google.com/file/d/1hYORb-5f8Exr-HETJGY0y5EgXltTu2OS/view?usp=drive_link,aspect.tif +1FBp4YWBQ1L9Fb8FnwXY9m0Qsb0pkUBnJ,https://drive.google.com/file/d/1FBp4YWBQ1L9Fb8FnwXY9m0Qsb0pkUBnJ/view?usp=drive_link,bio_1.tif +1TNq-bbgmWLByxYpn0P1Mx4Zje0u8JOi0,https://drive.google.com/file/d/1TNq-bbgmWLByxYpn0P1Mx4Zje0u8JOi0/view?usp=drive_link,bio_10.tif 
+10-te0nlCGPdnEHO1rzLm-CEjXswpdk8i,https://drive.google.com/file/d/10-te0nlCGPdnEHO1rzLm-CEjXswpdk8i/view?usp=drive_link,bio_11.tif +1TtKD8KDXZK97SOA5KMYXHRr2gnRUmTdY,https://drive.google.com/file/d/1TtKD8KDXZK97SOA5KMYXHRr2gnRUmTdY/view?usp=drive_link,bio_12.tif +1PV0vbKuiwmrsZXBpVbSRaMFdwLJnPF14,https://drive.google.com/file/d/1PV0vbKuiwmrsZXBpVbSRaMFdwLJnPF14/view?usp=drive_link,bio_13.tif +1ld82q6efKbNo6aUNyPjxhq_XoIIzgWLL,https://drive.google.com/file/d/1ld82q6efKbNo6aUNyPjxhq_XoIIzgWLL/view?usp=drive_link,bio_14.tif +1n_m6HPV8wI8N6ObCEbu2TqbwKWUy0ilN,https://drive.google.com/file/d/1n_m6HPV8wI8N6ObCEbu2TqbwKWUy0ilN/view?usp=drive_link,bio_15.tif +1gn3HaS3pMT6HOxaVgX-0GII8MXCOPKDD,https://drive.google.com/file/d/1gn3HaS3pMT6HOxaVgX-0GII8MXCOPKDD/view?usp=drive_link,bio_16.tif +1pkBiHhUGWGAfdo3mpOu2XpFcS1_JcNCF,https://drive.google.com/file/d/1pkBiHhUGWGAfdo3mpOu2XpFcS1_JcNCF/view?usp=drive_link,bio_17.tif +1c6BAT3j9S8BGc-pbluNiZIp6jeNAUVaj,https://drive.google.com/file/d/1c6BAT3j9S8BGc-pbluNiZIp6jeNAUVaj/view?usp=drive_link,bio_18.tif +1Xwmq4es_VkVfDvuc0fXuKtEhaYyBx1s7,https://drive.google.com/file/d/1Xwmq4es_VkVfDvuc0fXuKtEhaYyBx1s7/view?usp=drive_link,bio_19.tif +1HvLcfrnyOFz5A16Tm6mPKLuQAmk-n-7_,https://drive.google.com/file/d/1HvLcfrnyOFz5A16Tm6mPKLuQAmk-n-7_/view?usp=drive_link,bio_2.tif +1xxGE7PTh67NKbuPzn2A9cyjLLFNqo5mG,https://drive.google.com/file/d/1xxGE7PTh67NKbuPzn2A9cyjLLFNqo5mG/view?usp=drive_link,bio_3.tif +1itxPIPMx2tU2GfbVSBmhKOgCy9bgfSSb,https://drive.google.com/file/d/1itxPIPMx2tU2GfbVSBmhKOgCy9bgfSSb/view?usp=drive_link,bio_4.tif +14FzTZAmY3-31JXcRBJIrndToexaEAYvd,https://drive.google.com/file/d/14FzTZAmY3-31JXcRBJIrndToexaEAYvd/view?usp=drive_link,bio_5.tif +1HuaYO10P4HXv7N4-JKEWgJ1fjLXtUkmp,https://drive.google.com/file/d/1HuaYO10P4HXv7N4-JKEWgJ1fjLXtUkmp/view?usp=drive_link,bio_6.tif +10qdX1HC98mZgeJBPNYfJdolj8jY_2WrS,https://drive.google.com/file/d/10qdX1HC98mZgeJBPNYfJdolj8jY_2WrS/view?usp=drive_link,bio_7.tif 
+1oe0wf7mXx6d12BYb5HE76dYrUHIvKobJ,https://drive.google.com/file/d/1oe0wf7mXx6d12BYb5HE76dYrUHIvKobJ/view?usp=drive_link,bio_8.tif +1namB4QuAF_qO27VcoJKWw9cgKjrcaAqq,https://drive.google.com/file/d/1namB4QuAF_qO27VcoJKWw9cgKjrcaAqq/view?usp=drive_link,bio_9.tif +1U13MXDHUKAerTB0EzBC0cWQvuL6UOzja,https://drive.google.com/file/d/1U13MXDHUKAerTB0EzBC0cWQvuL6UOzja/view?usp=drive_link,depth_rock.tif +159hPV37wWd-MbYso7eOoupR5KflcqLtW,https://drive.google.com/file/d/159hPV37wWd-MbYso7eOoupR5KflcqLtW/view?usp=drive_link,eastness.tif +14qOehUoZHWNd_7ShILpnuKc1GFyrof_K,https://drive.google.com/file/d/14qOehUoZHWNd_7ShILpnuKc1GFyrof_K/view?usp=drive_link,northness.tif +1w-5b033IfM2zW3h9QD6X8w_tKgrtxFB-,https://drive.google.com/file/d/1w-5b033IfM2zW3h9QD6X8w_tKgrtxFB-/view?usp=drive_link,prec_1.tif +1CMaQqKWooCoBs1co719WPTcE7a_Nctk4,https://drive.google.com/file/d/1CMaQqKWooCoBs1co719WPTcE7a_Nctk4/view?usp=drive_link,prec_10.tif +1lCqZKMAFB7_jbQ_6b1qt40QXkB1stxfX,https://drive.google.com/file/d/1lCqZKMAFB7_jbQ_6b1qt40QXkB1stxfX/view?usp=drive_link,prec_11.tif +16gTlKj9MDbtZYYh0Bbbk1VytSZHD3K3h,https://drive.google.com/file/d/16gTlKj9MDbtZYYh0Bbbk1VytSZHD3K3h/view?usp=drive_link,prec_12.tif +1r656ha9OzAgvXxFb5MLQieo1DrOeqY48,https://drive.google.com/file/d/1r656ha9OzAgvXxFb5MLQieo1DrOeqY48/view?usp=drive_link,prec_2.tif +1t-kXJuLw8kXTRzJ0gyX6AX8qpBLq1Plf,https://drive.google.com/file/d/1t-kXJuLw8kXTRzJ0gyX6AX8qpBLq1Plf/view?usp=drive_link,prec_3.tif +1w74B1FLIw67YMcEtA_d9bc6Pn-ZaqsPM,https://drive.google.com/file/d/1w74B1FLIw67YMcEtA_d9bc6Pn-ZaqsPM/view?usp=drive_link,prec_4.tif +1EsnRGWUcibr4leJ6yWgdAa8rKSREtRQX,https://drive.google.com/file/d/1EsnRGWUcibr4leJ6yWgdAa8rKSREtRQX/view?usp=drive_link,prec_5.tif +1E5LDseUHNyOJiLRTD3bpu2Y4GSyp38aL,https://drive.google.com/file/d/1E5LDseUHNyOJiLRTD3bpu2Y4GSyp38aL/view?usp=drive_link,prec_6.tif +1CnfdKv-YVaCkjYVqD6xRm6vKH9-KYXN7,https://drive.google.com/file/d/1CnfdKv-YVaCkjYVqD6xRm6vKH9-KYXN7/view?usp=drive_link,prec_7.tif 
+1ZO9eCVbTZnHC7IcR36pdibeYXCL088Tl,https://drive.google.com/file/d/1ZO9eCVbTZnHC7IcR36pdibeYXCL088Tl/view?usp=drive_link,prec_8.tif +1935wdE2WNQvcAYvR1eWK29YwyQfNKgEp,https://drive.google.com/file/d/1935wdE2WNQvcAYvR1eWK29YwyQfNKgEp/view?usp=drive_link,prec_9.tif +1kOKakXHKCS3W-9BblHGxmw-756DckwmU,https://drive.google.com/file/d/1kOKakXHKCS3W-9BblHGxmw-756DckwmU/view?usp=drive_link,r_horizon.tif +1FhsbsjtOo8GfOeoFY7YXFRNs5e7JEMd1,https://drive.google.com/file/d/1FhsbsjtOo8GfOeoFY7YXFRNs5e7JEMd1/view?usp=drive_link,ref_depth.tif +1FJKXepMub_k67Cia19nboMO6lcT1i2Uw,https://drive.google.com/file/d/1FJKXepMub_k67Cia19nboMO6lcT1i2Uw/view?usp=drive_link,s_awc1.tif +1lqrQ0z9t34s9jDXQJB1vzt5j4ND1LECN,https://drive.google.com/file/d/1lqrQ0z9t34s9jDXQJB1vzt5j4ND1LECN/view?usp=drive_link,s_awc2.tif +1lTHEqXkpDa7ZHA7ZoNBHMnxkbuXxpO0D,https://drive.google.com/file/d/1lTHEqXkpDa7ZHA7ZoNBHMnxkbuXxpO0D/view?usp=drive_link,s_awc3.tif +1rL7lfRmB6ZOZ05_7DlRvkE9Lnt3-dGgI,https://drive.google.com/file/d/1rL7lfRmB6ZOZ05_7DlRvkE9Lnt3-dGgI/view?usp=drive_link,s_awcts.tif +1z5m20WzAaySOXSQHmhoe7CCnuvP13clR,https://drive.google.com/file/d/1z5m20WzAaySOXSQHmhoe7CCnuvP13clR/view?usp=drive_link,s_bs.tif +1C51EwjLZ5uXjyIHHtFqjv3_ZJg0uGtk4,https://drive.google.com/file/d/1C51EwjLZ5uXjyIHHtFqjv3_ZJg0uGtk4/view?usp=drive_link,s_bulk_dens.tif +1R6foXvUXg-beUBrClcStO_oLrlT5ST7O,https://drive.google.com/file/d/1R6foXvUXg-beUBrClcStO_oLrlT5ST7O/view?usp=drive_link,s_caco3.tif +1PwWIlCFn_OiiRAnbUPl1ZAs1XarYtvPi,https://drive.google.com/file/d/1PwWIlCFn_OiiRAnbUPl1ZAs1XarYtvPi/view?usp=drive_link,s_caso4.tif +1piG-PBTMHBCnnAAa9F-dRiURQ8sbhXiP,https://drive.google.com/file/d/1piG-PBTMHBCnnAAa9F-dRiURQ8sbhXiP/view?usp=drive_link,s_cec_clay.tif +171lvMcH3KsDUjQtcoshqviLirENLiHeD,https://drive.google.com/file/d/171lvMcH3KsDUjQtcoshqviLirENLiHeD/view?usp=drive_link,s_cec_soil.tif 
+1FkWiJzrDX5SgQEESxwTeyvsNgnwWpIoQ,https://drive.google.com/file/d/1FkWiJzrDX5SgQEESxwTeyvsNgnwWpIoQ/view?usp=drive_link,s_cecsol.tif +1EHyYvwWlDJNE7W3FaObILJIae_pLF_sl,https://drive.google.com/file/d/1EHyYvwWlDJNE7W3FaObILJIae_pLF_sl/view?usp=drive_link,s_clay.tif From 1da1dca8675f0ab2919711073f208cd965de717e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 14 Apr 2025 13:55:16 +0200 Subject: [PATCH 46/72] move data download functions out of main scripts --- R_scripts/legacy-data-download.R | 293 +++++++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 R_scripts/legacy-data-download.R diff --git a/R_scripts/legacy-data-download.R b/R_scripts/legacy-data-download.R new file mode 100644 index 0000000..3e731a8 --- /dev/null +++ b/R_scripts/legacy-data-download.R @@ -0,0 +1,293 @@ +# Legacy code for downloading custom edaphic (soil) and geophysical data + +# by Eva Lieungh + +# this scripts defines functions in the same style as in SHARED-Data.R. +# Example use: +# +# ### Edaphic data ------ +# ## NB! each file at 250x250m is ~20GB... +# message("Downloading new or loading existing edaphic/soil variables") +# +# edaphic_variables <- FUN.DownEV( +# Dir = Dir.Data.Envir, +# target_resolution = c(250, 250), +# Force = FALSE, +# resample_to_match = bioclim_variables[[1]] +# ) +# +# ### Geophysical data ------ +# message("Downloading new or loading existing geophysical variables") +# geophysical_variables <- FUN.DownGV( +# Dir = Dir.Data.Envir, +# Force = FALSE, +# resample_to_match = bioclim_variables[[1]] +# ) + + +# EDAPHIC DATA DOWNLOAD ------------------------------------------------------- +# NB! Works for some variables, but the data set is incomplete. +# This function should be modified to add or replace data for capfitogen. 
+FUN.DownEV <- + function(Dir = getwd(), # where to store the data output on disk + target_resolution = c(250, 250), + Force = FALSE, # do not overwrite already present data, + resample_to_match = FALSE) { + # define a file name + FNAME <- file.path(Dir, "edaphic.nc") + message(FNAME) + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + EV_ras <- rast(FNAME) + message( + "Data has already been downloaded. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + ) + return(EV_ras) + } + + # if Force=TRUE or the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + ## download data from SoilGrids ---- + message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") + soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" + + #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean + #' NB! Each global map occupies circa 20 GB for 250x20m resolution! + #' It takes a while to download. + #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ + #' has maps of soil types, as estimated probability of occurrence per type. + #' MostProbable.vrt has the most probable soil type per gridcell. 
+ #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 + + SoilGrids_variables_in <- + c("bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ + "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg + "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg + "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 + "ocd/ocd_0-5cm_mean", # Organic carbon density hg/m³ + "ocs/ocs_0-30cm_mean", # Organic carbon stocks t/ha + "soc/soc_0-5cm_mean") # Soil organic carbon content in the fine earth fraction dg/kg + + SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) + + soilGrids_data <- NULL + + for (i in 1:length(SoilGrids_variables_in)) { + variable_name = SoilGrids_variables[i] + + message(SoilGrids_variables[i]) + + path_to_downloaded_file <- paste0(Dir.Data.Envir, "/", + SoilGrids_variables[i], ".tif") + + # if variable is not downloaded already, ... + ifelse( + !file.exists(path_to_downloaded_file), + # download it, ... 
+ downloaded_variable <- gdalUtilities::gdal_translate( + src_dataset = paste0(soilGrids_url, + SoilGrids_variables_in[i], ".vrt"), + dst_dataset = path_to_downloaded_file, + tr = target_resolution # target resolution + ), + # or else load it from file + downloaded_variable <- path_to_downloaded_file + ) + + ## load variable as raster + downloaded_raster <- rast(downloaded_variable) + plot(downloaded_raster, main = SoilGrids_variables[i]) + + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(downloaded_raster) <- projection_to_match + + ## resample + downloaded_raster <- + terra::resample(downloaded_raster, + resample_to_match) + } + + soilGrids_data <- c(soilGrids_data, downloaded_raster) + + } + + ## HSWD downloads ---- + ## download additional rasters from HSWD + message("Downloading data from HSWD (harmonised world soil database) via fao.org") + + path_to_PH_nutrient = file.path(Dir, "HSWD_PH_nutrient.tif") + if (!file.exists(path_to_PH_nutrient)) { + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc", + destfile = path_to_PH_nutrient) + } + PH_nutrient <- rast(path_to_PH_nutrient) + + path_to_PH_toxicity = file.path(Dir, "HSWD_PH_toxicity.tif") + if (missing(path_to_PH_toxicity)) { + message("downloading HSWD PH toxicity") + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc", + destfile = path_to_PH_toxicity) + } + PH_toxicity <- rast(path_to_PH_toxicity) + + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + 
message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(PH_nutrient) <- projection_to_match + terra::crs(PH_toxicity) <- projection_to_match + + ## resample + PH_nutrient <- + terra::resample(PH_nutrient, + resample_to_match) + + PH_toxicity <- + terra::resample(PH_toxicity, + resample_to_match) + + } + + ### combine and rename rasters ---- + EV_rasters <- rast(c(soilGrids_data, + PH_nutrient, PH_toxicity)) + + names(EV_rasters) <- c(SoilGrids_variables, + "Nutrient", "Toxicity") + + ### Saving ---- + message(paste0("saving as netCDF:", FNAME)) + terra::writeCDF(EV_rasters, + filename = FNAME, + overwrite = FALSE) + + EV_rasters + + } + } + +# GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- +#' define a function to load existing data or download them to +#' the given directory. +FUN.DownGV <- + function(Dir = getwd(),# where to store the data output on disk + Force = FALSE,# do not overwrite already present data, + resample_to_match = FALSE) { + # define a file name + FNAME <- file.path(Dir, "geophysical.nc") + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + GV_raster <- rast(FNAME) + names(GV_raster) <- c("elevation", + "mean_wind_speed_of_windiest_month") + message( + "Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + ) + + return(GV_raster) + + } + + # if the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + message("downloading or loading geophysical data") + + ## Download digital elevation model (DEM) ------ + ##' Fick, S.E. and R.J. Hijmans, 2017. 
WorldClim 2 + ##' https://doi.org/10.1002/joc.5086 + message("- digital elevation model") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_elev.tif"))) { + worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" + temp <- tempfile() + download.file(worldclim_dem_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + } + dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) + names(dem) <- "Elevacion" + + ## Download wind speed ------ + ##' WorldClim 2 + message("- mean wind speed of windiest month (annual max of monthly means)") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_wind_max.tif"))) { + worldclim_wind_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_wind.zip" + temp <- tempfile() + download.file(worldclim_wind_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + + ## read monthly wind speed and find annual max of monthly means + month_numbers = c(paste0("0", 2:9), as.character(10:12)) + wind_stack <- rast(paste0(Dir, "/wc2.1_2.5m_wind_01.tif")) + for (i in month_numbers) { + raster_i = rast(paste0(Dir, "/wc2.1_2.5m_wind_", i, ".tif")) + wind_stack <- c(wind_stack, raster_i) + } + max_wind <- terra::app(wind_stack, max) + writeRaster(max_wind, + filename = paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + } + wind <- rast(paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + names(wind) <- "mean_wind_speed_of_windiest_month" + + ## Resample ------ + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(dem) <- projection_to_match + terra::crs(wind) <- projection_to_match + message("projected to match input") + + ## resample + dem <- terra::resample(dem, + resample_to_match) + 
message("dem successfully resampled") + + wind <- terra::resample(wind, + resample_to_match) + message("wind successfully resampled") + } + + ### combine rasters + geophysical_rasters <- c(dem, wind) + + ## Saving ---- + message("saving as NetCDF") + terra::writeCDF(geophysical_rasters, + filename = FNAME, + overwrite = TRUE) + + geophysical_rasters + } + } + +# WGS84 = EPSG:4326 +## Download digital elevation model (DEM) from +##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled +##' seamless SRTM data V4, International Centre for Tropical Agriculture +##' (CIAT), available from http://srtm.csi.cgiar.org. +#dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") + From 59e08613fc10e937a6ee7445784dcd9e7379bfb0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 14 Apr 2025 13:56:00 +0200 Subject: [PATCH 47/72] restructuring --- R_scripts/SHARED-Data.R | 304 +++------------------------------------- capfitogen_master.R | 118 +++++++--------- 2 files changed, 72 insertions(+), 350 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 9cc667d..cf2dd6c 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -479,24 +479,32 @@ FUN.DownCAPFITOGEN <- ) ) - if (!missing(resample_to_match)) { - message(paste0("Resampling rasters to match ", names(resample_to_match))) - } + # message("installing gdown with pip install") + # system("pip install gdown") dir.create(path = paste0(Dir, "/capfitogen"), showWarnings = FALSE) + # download each file separately by google id for (i in 1:nrow(googledrive_data_files)) { capfitogen_data_url = paste0( "https://drive.google.com/uc?export=download&id=", googledrive_data_files$id[i] ) + file_name = googledrive_data_files$name[i] - download.file( - url = capfitogen_data_url, - destfile = paste0(Dir, "/capfitogen/", googledrive_data_files$name[i]) - ) + + message(file_name) + cmd <- paste0("gdown ", capfitogen_data_url) + #cmd <- sprintf("gdown 
https://drive.google.com/uc?id=%s", id) + system(cmd) + # + # download.file( + # url = capfitogen_data_url, + # destfile = paste0(Dir, "/capfitogen/", googledrive_data_files$name[i]) + # ) } + # file_ids <- readLines("file_ids.txt") # list the downloaded files file_list <- list.files(paste0(Dir.Data.Envir, "/capfitogen")) @@ -511,7 +519,11 @@ FUN.DownCAPFITOGEN <- # resample # if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { + if (resample_to_match == FALSE) { + message("No resampling") + } else { + message(paste0("Resampling rasters to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) ## project downloaded rasters to match resample_to_match file @@ -520,17 +532,10 @@ FUN.DownCAPFITOGEN <- terra::crs(raster_i) <- projection_to_match raster_i <- terra::resample(raster_i, resample_to_match) - } } - ## resample - # Error: [resample] warp failure - # In addition: Warning messages: - # 1: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif:Using code not yet in table (GDAL error 1) - # 2: TIFFReadEncodedStrip() failed. (GDAL error 1) - # 3: C:/Users/evaler/OneDrive - Universitetet i Oslo/Eva/Artikler/_BioDT-CWR-capfitogen/base/uc-CWR/Data/Environment/capfitogen/prec_1.tif, band 1: IReadBlock failed at X offset 0, Y offset 0: TIFFReadEncodedStrip() failed. (GDAL error 1) - + message(paste("adding raster", names(raster_i))) rasters <- c(rasters, raster_i) # save rasters as a NetCDF file @@ -540,273 +545,6 @@ FUN.DownCAPFITOGEN <- } } -# EDAPHIC DATA DOWNLOAD ------------------------------------------------------- -# NB! Works for some variables, but the data set is incomplete. -# This function should be modified to add or replace data for capfitogen. 
-FUN.DownEV <- - function(Dir = getwd(), # where to store the data output on disk - target_resolution = c(250, 250), - Force = FALSE, # do not overwrite already present data, - resample_to_match = FALSE) { - # define a file name - FNAME <- file.path(Dir, "edaphic.nc") - message(FNAME) - - # check if file already exists and whether to overwrite - if (!Force & file.exists(FNAME)) { - EV_ras <- rast(FNAME) - message( - "Data has already been downloaded. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" - ) - return(EV_ras) - } - - # if Force=TRUE or the file doesn't already exist: - if (Force | !file.exists(FNAME)) { - ## download data from SoilGrids ---- - message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") - soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" - - #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean - #' NB! Each global map occupies circa 20 GB for 250x20m resolution! - #' It takes a while to download. - #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ - #' has maps of soil types, as estimated probability of occurrence per type. - #' MostProbable.vrt has the most probable soil type per gridcell. 
- #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 - - SoilGrids_variables_in <- - c("bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ - "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg - "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) - "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg - "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg - "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg - "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg - "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 - "ocd/ocd_0-5cm_mean", # Organic carbon density hg/m³ - "ocs/ocs_0-30cm_mean", # Organic carbon stocks t/ha - "soc/soc_0-5cm_mean") # Soil organic carbon content in the fine earth fraction dg/kg - - SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) - - soilGrids_data <- NULL - - for (i in 1:length(SoilGrids_variables_in)) { - variable_name = SoilGrids_variables[i] - - message(SoilGrids_variables[i]) - - path_to_downloaded_file <- paste0(Dir.Data.Envir, "/", - SoilGrids_variables[i], ".tif") - - # if variable is not downloaded already, ... - ifelse( - !file.exists(path_to_downloaded_file), - # download it, ... 
- downloaded_variable <- gdalUtilities::gdal_translate( - src_dataset = paste0(soilGrids_url, - SoilGrids_variables_in[i], ".vrt"), - dst_dataset = path_to_downloaded_file, - tr = target_resolution # target resolution - ), - # or else load it from file - downloaded_variable <- path_to_downloaded_file - ) - - ## load variable as raster - downloaded_raster <- rast(downloaded_variable) - plot(downloaded_raster, main = SoilGrids_variables[i]) - - ### resample ---- - ## if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { - message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- rast(resample_to_match) - - ## project downloaded rasters to match resample_to_match file - projection_to_match <- terra::crs(resample_to_match) - terra::crs(downloaded_raster) <- projection_to_match - - ## resample - downloaded_raster <- - terra::resample(downloaded_raster, - resample_to_match) - } - - soilGrids_data <- c(soilGrids_data, downloaded_raster) - - } - - ## HSWD downloads ---- - ## download additional rasters from HSWD - message("Downloading data from HSWD (harmonised world soil database) via fao.org") - - path_to_PH_nutrient = file.path(Dir, "HSWD_PH_nutrient.tif") - if (!file.exists(path_to_PH_nutrient)) { - download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc", - destfile = path_to_PH_nutrient) - } - PH_nutrient <- rast(path_to_PH_nutrient) - - path_to_PH_toxicity = file.path(Dir, "HSWD_PH_toxicity.tif") - if (missing(path_to_PH_toxicity)) { - message("downloading HSWD PH toxicity") - download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc", - destfile = path_to_PH_toxicity) - } - PH_toxicity <- rast(path_to_PH_toxicity) - - ### resample ---- - ## if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { - 
message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- rast(resample_to_match) - - ## project downloaded rasters to match resample_to_match file - projection_to_match <- terra::crs(resample_to_match) - terra::crs(PH_nutrient) <- projection_to_match - terra::crs(PH_toxicity) <- projection_to_match - - ## resample - PH_nutrient <- - terra::resample(PH_nutrient, - resample_to_match) - - PH_toxicity <- - terra::resample(PH_toxicity, - resample_to_match) - - } - - ### combine and rename rasters ---- - EV_rasters <- rast(c(soilGrids_data, - PH_nutrient, PH_toxicity)) - - names(EV_rasters) <- c(SoilGrids_variables, - "Nutrient", "Toxicity") - - ### Saving ---- - message(paste0("saving as netCDF:", FNAME)) - terra::writeCDF(EV_rasters, - filename = FNAME, - overwrite = FALSE) - - EV_rasters - - } - } - -# GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- -#' define a function to load existing data or download them to -#' the given directory. -FUN.DownGV <- - function(Dir = getwd(),# where to store the data output on disk - Force = FALSE,# do not overwrite already present data, - resample_to_match = FALSE) { - # define a file name - FNAME <- file.path(Dir, "geophysical.nc") - - # check if file already exists and whether to overwrite - if (!Force & file.exists(FNAME)) { - GV_raster <- rast(FNAME) - names(GV_raster) <- c("elevation", - "mean_wind_speed_of_windiest_month") - message( - "Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" - ) - - return(GV_raster) - - } - - # if the file doesn't already exist: - if (Force | !file.exists(FNAME)) { - message("downloading or loading geophysical data") - - ## Download digital elevation model (DEM) ------ - ##' Fick, S.E. and R.J. Hijmans, 2017. 
WorldClim 2 - ##' https://doi.org/10.1002/joc.5086 - message("- digital elevation model") - if (!file.exists(paste0(Dir, "/wc2.1_2.5m_elev.tif"))) { - worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" - temp <- tempfile() - download.file(worldclim_dem_url, temp) - unzip(zipfile = temp, - exdir = Dir) - unlink(temp) - } - dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) - names(dem) <- "Elevacion" - - ## Download wind speed ------ - ##' WorldClim 2 - message("- mean wind speed of windiest month (annual max of monthly means)") - if (!file.exists(paste0(Dir, "/wc2.1_2.5m_wind_max.tif"))) { - worldclim_wind_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_wind.zip" - temp <- tempfile() - download.file(worldclim_wind_url, temp) - unzip(zipfile = temp, - exdir = Dir) - unlink(temp) - - ## read monthly wind speed and find annual max of monthly means - month_numbers = c(paste0("0", 2:9), as.character(10:12)) - wind_stack <- rast(paste0(Dir, "/wc2.1_2.5m_wind_01.tif")) - for (i in month_numbers) { - raster_i = rast(paste0(Dir, "/wc2.1_2.5m_wind_", i, ".tif")) - wind_stack <- c(wind_stack, raster_i) - } - max_wind <- terra::app(wind_stack, max) - writeRaster(max_wind, - filename = paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) - } - wind <- rast(paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) - names(wind) <- "mean_wind_speed_of_windiest_month" - - ## Resample ------ - ## if provided, resample to match another raster object's origin and resolution - if (!missing(resample_to_match)) { - message(paste0("resampling raster to match ", names(resample_to_match))) - resample_to_match <- rast(resample_to_match) - - ## project downloaded rasters to match resample_to_match file - projection_to_match <- terra::crs(resample_to_match) - terra::crs(dem) <- projection_to_match - terra::crs(wind) <- projection_to_match - message("projected to match input") - - ## resample - dem <- terra::resample(dem, - resample_to_match) - 
message("dem successfully resampled") - - wind <- terra::resample(wind, - resample_to_match) - message("wind successfully resampled") - } - - ### combine rasters - geophysical_rasters <- c(dem, wind) - - ## Saving ---- - message("saving as NetCDF") - terra::writeCDF(geophysical_rasters, - filename = FNAME, - overwrite = TRUE) - - geophysical_rasters - } - } - -# WGS84 = EPSG:4326 -## Download digital elevation model (DEM) from -##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled -##' seamless SRTM data V4, International Centre for Tropical Agriculture -##' (CIAT), available from http://srtm.csi.cgiar.org. -#dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") - - # WORLD DATABASE ON PROTECTED AREAS --------------------------------------- #' UNEP-WCMC and IUCN (2025), Protected Planet: #' The World Database on Protected Areas (WDPA) [Online], February 2025, diff --git a/capfitogen_master.R b/capfitogen_master.R index 9fd8b1a..35473bd 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -103,79 +103,60 @@ Species_ls <- FUN.DownGBIF( ) ## Environmental Data ----------------------------------------------------- -### Bioclomatic data ------ -##' 19 BioClim variables -##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) -##' is each file of each variable >20GB? -##' Will this download Global Multi-resolution Terrain Elevation Data -##' (GMTED2010) as well? -##' Temporal coverage: January 1950 to present ? 
-##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview -message("Downloading new or loading existing 19 BioClim bioclimatic variables") - -# Check for existing BioClim data file -existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") -if (file.exists(existing_bioclim_file)) { - message("Using existing BioClim data") - bioclim_variables <- terra::rast(existing_bioclim_file) -} else { - bioclim_variables <- FUN.DownBV( - T_Start = 1999, # what year to begin climatology calculation? - T_End = 1999, # what year to end climatology calculation? - Dir = Dir.Data.Envir, # where to store the data output on disk - Force = FALSE # do not overwrite already present data - ) -} - -# make sure the BioClim data have the correct names -BioClim_names <- c( - # see https://www.worldclim.org/data/bioclim.html - "BIO1_Annual_Mean_Temperature", - "BIO2_Mean_Diurnal_Range", - "BIO3_Isothermality", - "BIO4_Temperature_Seasonality", - "BIO5_Max_Temperature_of_Warmest_Month", - "BIO6_Min_Temperature_of_Coldest_Month", - "BIO7_Temperature_Annual_Range", - "BIO8_Mean_Temperature_of_Wettest_Quarter", - "BIO9_Mean_Temperature_of_Driest_Quarter", - "BIO10_Mean_Temperature_of_Warmest_Quarter", - "BIO11_Mean_Temperature_of_Coldest_Quarter", - "BIO12_Annual_Precipitation", - "BIO13_Precipitation_of_Wettest_Month", - "BIO14_Precipitation_of_Driest_Month", - "BIO15_Precipitation_Seasonality", - "BIO16_Precipitation_of_Wettest_Quarter", - "BIO17_Precipitation_of_Driest_Quarter", - "BIO18_Precipitation_of_Warmest_Quarter", - "BIO19_Precipitation_of_Coldest_Quarter") -names(bioclim_variables) <- BioClim_names +# ### Bioclomatic data ------ +# ##' 19 BioClim variables +# ##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) +# ##' is each file of each variable >20GB? +# ##' Will this download Global Multi-resolution Terrain Elevation Data +# ##' (GMTED2010) as well? +# ##' Temporal coverage: January 1950 to present ? 
+# ##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview +# message("Downloading new or loading existing 19 BioClim bioclimatic variables") +# +# # Check for existing BioClim data file +# existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") +# if (file.exists(existing_bioclim_file)) { +# message("Using existing BioClim data") +# bioclim_variables <- terra::rast(existing_bioclim_file) +# } else { +# bioclim_variables <- FUN.DownBV( +# T_Start = 1999, # what year to begin climatology calculation? +# T_End = 1999, # what year to end climatology calculation? +# Dir = Dir.Data.Envir, # where to store the data output on disk +# Force = FALSE # do not overwrite already present data +# ) +# } +# +# # make sure the BioClim data have the correct names +# BioClim_names <- c( +# # see https://www.worldclim.org/data/bioclim.html +# "BIO1_Annual_Mean_Temperature", +# "BIO2_Mean_Diurnal_Range", +# "BIO3_Isothermality", +# "BIO4_Temperature_Seasonality", +# "BIO5_Max_Temperature_of_Warmest_Month", +# "BIO6_Min_Temperature_of_Coldest_Month", +# "BIO7_Temperature_Annual_Range", +# "BIO8_Mean_Temperature_of_Wettest_Quarter", +# "BIO9_Mean_Temperature_of_Driest_Quarter", +# "BIO10_Mean_Temperature_of_Warmest_Quarter", +# "BIO11_Mean_Temperature_of_Coldest_Quarter", +# "BIO12_Annual_Precipitation", +# "BIO13_Precipitation_of_Wettest_Month", +# "BIO14_Precipitation_of_Driest_Month", +# "BIO15_Precipitation_Seasonality", +# "BIO16_Precipitation_of_Wettest_Quarter", +# "BIO17_Precipitation_of_Driest_Quarter", +# "BIO18_Precipitation_of_Warmest_Quarter", +# "BIO19_Precipitation_of_Coldest_Quarter") +# names(bioclim_variables) <- BioClim_names ### CAPFITOGEN data ------ # download the default data from CAPFITOGEN. FUN.DownCAPFITOGEN( Dir = Dir.Data.Envir, Force = FALSE, - resample_to_match = bioclim_variables[[1]] -) - -### Edaphic data ------ -## NB! each file at 250x250m is ~20GB... 
-message("Downloading new or loading existing edaphic/soil variables") - -edaphic_variables <- FUN.DownEV( - Dir = Dir.Data.Envir, - target_resolution = c(250, 250), - Force = FALSE, - resample_to_match = bioclim_variables[[1]] -) - -### Geophysical data ------ -message("Downloading new or loading existing geophysical variables") -geophysical_variables <- FUN.DownGV( - Dir = Dir.Data.Envir, - Force = FALSE, - resample_to_match = bioclim_variables[[1]] + resample_to_match = FALSE # bioclim_variables[[1]] ) ## Protected areas database ----------------------------------------------- @@ -191,6 +172,9 @@ FUN.DownWDPA( # do not overwrite existing data Force = FALSE) +# if supplied, crop all the data to a map of native species range + + # CAPFITOGEN pipeline ========================================================= message(paste("------------------------------", " starting Capfitogen pipeline ", From 0388e76415521eef40dcbaaa707809d9360c4db2 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 15 Apr 2025 12:23:50 +0200 Subject: [PATCH 48/72] adjust download procedure --- ...apfitogen_world_data_googledrive_links.csv | 51 --- R_scripts/ModGP-commonlines.R | 5 + R_scripts/SHARED-Data.R | 319 ++++++++++-------- R_scripts/legacy-data-download.R | 50 +++ capfitogen_master.R | 86 ++--- 5 files changed, 263 insertions(+), 248 deletions(-) delete mode 100644 Data/Capfitogen/capfitogen_world_data_googledrive_links.csv diff --git a/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv b/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv deleted file mode 100644 index 24acc7f..0000000 --- a/Data/Capfitogen/capfitogen_world_data_googledrive_links.csv +++ /dev/null @@ -1,51 +0,0 @@ -id,url,name -158aVBGAigZMKlT1ZEVkazC3fVaNYub37,https://drive.google.com/file/d/158aVBGAigZMKlT1ZEVkazC3fVaNYub37/view?usp=drive_link,alt.tif -1hYORb-5f8Exr-HETJGY0y5EgXltTu2OS,https://drive.google.com/file/d/1hYORb-5f8Exr-HETJGY0y5EgXltTu2OS/view?usp=drive_link,aspect.tif 
-1FBp4YWBQ1L9Fb8FnwXY9m0Qsb0pkUBnJ,https://drive.google.com/file/d/1FBp4YWBQ1L9Fb8FnwXY9m0Qsb0pkUBnJ/view?usp=drive_link,bio_1.tif -1TNq-bbgmWLByxYpn0P1Mx4Zje0u8JOi0,https://drive.google.com/file/d/1TNq-bbgmWLByxYpn0P1Mx4Zje0u8JOi0/view?usp=drive_link,bio_10.tif -10-te0nlCGPdnEHO1rzLm-CEjXswpdk8i,https://drive.google.com/file/d/10-te0nlCGPdnEHO1rzLm-CEjXswpdk8i/view?usp=drive_link,bio_11.tif -1TtKD8KDXZK97SOA5KMYXHRr2gnRUmTdY,https://drive.google.com/file/d/1TtKD8KDXZK97SOA5KMYXHRr2gnRUmTdY/view?usp=drive_link,bio_12.tif -1PV0vbKuiwmrsZXBpVbSRaMFdwLJnPF14,https://drive.google.com/file/d/1PV0vbKuiwmrsZXBpVbSRaMFdwLJnPF14/view?usp=drive_link,bio_13.tif -1ld82q6efKbNo6aUNyPjxhq_XoIIzgWLL,https://drive.google.com/file/d/1ld82q6efKbNo6aUNyPjxhq_XoIIzgWLL/view?usp=drive_link,bio_14.tif -1n_m6HPV8wI8N6ObCEbu2TqbwKWUy0ilN,https://drive.google.com/file/d/1n_m6HPV8wI8N6ObCEbu2TqbwKWUy0ilN/view?usp=drive_link,bio_15.tif -1gn3HaS3pMT6HOxaVgX-0GII8MXCOPKDD,https://drive.google.com/file/d/1gn3HaS3pMT6HOxaVgX-0GII8MXCOPKDD/view?usp=drive_link,bio_16.tif -1pkBiHhUGWGAfdo3mpOu2XpFcS1_JcNCF,https://drive.google.com/file/d/1pkBiHhUGWGAfdo3mpOu2XpFcS1_JcNCF/view?usp=drive_link,bio_17.tif -1c6BAT3j9S8BGc-pbluNiZIp6jeNAUVaj,https://drive.google.com/file/d/1c6BAT3j9S8BGc-pbluNiZIp6jeNAUVaj/view?usp=drive_link,bio_18.tif -1Xwmq4es_VkVfDvuc0fXuKtEhaYyBx1s7,https://drive.google.com/file/d/1Xwmq4es_VkVfDvuc0fXuKtEhaYyBx1s7/view?usp=drive_link,bio_19.tif -1HvLcfrnyOFz5A16Tm6mPKLuQAmk-n-7_,https://drive.google.com/file/d/1HvLcfrnyOFz5A16Tm6mPKLuQAmk-n-7_/view?usp=drive_link,bio_2.tif -1xxGE7PTh67NKbuPzn2A9cyjLLFNqo5mG,https://drive.google.com/file/d/1xxGE7PTh67NKbuPzn2A9cyjLLFNqo5mG/view?usp=drive_link,bio_3.tif -1itxPIPMx2tU2GfbVSBmhKOgCy9bgfSSb,https://drive.google.com/file/d/1itxPIPMx2tU2GfbVSBmhKOgCy9bgfSSb/view?usp=drive_link,bio_4.tif -14FzTZAmY3-31JXcRBJIrndToexaEAYvd,https://drive.google.com/file/d/14FzTZAmY3-31JXcRBJIrndToexaEAYvd/view?usp=drive_link,bio_5.tif 
-1HuaYO10P4HXv7N4-JKEWgJ1fjLXtUkmp,https://drive.google.com/file/d/1HuaYO10P4HXv7N4-JKEWgJ1fjLXtUkmp/view?usp=drive_link,bio_6.tif -10qdX1HC98mZgeJBPNYfJdolj8jY_2WrS,https://drive.google.com/file/d/10qdX1HC98mZgeJBPNYfJdolj8jY_2WrS/view?usp=drive_link,bio_7.tif -1oe0wf7mXx6d12BYb5HE76dYrUHIvKobJ,https://drive.google.com/file/d/1oe0wf7mXx6d12BYb5HE76dYrUHIvKobJ/view?usp=drive_link,bio_8.tif -1namB4QuAF_qO27VcoJKWw9cgKjrcaAqq,https://drive.google.com/file/d/1namB4QuAF_qO27VcoJKWw9cgKjrcaAqq/view?usp=drive_link,bio_9.tif -1U13MXDHUKAerTB0EzBC0cWQvuL6UOzja,https://drive.google.com/file/d/1U13MXDHUKAerTB0EzBC0cWQvuL6UOzja/view?usp=drive_link,depth_rock.tif -159hPV37wWd-MbYso7eOoupR5KflcqLtW,https://drive.google.com/file/d/159hPV37wWd-MbYso7eOoupR5KflcqLtW/view?usp=drive_link,eastness.tif -14qOehUoZHWNd_7ShILpnuKc1GFyrof_K,https://drive.google.com/file/d/14qOehUoZHWNd_7ShILpnuKc1GFyrof_K/view?usp=drive_link,northness.tif -1w-5b033IfM2zW3h9QD6X8w_tKgrtxFB-,https://drive.google.com/file/d/1w-5b033IfM2zW3h9QD6X8w_tKgrtxFB-/view?usp=drive_link,prec_1.tif -1CMaQqKWooCoBs1co719WPTcE7a_Nctk4,https://drive.google.com/file/d/1CMaQqKWooCoBs1co719WPTcE7a_Nctk4/view?usp=drive_link,prec_10.tif -1lCqZKMAFB7_jbQ_6b1qt40QXkB1stxfX,https://drive.google.com/file/d/1lCqZKMAFB7_jbQ_6b1qt40QXkB1stxfX/view?usp=drive_link,prec_11.tif -16gTlKj9MDbtZYYh0Bbbk1VytSZHD3K3h,https://drive.google.com/file/d/16gTlKj9MDbtZYYh0Bbbk1VytSZHD3K3h/view?usp=drive_link,prec_12.tif -1r656ha9OzAgvXxFb5MLQieo1DrOeqY48,https://drive.google.com/file/d/1r656ha9OzAgvXxFb5MLQieo1DrOeqY48/view?usp=drive_link,prec_2.tif -1t-kXJuLw8kXTRzJ0gyX6AX8qpBLq1Plf,https://drive.google.com/file/d/1t-kXJuLw8kXTRzJ0gyX6AX8qpBLq1Plf/view?usp=drive_link,prec_3.tif -1w74B1FLIw67YMcEtA_d9bc6Pn-ZaqsPM,https://drive.google.com/file/d/1w74B1FLIw67YMcEtA_d9bc6Pn-ZaqsPM/view?usp=drive_link,prec_4.tif -1EsnRGWUcibr4leJ6yWgdAa8rKSREtRQX,https://drive.google.com/file/d/1EsnRGWUcibr4leJ6yWgdAa8rKSREtRQX/view?usp=drive_link,prec_5.tif 
-1E5LDseUHNyOJiLRTD3bpu2Y4GSyp38aL,https://drive.google.com/file/d/1E5LDseUHNyOJiLRTD3bpu2Y4GSyp38aL/view?usp=drive_link,prec_6.tif -1CnfdKv-YVaCkjYVqD6xRm6vKH9-KYXN7,https://drive.google.com/file/d/1CnfdKv-YVaCkjYVqD6xRm6vKH9-KYXN7/view?usp=drive_link,prec_7.tif -1ZO9eCVbTZnHC7IcR36pdibeYXCL088Tl,https://drive.google.com/file/d/1ZO9eCVbTZnHC7IcR36pdibeYXCL088Tl/view?usp=drive_link,prec_8.tif -1935wdE2WNQvcAYvR1eWK29YwyQfNKgEp,https://drive.google.com/file/d/1935wdE2WNQvcAYvR1eWK29YwyQfNKgEp/view?usp=drive_link,prec_9.tif -1kOKakXHKCS3W-9BblHGxmw-756DckwmU,https://drive.google.com/file/d/1kOKakXHKCS3W-9BblHGxmw-756DckwmU/view?usp=drive_link,r_horizon.tif -1FhsbsjtOo8GfOeoFY7YXFRNs5e7JEMd1,https://drive.google.com/file/d/1FhsbsjtOo8GfOeoFY7YXFRNs5e7JEMd1/view?usp=drive_link,ref_depth.tif -1FJKXepMub_k67Cia19nboMO6lcT1i2Uw,https://drive.google.com/file/d/1FJKXepMub_k67Cia19nboMO6lcT1i2Uw/view?usp=drive_link,s_awc1.tif -1lqrQ0z9t34s9jDXQJB1vzt5j4ND1LECN,https://drive.google.com/file/d/1lqrQ0z9t34s9jDXQJB1vzt5j4ND1LECN/view?usp=drive_link,s_awc2.tif -1lTHEqXkpDa7ZHA7ZoNBHMnxkbuXxpO0D,https://drive.google.com/file/d/1lTHEqXkpDa7ZHA7ZoNBHMnxkbuXxpO0D/view?usp=drive_link,s_awc3.tif -1rL7lfRmB6ZOZ05_7DlRvkE9Lnt3-dGgI,https://drive.google.com/file/d/1rL7lfRmB6ZOZ05_7DlRvkE9Lnt3-dGgI/view?usp=drive_link,s_awcts.tif -1z5m20WzAaySOXSQHmhoe7CCnuvP13clR,https://drive.google.com/file/d/1z5m20WzAaySOXSQHmhoe7CCnuvP13clR/view?usp=drive_link,s_bs.tif -1C51EwjLZ5uXjyIHHtFqjv3_ZJg0uGtk4,https://drive.google.com/file/d/1C51EwjLZ5uXjyIHHtFqjv3_ZJg0uGtk4/view?usp=drive_link,s_bulk_dens.tif -1R6foXvUXg-beUBrClcStO_oLrlT5ST7O,https://drive.google.com/file/d/1R6foXvUXg-beUBrClcStO_oLrlT5ST7O/view?usp=drive_link,s_caco3.tif -1PwWIlCFn_OiiRAnbUPl1ZAs1XarYtvPi,https://drive.google.com/file/d/1PwWIlCFn_OiiRAnbUPl1ZAs1XarYtvPi/view?usp=drive_link,s_caso4.tif -1piG-PBTMHBCnnAAa9F-dRiURQ8sbhXiP,https://drive.google.com/file/d/1piG-PBTMHBCnnAAa9F-dRiURQ8sbhXiP/view?usp=drive_link,s_cec_clay.tif 
-171lvMcH3KsDUjQtcoshqviLirENLiHeD,https://drive.google.com/file/d/171lvMcH3KsDUjQtcoshqviLirENLiHeD/view?usp=drive_link,s_cec_soil.tif -1FkWiJzrDX5SgQEESxwTeyvsNgnwWpIoQ,https://drive.google.com/file/d/1FkWiJzrDX5SgQEESxwTeyvsNgnwWpIoQ/view?usp=drive_link,s_cecsol.tif -1EHyYvwWlDJNE7W3FaObILJIae_pLF_sl,https://drive.google.com/file/d/1EHyYvwWlDJNE7W3FaObILJIae_pLF_sl/view?usp=drive_link,s_clay.tif diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index eddae1d..3c3d0b7 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -1,3 +1,7 @@ +#'########################################################################### +#' + + ## Default flags for runtime environment RUNNING_ON_LUMI <- FALSE RUNNING_ON_DESTINE <- FALSE @@ -35,6 +39,7 @@ package_vec <- c( 'viridis', # colour palette 'bit64', 'iterators', + 'rvest', # to scrape google drive html 'gdalUtilities', # to download from SoilGrids (FUN.DownEV) # # Capfitogen SelectVar packages diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index cf2dd6c..2c2a9d4 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -451,7 +451,7 @@ FUN.DownBV <- function( FUN.DownCAPFITOGEN <- function(Dir = getwd(), Force = FALSE, - resample_to_match = FALSE) { + resample_to_match = NULL) { # define a file name FNAME <- file.path("Data/Environment/capfitogen.nc") @@ -472,42 +472,50 @@ FUN.DownCAPFITOGEN <- if (Force | !file.exists(FNAME)) { ## download data from Capfitogen Google drive ---- message("Start downloading 10x10 data from Capfitogen google drive.") - googledrive_data_files <- read.csv( - file.path( - Dir.Data.Capfitogen, - "capfitogen_world_data_googledrive_links.csv" - ) - ) - # message("installing gdown with pip install") - # system("pip install gdown") + # scrape Capfitogen's google drive to get direct download links (rdatamaps/world/10x10) + folder_id <- "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" + embedded_url <- 
paste0("https://drive.google.com/embeddedfolderview?id=", + folder_id, "#list") + + # Scrape the page + page <- read_html(embedded_url) + + # Extract tags (these contain filenames + links) + file_links <- page %>% html_nodes("a") %>% html_attr("href") + file_names <- page %>% html_nodes("a") %>% html_text() + tif_files <- data.frame(name = file_names, link = file_links) + + # Extract file ID from link + tif_files$file_id <- substr(tif_files$link, + start = 33, stop = 65) + tif_files$download_url <- paste0("https://drive.google.com/uc?export=download&id=", tif_files$file_id) + + message("installing gdown with pip install") + system("pip install gdown") + + # create directory to store tiff files dir.create(path = paste0(Dir, "/capfitogen"), showWarnings = FALSE) + # set long timeout to aoid interrupting downloads + options(timeout = 1000) + # download each file separately by google id - for (i in 1:nrow(googledrive_data_files)) { - capfitogen_data_url = paste0( - "https://drive.google.com/uc?export=download&id=", - googledrive_data_files$id[i] - ) - - file_name = googledrive_data_files$name[i] - + for (i in 1:nrow(tif_files)) { + file_name = tif_files$name[i] message(file_name) - cmd <- paste0("gdown ", capfitogen_data_url) - #cmd <- sprintf("gdown https://drive.google.com/uc?id=%s", id) - system(cmd) - # - # download.file( - # url = capfitogen_data_url, - # destfile = paste0(Dir, "/capfitogen/", googledrive_data_files$name[i]) - # ) + download.file( + url = tif_files$download_url[i], + destfile = paste0(Dir, "/capfitogen/", tif_files$name[i]) + ) } - # file_ids <- readLines("file_ids.txt") - + # list the downloaded files file_list <- list.files(paste0(Dir.Data.Envir, "/capfitogen")) + message("downloaded files:") + print(file_list) # read in and format rasters one by one from file name rasters <- NULL @@ -516,12 +524,13 @@ FUN.DownCAPFITOGEN <- raster_i <- rast(file_path_i) # rename raster names(raster_i) <- googledrive_data_files$name[i] + message(names(raster_i)) 
# resample # if provided, resample to match another raster object's origin and resolution - if (resample_to_match == FALSE) { - message("No resampling") - } else { + if (is.null(resample_to_match)) { + message("No resampling.") + } else if (inherits(resample_to_match, "SpatRaster")) { message(paste0("Resampling rasters to match ", names(resample_to_match))) resample_to_match <- rast(resample_to_match) @@ -532,11 +541,16 @@ FUN.DownCAPFITOGEN <- terra::crs(raster_i) <- projection_to_match raster_i <- terra::resample(raster_i, resample_to_match) + } else { + stop("Invalid input for resample_to_match. Must be a SpatRaster or NULL.") } + + message(paste("adding raster", names(raster_i))) + rasters <- c(rasters, raster_i) } - message(paste("adding raster", names(raster_i))) - rasters <- c(rasters, raster_i) + typeof(rasters) + str(rasters) # save rasters as a NetCDF file message(paste0("saving as netCDF:", FNAME)) @@ -547,122 +561,153 @@ FUN.DownCAPFITOGEN <- # WORLD DATABASE ON PROTECTED AREAS --------------------------------------- #' UNEP-WCMC and IUCN (2025), Protected Planet: -#' The World Database on Protected Areas (WDPA) [Online], February 2025, +#' The World Database on Protected Areas (WDPA) [Online], #' Cambridge, UK: UNEP-WCMC and IUCN. Available at: www.protectedplanet.net. 
-#' https://www.protectedplanet.net/en/thematic-areas/wdpa&ved=2ahUKEwjA4fPhltyLAxVkJBAIHfdOEasQFnoECBUQAQ&usg=AOvVaw0eVrEFsb0_TP4UIl2am3Za -FUN.DownWDPA <- function(wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", - wdpa_destination = file.path(Dir.Capfitogen.WDPA, "WDPA_Feb2025_Public_shp.zip"), - Force = FALSE) { +#' https://www.protectedplanet.net/en/thematic-areas/wdpa +FUN.DownWDPA <- function(wdpa_url = FALSE, + wdpa_destination = file.path(Dir.Capfitogen.WDPA, "WDPA_Feb2025_Public_shp.zip"), + Force = FALSE) { + # get the current month and year (from system time) + MmmYYYY <- format(Sys.Date(), "%b%Y") + # set a path to wdpa shapefiles wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") - + # define the file name of global wdpa shapefile to be created FNAME <- file.path(wdpa_path, "global_wdpa_polygons.gpkg") - + # check if the final wdpa file already exists and whether to overwrite if (!Force & file.exists(FNAME)) { message(paste0("A global wdpa file with polygons exists already: ", FNAME)) } else { if (!file.exists(wdpa_destination)) { - # download if Force = TRUE or the file isn't already there - message("downloading zipped WDPA shapefiles, ca 4GB") - # set long timeout to avoid interrupting download - options(timeout = 1000) - # download the zipped files - download.file(url = wdpa_url, - destfile = wdpa_destination, - cacheOK = FALSE) - } - - # unzip files - message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) - unzip(zipfile = wdpa_destination, exdir = Dir.Capfitogen.WDPA) - - # unzip split shapefile downloads - message("unzipping shapefiles split in download") - - shapefile_names <- c( - "WDPA_Feb2025_Public_shp-points.cpg", - "WDPA_Feb2025_Public_shp-points.dbf", - "WDPA_Feb2025_Public_shp-points.prj", - "WDPA_Feb2025_Public_shp-points.shp", - "WDPA_Feb2025_Public_shp-points.shx", - "WDPA_Feb2025_Public_shp-polygons.cpg", - "WDPA_Feb2025_Public_shp-polygons.dbf", - 
"WDPA_Feb2025_Public_shp-polygons.prj", - "WDPA_Feb2025_Public_shp-polygons.shp", - "WDPA_Feb2025_Public_shp-polygons.shx" - ) - - shapefile_paths <- file.path(wdpa_path, shapefile_names) - - # loop over zip directories with parts of the global data (numbered 0, 1, 2) - for (i in 0:2) { - # define name of the current directory to be unzipped - zipfilename <- - file.path(Dir.Capfitogen.WDPA, - paste0("WDPA_Feb2025_Public_shp_", i, ".zip")) - - # unzip the directory containing shapefiles - unzip(zipfile = zipfilename, exdir = wdpa_path) - message(paste0("unzipped ", zipfilename, "\nto ", wdpa_path)) - - # rename shapefiles with numbers to prevent overwriting them - new_shapefile_names <- file.path(wdpa_path, paste0(i, "_", shapefile_names)) - file.rename(from = shapefile_paths, to = new_shapefile_names) - } - - # delete unnecessary files - message("deleting redundant files (translations etc.)") - files_to_keep <- c( - wdpa_path, - wdpa_destination, - file.path(Dir.Capfitogen.WDPA, "WDPA_sources_Feb2025.csv") - ) - - files_to_delete <- - list.files(Dir.Capfitogen.WDPA, full.names = TRUE)[list.files(Dir.Capfitogen.WDPA, full.names = TRUE) %nin% files_to_keep] - file.remove(files_to_delete, recursive = TRUE) - - # prepare list of shapefiles to be combined - wdpa_polygon_shapefiles <- - # list polygon shapefiles in WDPA directory - substr(unique(sub("\\..*", "", list.files(wdpa_path)[grep(pattern = "polygon", x = shapefile_names)])), 3, 34) - - shapefile_list <- list() - - for (i in 0:2) { - # read in all the polygon shapefile layers - layer_name = paste0(i, "_", wdpa_polygon_shapefiles) - shapefile_list[[i + 1]] <- - read_sf(dsn = wdpa_path, layer = layer_name) + # check if a wdpa url is provided + if (wdpa_url == FALSE) { + message("please provide a valid url for download. + Download links change monthly, and follow the format + https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Apr2025_Public_shp.zip. 
+ See https://www.protectedplanet.net/en/thematic-areas/wdpa?tab=WDPA") + } else { + # start download + message("downloading zipped WDPA shapefiles, ca 4GB") + # set long timeout to avoid interrupting download + options(timeout = 10000) + # download the zipped files + download.file( + url = wdpa_url, + destfile = wdpa_destination, + cacheOK = FALSE + ) + } + + # unzip files + message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) + unzip(zipfile = wdpa_destination, exdir = Dir.Capfitogen.WDPA) + + # unzip split shapefile downloads + message("unzipping shapefiles split in download") + + shapefile_names <- c( + paste0("WDPA_", MmmYYYY, "_Public_shp-points.cpg"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.dbf"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.prj"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.shp"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.shx"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.cpg"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.dbf"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.prj"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.shp"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.shx") + ) + + shapefile_paths <- file.path(wdpa_path, shapefile_names) + + # loop over zip directories with parts of the global data (numbered 0, 1, 2) + for (i in 0:2) { + # define name of the current directory to be unzipped + zipfilename <- + file.path( + Dir.Capfitogen.WDPA, + paste0("WDPA_", MmmYYYY, "_Public_shp_", i, ".zip") + ) + + # unzip the directory containing shapefiles + unzip(zipfile = zipfilename, exdir = wdpa_path) + message(paste0("unzipped ", zipfilename, "\nto ", wdpa_path)) + + # rename shapefiles with numbers to prevent overwriting them + new_shapefile_names <- file.path(wdpa_path, paste0(i, "_", shapefile_names)) + file.rename(from = shapefile_paths, to = new_shapefile_names) + } + + # delete unnecessary files + message("deleting redundant files (translations etc.)") + files_to_keep <- c( + 
wdpa_path, + wdpa_destination, + file.path( + Dir.Capfitogen.WDPA, + paste0("WDPA_sources_", MmmYYYY, ".csv") + ) + ) + + files_to_delete <- + list.files(Dir.Capfitogen.WDPA, + full.names = TRUE + )[list.files(Dir.Capfitogen.WDPA, + full.names = TRUE + ) %nin% files_to_keep] + file.remove(files_to_delete, recursive = TRUE) + + # prepare list of shapefiles to be combined + wdpa_polygon_shapefiles <- + # list polygon shapefiles in WDPA directory + substr( + unique(sub( + "\\..*", "", + list.files(wdpa_path)[grep( + pattern = "polygon", + x = shapefile_names + )] + )), + 3, 34 + ) + + shapefile_list <- list() + + for (i in 0:2) { + # read in all the polygon shapefile layers + layer_name <- paste0(i, "_", wdpa_polygon_shapefiles) + shapefile_list[[i + 1]] <- + read_sf(dsn = wdpa_path, layer = layer_name) + } + + # merge parts into one global shapefile + message("combining parts of the WDPA shapefile. This can take a while ---") + wdpa <- do.call(rbind, shapefile_list) + message("Complete WDPA successfully combined.") + + # wdpa$WDPAID <- as.character(wdpa$WDPAID) + # wdpa$text_field <- iconv(wdpa$text_field, to = "ASCII//TRANSLIT") + + # save complete wdpa + message("save as GeoPackage") + st_write(wdpa, file.path(wdpa_path, "global_wdpa_polygons.gpkg")) + message(paste0( + "global WDPA saved as: ", + file.path(wdpa_path, "global_wdpa_polygons.gpkg") + )) + + message("save as shapefile") + # st_write(wdpa, FNAME) + st_write( + wdpa, + "global_wdpa_polygons.shp", + layer_options = "ENCODING=UTF-8", + field_type = c(WDPAID = "Character") + ) + message("global WDPA saved as global_wdpa_polygons.shp") } - - # merge parts into one global shapefile - message("combining parts of the WDPA shapefile. 
This can take a while ---") - wdpa <- do.call(rbind, shapefile_list) - message("Complete WDPA successfully combined.") - - # wdpa$WDPAID <- as.character(wdpa$WDPAID) - # wdpa$text_field <- iconv(wdpa$text_field, to = "ASCII//TRANSLIT") - - # save complete wdpa - message("save as GeoPackage") - st_write(wdpa, file.path(wdpa_path, "global_wdpa_polygons.gpkg")) - message(paste0( - "global WDPA saved as: ", - file.path(wdpa_path, "global_wdpa_polygons.gpkg") - )) - - message("save as shapefile") - #st_write(wdpa, FNAME) - st_write( - wdpa, - "global_wdpa_polygons.shp", - layer_options = "ENCODING=UTF-8", - field_type = c(WDPAID = "Character") - ) - message("global WDPA saved as global_wdpa_polygons.shp") } } diff --git a/R_scripts/legacy-data-download.R b/R_scripts/legacy-data-download.R index 3e731a8..053a6e2 100644 --- a/R_scripts/legacy-data-download.R +++ b/R_scripts/legacy-data-download.R @@ -291,3 +291,53 @@ FUN.DownGV <- ##' (CIAT), available from http://srtm.csi.cgiar.org. #dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") + +# Bioclim data from ModGP, cut out from capfitogen_master.R ------------------- +### Bioclomatic data ------ +##' 19 BioClim variables +##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) +##' is each file of each variable >20GB? +##' Will this download Global Multi-resolution Terrain Elevation Data +##' (GMTED2010) as well? +##' Temporal coverage: January 1950 to present ? 
+##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview +message("Downloading new or loading existing 19 BioClim bioclimatic variables") + +# Check for existing BioClim data file +existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") +if (file.exists(existing_bioclim_file)) { + message("Using existing BioClim data") + bioclim_variables <- terra::rast(existing_bioclim_file) +} else { + bioclim_variables <- FUN.DownBV( + T_Start = 1999, # what year to begin climatology calculation? + T_End = 1999, # what year to end climatology calculation? + Dir = Dir.Data.Envir, # where to store the data output on disk + Force = FALSE # do not overwrite already present data + ) +} + +# make sure the BioClim data have the correct names +BioClim_names <- c( + # see https://www.worldclim.org/data/bioclim.html + "BIO1_Annual_Mean_Temperature", + "BIO2_Mean_Diurnal_Range", + "BIO3_Isothermality", + "BIO4_Temperature_Seasonality", + "BIO5_Max_Temperature_of_Warmest_Month", + "BIO6_Min_Temperature_of_Coldest_Month", + "BIO7_Temperature_Annual_Range", + "BIO8_Mean_Temperature_of_Wettest_Quarter", + "BIO9_Mean_Temperature_of_Driest_Quarter", + "BIO10_Mean_Temperature_of_Warmest_Quarter", + "BIO11_Mean_Temperature_of_Coldest_Quarter", + "BIO12_Annual_Precipitation", + "BIO13_Precipitation_of_Wettest_Month", + "BIO14_Precipitation_of_Driest_Month", + "BIO15_Precipitation_Seasonality", + "BIO16_Precipitation_of_Wettest_Quarter", + "BIO17_Precipitation_of_Driest_Quarter", + "BIO18_Precipitation_of_Warmest_Quarter", + "BIO19_Precipitation_of_Coldest_Quarter") +names(bioclim_variables) <- BioClim_names + diff --git a/capfitogen_master.R b/capfitogen_master.R index 35473bd..fa33735 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -102,76 +102,41 @@ Species_ls <- FUN.DownGBIF( parallel = 1 # no speed gain here for parallelising on personal machine ) -## Environmental Data ----------------------------------------------------- -# 
### Bioclomatic data ------ -# ##' 19 BioClim variables -# ##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) -# ##' is each file of each variable >20GB? -# ##' Will this download Global Multi-resolution Terrain Elevation Data -# ##' (GMTED2010) as well? -# ##' Temporal coverage: January 1950 to present ? -# ##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview -# message("Downloading new or loading existing 19 BioClim bioclimatic variables") -# -# # Check for existing BioClim data file -# existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") -# if (file.exists(existing_bioclim_file)) { -# message("Using existing BioClim data") -# bioclim_variables <- terra::rast(existing_bioclim_file) -# } else { -# bioclim_variables <- FUN.DownBV( -# T_Start = 1999, # what year to begin climatology calculation? -# T_End = 1999, # what year to end climatology calculation? -# Dir = Dir.Data.Envir, # where to store the data output on disk -# Force = FALSE # do not overwrite already present data -# ) -# } -# -# # make sure the BioClim data have the correct names -# BioClim_names <- c( -# # see https://www.worldclim.org/data/bioclim.html -# "BIO1_Annual_Mean_Temperature", -# "BIO2_Mean_Diurnal_Range", -# "BIO3_Isothermality", -# "BIO4_Temperature_Seasonality", -# "BIO5_Max_Temperature_of_Warmest_Month", -# "BIO6_Min_Temperature_of_Coldest_Month", -# "BIO7_Temperature_Annual_Range", -# "BIO8_Mean_Temperature_of_Wettest_Quarter", -# "BIO9_Mean_Temperature_of_Driest_Quarter", -# "BIO10_Mean_Temperature_of_Warmest_Quarter", -# "BIO11_Mean_Temperature_of_Coldest_Quarter", -# "BIO12_Annual_Precipitation", -# "BIO13_Precipitation_of_Wettest_Month", -# "BIO14_Precipitation_of_Driest_Month", -# "BIO15_Precipitation_Seasonality", -# "BIO16_Precipitation_of_Wettest_Quarter", -# "BIO17_Precipitation_of_Driest_Quarter", -# "BIO18_Precipitation_of_Warmest_Quarter", -# "BIO19_Precipitation_of_Coldest_Quarter") -# 
names(bioclim_variables) <- BioClim_names - -### CAPFITOGEN data ------ +## Environmental Data (CAPFITOGEN) -------------------------------------------- +# make a template raster to resample to +template_raster <- rast(nrows=1800, ncols=4320, nlyr=1) +values(template_raster) <- rnorm(ncell(template_raster),1,1) +ext(template_raster) <- +print(ext(example_r)) + # download the default data from CAPFITOGEN. -FUN.DownCAPFITOGEN( +all_predictors <- FUN.DownCAPFITOGEN( Dir = Dir.Data.Envir, Force = FALSE, - resample_to_match = FALSE # bioclim_variables[[1]] + resample_to_match = template_raster ) ## Protected areas database ----------------------------------------------- #' download shapefiles for protected areas to overlay with Complementa tool. #' The FUN.DownWDPA function will save the file to a folder, but not load it #' into RStudio as an object. +MmmYYYY <- format(Sys.Date(), "%b%Y") FUN.DownWDPA( # download from url: - wdpa_url = "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Feb2025_Public_shp.zip", + wdpa_url = paste0( + "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_", + MmmYYYY, "_Public_shp.zip"), # save the downloaded zipfile as: wdpa_destination = file.path(Dir.Capfitogen.WDPA, - "WDPA_Feb2025_Public_shp.zip"), + paste0("WDPA_", + MmmYYYY, + "_Public_shp.zip")), # do not overwrite existing data Force = FALSE) +# TEMP check wdpa files +wdpa <- read_sf("Data/Capfitogen/capfitogen_world_data_googledrive_links.csv") + # if supplied, crop all the data to a map of native species range @@ -205,17 +170,18 @@ write.table(Species_ls[["occs"]], ## Variable selection --------------------------------------------------------- message("running variable selection") -# combine variables -all_predictors <- c(bioclim_variables, - #edaphic_variables, # Error in xcor[mx[1], mx[2]] : subscript out of bounds / In addition: Warning message: / [spatSample] fewer values returned than requested - geophysical_variables) + +# predefine list of variables to keep 
+predictors_to_keep <- c( + +) # run variable selection based on variable inflation factor usdm::vif predictor_vifs <- vifcor( - all_predictors,# replace with either BV, EV, GV to run separately per type + all_predictors, th = 0.8, # threshold of correlation - keep = NULL, # if wanted, list variables to keep no matter what + keep = NULL, # if wanted, vector of variables to keep no matter what size = 5000, # subset size in case of big data (default 5000) method = "pearson" # 'pearson','kendall','spearman' ) From 5838d5a689f067ca8f753e1ffa07b22b77c47966 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 15 Apr 2025 13:31:43 +0200 Subject: [PATCH 49/72] draft cropping code, fix bug in data download --- R_scripts/SHARED-Data.R | 8 +++++--- capfitogen_master.R | 30 ++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 2c2a9d4..b4f0f09 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -506,10 +506,12 @@ FUN.DownCAPFITOGEN <- for (i in 1:nrow(tif_files)) { file_name = tif_files$name[i] message(file_name) + if (!file.exists(paste0(Dir, "/capfitogen/", tif_files$name[i]))){ download.file( url = tif_files$download_url[i], destfile = paste0(Dir, "/capfitogen/", tif_files$name[i]) - ) + ) + } } # list the downloaded files @@ -523,7 +525,7 @@ FUN.DownCAPFITOGEN <- file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) raster_i <- rast(file_path_i) # rename raster - names(raster_i) <- googledrive_data_files$name[i] + names(raster_i) <- tif_files$name[i] message(names(raster_i)) # resample @@ -540,7 +542,7 @@ FUN.DownCAPFITOGEN <- terra::crs(raster_i) <- projection_to_match - raster_i <- terra::resample(raster_i, resample_to_match) + try(raster_i <- terra::resample(raster_i, resample_to_match)) } else { stop("Invalid input for resample_to_match. 
Must be a SpatRaster or NULL.") } diff --git a/capfitogen_master.R b/capfitogen_master.R index fa33735..45608e5 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -135,10 +135,36 @@ FUN.DownWDPA( Force = FALSE) # TEMP check wdpa files -wdpa <- read_sf("Data/Capfitogen/capfitogen_world_data_googledrive_links.csv") +wdpa <- read_sf(file.path(Dir.Capfitogen.WDPA, + "wdpa", "global_wdpa_polygons.gpkg")) # if supplied, crop all the data to a map of native species range - +crop_to_native_range <- function( + Dir = getwd(), + Force = TRUE, + native_range_map = NULL) { + # define a file name + FNAME <- file.path("Data/Environment/capfitogen_cropped.nc") + + # check if cropped data already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + message( + paste0(FNAME, "exists already. It has been loaded from the disk. + If you wish to override the present data, please specify Force = TRUE")) + capfitogen_cropped <- rast(FNAME) + return(capfitogen_cropped) + } else if (!file.exists(FNAME)) { + # if Force = TRUE or there is no preexisting data, + # check that the native range map is supplied and valid + if (!is.null(native_range_map)) { + # get the extent of the supplied native range + + # crop the environmental data to the native range + + # save the cropped version + } + } +} # CAPFITOGEN pipeline ========================================================= message(paste("------------------------------", From 877ed79686acc5a39066dcb520cfca1c8d20d860 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 23 Apr 2025 08:09:54 +0200 Subject: [PATCH 50/72] fix template raster --- R_scripts/SHARED-Data.R | 15 ++++++++------- capfitogen_master.R | 11 ++++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index b4f0f09..48cd514 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -445,7 +445,6 @@ FUN.DownBV <- function( BV_ras } - # CAPFITOGEN DATA DOWNLOAD 
---------------------------------------------------- # Download the standard data from CAPFITOGEN for the globe. FUN.DownCAPFITOGEN <- @@ -474,7 +473,7 @@ FUN.DownCAPFITOGEN <- message("Start downloading 10x10 data from Capfitogen google drive.") # scrape Capfitogen's google drive to get direct download links (rdatamaps/world/10x10) - folder_id <- "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" + folder_id <- "1kPb27NnJyh7HKt774okYE_dWSCjS8--a" # "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" embedded_url <- paste0("https://drive.google.com/embeddedfolderview?id=", folder_id, "#list") @@ -519,7 +518,7 @@ FUN.DownCAPFITOGEN <- message("downloaded files:") print(file_list) - # read in and format rasters one by one from file name + ## read in and format rasters one by one from file name ---- rasters <- NULL for (i in 1:length(file_list)) { file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) @@ -528,12 +527,13 @@ FUN.DownCAPFITOGEN <- names(raster_i) <- tif_files$name[i] message(names(raster_i)) - # resample + ## resample ---- # if provided, resample to match another raster object's origin and resolution if (is.null(resample_to_match)) { message("No resampling.") } else if (inherits(resample_to_match, "SpatRaster")) { - message(paste0("Resampling rasters to match ", names(resample_to_match))) + message(paste("Resampling", names(raster_i), + "to match", names(resample_to_match))) resample_to_match <- rast(resample_to_match) @@ -542,7 +542,7 @@ FUN.DownCAPFITOGEN <- terra::crs(raster_i) <- projection_to_match - try(raster_i <- terra::resample(raster_i, resample_to_match)) + tryCatch(raster_i <- terra::resample(raster_i, resample_to_match)) } else { stop("Invalid input for resample_to_match. 
Must be a SpatRaster or NULL.") } @@ -554,7 +554,8 @@ FUN.DownCAPFITOGEN <- typeof(rasters) str(rasters) - # save rasters as a NetCDF file + ## save rasters ---- + saveRDS(rasters, filename = "Data/Environment/capfitogen.RData") message(paste0("saving as netCDF:", FNAME)) terra::writeCDF(rasters, filename = FNAME, overwrite = FALSE) rasters diff --git a/capfitogen_master.R b/capfitogen_master.R index 45608e5..9f0f7e0 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -104,15 +104,15 @@ Species_ls <- FUN.DownGBIF( ## Environmental Data (CAPFITOGEN) -------------------------------------------- # make a template raster to resample to -template_raster <- rast(nrows=1800, ncols=4320, nlyr=1) -values(template_raster) <- rnorm(ncell(template_raster),1,1) -ext(template_raster) <- -print(ext(example_r)) +template_raster <- rast(nrows = 1800, + ncols = 4320, + nlyr = 1) +crs(template_raster) <- "epsg:4326" # WGS84 - World Geodetic System 1984 # download the default data from CAPFITOGEN. 
all_predictors <- FUN.DownCAPFITOGEN( Dir = Dir.Data.Envir, - Force = FALSE, + Force = TRUE, resample_to_match = template_raster ) @@ -138,6 +138,7 @@ FUN.DownWDPA( wdpa <- read_sf(file.path(Dir.Capfitogen.WDPA, "wdpa", "global_wdpa_polygons.gpkg")) +## Crop extent ---- # if supplied, crop all the data to a map of native species range crop_to_native_range <- function( Dir = getwd(), From eac72977a7b354801cb33650dafca689eadee727 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 23 Apr 2025 19:02:12 +0200 Subject: [PATCH 51/72] fix raster stacking, use workaround for capfitogen data --- R_scripts/SHARED-Data.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 48cd514..edaa768 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -473,7 +473,7 @@ FUN.DownCAPFITOGEN <- message("Start downloading 10x10 data from Capfitogen google drive.") # scrape Capfitogen's google drive to get direct download links (rdatamaps/world/10x10) - folder_id <- "1kPb27NnJyh7HKt774okYE_dWSCjS8--a" # "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" + folder_id <- "1NdeKwneRNtJmdGc26mPKX1tWezraAt7m" # 10x10:"1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" # for 20x20: "1kPb27NnJyh7HKt774okYE_dWSCjS8--a" embedded_url <- paste0("https://drive.google.com/embeddedfolderview?id=", folder_id, "#list") @@ -520,7 +520,7 @@ FUN.DownCAPFITOGEN <- ## read in and format rasters one by one from file name ---- rasters <- NULL - for (i in 1:length(file_list)) { + for (i in 1:10) { # length(file_list) file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) raster_i <- rast(file_path_i) # rename raster @@ -551,14 +551,13 @@ FUN.DownCAPFITOGEN <- rasters <- c(rasters, raster_i) } - typeof(rasters) - str(rasters) + # convert list of rasters to a stack + rasters <- rast(rasters) ## save rasters ---- - saveRDS(rasters, filename = "Data/Environment/capfitogen.RData") message(paste0("saving as netCDF:", FNAME)) 
terra::writeCDF(rasters, filename = FNAME, overwrite = FALSE) - rasters + return(rasters) } } From f4a00be392fd7f000e0bf85f5b6d820b94a4778e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 24 Apr 2025 11:33:54 +0200 Subject: [PATCH 52/72] fix extent error --- R_scripts/SHARED-Data.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index edaa768..4de6931 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -473,7 +473,7 @@ FUN.DownCAPFITOGEN <- message("Start downloading 10x10 data from Capfitogen google drive.") # scrape Capfitogen's google drive to get direct download links (rdatamaps/world/10x10) - folder_id <- "1NdeKwneRNtJmdGc26mPKX1tWezraAt7m" # 10x10:"1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" # for 20x20: "1kPb27NnJyh7HKt774okYE_dWSCjS8--a" + folder_id <- "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" # for 20x20: "1kPb27NnJyh7HKt774okYE_dWSCjS8--a" embedded_url <- paste0("https://drive.google.com/embeddedfolderview?id=", folder_id, "#list") @@ -520,13 +520,16 @@ FUN.DownCAPFITOGEN <- ## read in and format rasters one by one from file name ---- rasters <- NULL - for (i in 1:10) { # length(file_list) + for (i in 1:length(file_list)) { file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) raster_i <- rast(file_path_i) # rename raster names(raster_i) <- tif_files$name[i] message(names(raster_i)) + # there is a problem with some of the rasters. 
Try changing the extent + ext(raster_i) <- c(-180, 180, -60, 90) + ## resample ---- # if provided, resample to match another raster object's origin and resolution if (is.null(resample_to_match)) { @@ -556,7 +559,7 @@ FUN.DownCAPFITOGEN <- ## save rasters ---- message(paste0("saving as netCDF:", FNAME)) - terra::writeCDF(rasters, filename = FNAME, overwrite = FALSE) + terra::writeCDF(rasters, filename = FNAME, overwrite = TRUE) return(rasters) } } From 21b20f9fc9c70f95cf4de04209abe1a368fc7cf8 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Thu, 24 Apr 2025 11:34:21 +0200 Subject: [PATCH 53/72] update cropping draft --- capfitogen_master.R | 77 +++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index 9f0f7e0..8705a32 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -116,7 +116,7 @@ all_predictors <- FUN.DownCAPFITOGEN( resample_to_match = template_raster ) -## Protected areas database ----------------------------------------------- +## Protected areas database --------------------------------------------------- #' download shapefiles for protected areas to overlay with Complementa tool. #' The FUN.DownWDPA function will save the file to a folder, but not load it #' into RStudio as an object. @@ -134,39 +134,70 @@ FUN.DownWDPA( # do not overwrite existing data Force = FALSE) -# TEMP check wdpa files -wdpa <- read_sf(file.path(Dir.Capfitogen.WDPA, - "wdpa", "global_wdpa_polygons.gpkg")) - -## Crop extent ---- -# if supplied, crop all the data to a map of native species range +## Crop extents --------------------------------------------------------------- +# if supplied, crop all the data to a map of native species range. 
## Crop extents ---------------------------------------------------------------
# if supplied, crop all the data to a map of native species range.

#' Crop the CAPFITOGEN predictor stack and the global WDPA polygons to the
#' extent of a supplied native range map, caching both results on disk.
#'
#' @param Dir directory containing the "wdpa" folder with
#'   "global_wdpa_polygons.gpkg" (default: current working directory).
#' @param Force overwrite previously cropped files? If FALSE and both cached
#'   files exist, they are re-loaded from disk instead of recomputed.
#' @param native_range_map anything terra::rast() can read (file path or
#'   raster object); its extent defines the crop window. Required unless
#'   cached cropped data already exist.
#' @param predictors SpatRaster stack to crop; defaults to the global
#'   `all_predictors` created earlier in this script, so existing callers
#'   keep working. Pass it explicitly to avoid the global dependency.
#' @return list(env = cropped SpatRaster, wdpa = cropped sf object)
crop_to_native_range <- function(
    Dir = getwd(),
    Force = TRUE,
    native_range_map = NULL,
    predictors = all_predictors) {

  # define file names for the cached, cropped outputs
  FNAME_env <- file.path("Data/Environment/capfitogen_cropped.nc")
  FNAME_wdpa <- file.path("Capfitogen/wdpa/wdpa_cropped.gpkg")

  # reuse cached data unless the caller forces a rebuild
  # (scalar condition, so use && rather than the vectorised &)
  if (!Force && file.exists(FNAME_env) && file.exists(FNAME_wdpa)) {
    message(paste0(
      FNAME_env, " and ", FNAME_wdpa,
      " exist already. They have been loaded from the disk. ",
      "If you wish to override the present data, ",
      "please specify Force = TRUE"
    ))
    return(list(env = rast(FNAME_env), wdpa = read_sf(FNAME_wdpa)))
  }

  # without a native range map there is nothing to crop to
  if (is.null(native_range_map)) {
    stop("native_range_map must be provided when Force is TRUE ",
         "or data does not exist.", call. = FALSE)
  }

  tryCatch({
    # load the native range map and validate its extent
    native_range_raster <- rast(native_range_map)
    native_range_extent <- ext(native_range_raster)

    if (is.null(native_range_extent) ||
        any(is.na(c(xmin(native_range_extent), xmax(native_range_extent),
                    ymin(native_range_extent), ymax(native_range_extent))))) {
      stop("native_range_map does not have a valid extent.")
    }

    # crop and save the environmental data
    message("cropping environmental data")
    all_predictors_cropped <- terra::crop(predictors, native_range_extent)
    writeCDF(all_predictors_cropped, FNAME_env)

    # crop and save protected areas; the WDPA layer is vector (sf) data,
    # so crop with sf::st_crop rather than terra::crop
    message("cropping protected areas")
    wdpa <- read_sf(file.path(Dir, "wdpa", "global_wdpa_polygons.gpkg"))
    crop_bbox <- sf::st_bbox(
      c(xmin = terra::xmin(native_range_extent),
        ymin = terra::ymin(native_range_extent),
        xmax = terra::xmax(native_range_extent),
        ymax = terra::ymax(native_range_extent)),
      crs = sf::st_crs(wdpa)
    )
    wdpa_cropped <- sf::st_crop(wdpa, crop_bbox)
    st_write(wdpa_cropped, FNAME_wdpa)

    list(env = all_predictors_cropped, wdpa = wdpa_cropped)
  }, error = function(e) {
    stop("Error reading or processing native_range_map: ",
         conditionMessage(e), call. = FALSE)
  })
}

# load native species range map (NULL keeps the global extent)
range_map <- NULL

# apply cropping function
if (is.null(range_map)) {
  message("global extent")
} else {
  message(paste("cropping to native range map:", names(range_map)))
  crop_to_native_range(
    Dir = Dir.Base,
    Force = FALSE,
    native_range_map = range_map
  )
}
# CAPFITOGEN DATA DOWNLOAD ----------------------------------------------------

## Define a function for downloading data from google drives ----
#' Download one file from a public Google Drive folder by file id.
#'
#' Google Drive interposes an HTML confirmation page ("can't scan for
#' viruses") in front of large files; this helper fetches the confirm
#' token (if any) with a cookie-carrying request and then streams the
#' file to disk with curl.
#'
#' @param file_id Google Drive file id.
#' @param dest_path destination path on disk.
#' @return invisibly, dest_path.
download_from_google_drive <- function(file_id, dest_path) {
  if (!requireNamespace("curl", quietly = TRUE)) {
    stop("Package 'curl' is required for Google Drive downloads.",
         call. = FALSE)
  }

  # temporary cookie jar so the confirm token survives between requests;
  # clean it up even if the download errors out
  tmp_cookie <- tempfile()
  on.exit(unlink(tmp_cookie), add = TRUE)

  # first request: either the file itself or an HTML confirmation page
  url1 <- paste0("https://drive.google.com/uc?export=download&id=", file_id)

  h <- curl::new_handle()
  curl::handle_setopt(h, cookiefile = tmp_cookie, cookiejar = tmp_cookie)

  con <- curl::curl(url1, "rb", handle = h)
  # close the connection even if readLines() fails mid-stream
  page <- tryCatch(readLines(con, warn = FALSE), finally = close(con))

  # look for a confirm token in the returned page (absent for small files)
  confirm <- regmatches(page, regexpr("confirm=([0-9A-Za-z_]+)", page))
  confirm <- sub("confirm=", "", confirm)

  if (length(confirm) > 0) {
    # token found: repeat the request with the confirmation parameter;
    # use only the first token in case several page lines matched
    url2 <- paste0(
      "https://drive.google.com/uc?export=download&confirm=",
      confirm[1], "&id=", file_id
    )
    curl::curl_download(url2, destfile = dest_path, handle = h, mode = "wb")
  } else {
    # no token: the first URL serves the file directly
    curl::curl_download(url1, destfile = dest_path, handle = h, mode = "wb")
  }

  invisible(dest_path)
}

## Define download function for the standard data from CAPFITOGEN (world) ----
#' Download the CAPFITOGEN "rdatamaps/world/10x10" GeoTIFFs from the
#' project's public Google Drive folder, optionally resample every layer
#' onto a template raster, stack them, and cache the stack as netCDF.
#'
#' @param Dir directory under which a "capfitogen" folder is created for
#'   the raw GeoTIFF downloads.
#' @param Force re-download and rebuild even if the netCDF cache exists?
#' @param resample_to_match NULL (no resampling) or a SpatRaster whose
#'   grid and CRS every downloaded layer is resampled to.
#' @return SpatRaster stack of all downloaded (and resampled) layers.
FUN.DownCAPFITOGEN <-
  function(Dir = getwd(),
           Force = FALSE,
           resample_to_match = NULL) {
    # define a file name for the cached output
    FNAME <- file.path("Data/Environment/capfitogen.nc")

    # reuse the cache unless the caller forces a rebuild
    # (scalar condition, so && rather than the vectorised &)
    if (!Force && file.exists(FNAME)) {
      capfitogen_rasters <- rast(FNAME)
      message(paste0(
        FNAME, " exists already and has been loaded from the disk. ",
        "If you wish to override the present data, ",
        "please specify Force = TRUE"
      ))
      return(capfitogen_rasters)
    }

    ### download data from Capfitogen Google drive ----
    # https://drive.google.com/drive/folders/1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u
    message("Start downloading 10x10 data from Capfitogen google drive.")

    # scrape the public folder listing to get file names and ids
    folder_id <- "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u"
    embedded_url <- paste0(
      "https://drive.google.com/embeddedfolderview?id=",
      folder_id, "#list"
    )

    # Scrape the page
    page <- read_html(embedded_url)

    # Extract anchor tags (these contain filenames + links)
    file_links <- page %>%
      html_nodes("a") %>%
      html_attr("href")
    file_names <- page %>%
      html_nodes("a") %>%
      html_text()

    tif_files <- data.frame(name = file_names, link = file_links)

    # Extract file ID from link. NOTE(review): a fixed character offset is
    # fragile against Drive URL changes -- a regex on "/file/d/<id>/" would
    # be more robust.
    tif_files$file_id <- substr(tif_files$link, start = 33, stop = 65)

    # create directory to store tiff files
    dir.create(file.path(Dir, "capfitogen"), showWarnings = FALSE)

    # download each file separately by google id, skipping existing files
    for (i in seq_len(nrow(tif_files))) {
      dest_path <- file.path(Dir, "capfitogen", tif_files$name[i])
      if (!file.exists(dest_path)) {
        message("Downloading: ", tif_files$name[i])
        download_from_google_drive(tif_files$file_id[i], dest_path)
      }
    }

    # list the downloaded files; list.files() sorts alphabetically, so from
    # here on index into file_list, NOT into the scrape-ordered tif_files
    file_list <- list.files(file.path(Dir, "capfitogen"))

    ### read in and format rasters one by one from file name ----
    rasters <- NULL
    for (i in seq_along(file_list)) {
      file_path_i <- file.path(Dir, "capfitogen", file_list[i])
      raster_i <- rast(file_path_i)
      # name the layer after its own file: the scraped order and the sorted
      # directory listing need not agree
      names(raster_i) <- file_list[i]

      ### resample ----
      # if provided, resample to match another raster's origin and resolution
      if (is.null(resample_to_match)) {
        message("No resampling.")
      } else if (inherits(resample_to_match, "SpatRaster")) {
        # stamp the template CRS onto the layer, then resample to its grid.
        # NOTE(review): overwriting the CRS assumes the source data really
        # are in the template's CRS -- confirm, else terra::project() is
        # needed instead.
        terra::crs(raster_i) <- terra::crs(resample_to_match)

        message(paste(
          "Resampling", names(raster_i),
          "to match", names(resample_to_match)
        ))
        raster_i <- terra::resample(raster_i, resample_to_match)
      } else {
        stop("Invalid input for resample_to_match. ",
             "Must be a SpatRaster or NULL.", call. = FALSE)
      }

      message(paste("adding raster", names(raster_i)))
      rasters <- c(rasters, raster_i)
    }

    # convert list of rasters to a multi-layer stack
    rasters <- rast(rasters)

    # remove .tif extension from the layer names
    names(rasters) <- sub("\\.tif$", "", names(rasters))

    ### save rasters ----
    message(paste0("saving as netCDF: ", FNAME))
    terra::writeCDF(rasters, filename = FNAME, overwrite = TRUE)
    return(rasters)
  }
+Because a lot of variables will be downloaded, the total memory requirements may be too large for most personal computers. Try with a subset of the data if necessary. -- After cloning this repository, you need to clone Capfitogen (a submodule) as well with `git submodule update --init`. +NB! After cloning this repository, you need to clone Capfitogen (a submodule) as well with `git submodule update --init`. + +Alternative ways of running the capfitogen capabilities: - To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 0bc4455..85dbcaa 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -559,7 +559,7 @@ FUN.DownCAPFITOGEN <- ### read in and format rasters one by one from file name ---- rasters <- NULL - for (i in 1:50) { # length(file_list) + for (i in 1:length(file_list)) { # length(file_list) file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) raster_i <- rast(file_path_i) # rename raster @@ -595,10 +595,16 @@ FUN.DownCAPFITOGEN <- # convert list of rasters to a stack rasters <- rast(rasters) + + # remove .tif extension from raster name + names(rasters) <- sub("\\.tif$", "", names(rasters)) ## save rasters ---- message(paste0("saving as netCDF:", FNAME)) - terra::writeCDF(rasters, filename = FNAME, overwrite = TRUE) + terra::writeCDF(rasters, + varname = names(rasters), + filename = FNAME, + overwrite = TRUE) return(rasters) } } diff --git a/capfitogen_master.R b/capfitogen_master.R index 8705a32..881a09d 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -88,7 +88,7 @@ message(paste("------------------------------", sep = "\n")) ## Run SHARED-Data script ------------------------------------------------- -## defines FUN.DownGBIF(), FUN.DownBV(), FUN.DownEV() +## defines 
functions used to download data source(file.path(Dir.R_scripts, "SHARED-Data.R")) ## GBIF Data -------------------------------------------------------------- @@ -110,11 +110,13 @@ template_raster <- rast(nrows = 1800, crs(template_raster) <- "epsg:4326" # WGS84 - World Geodetic System 1984 # download the default data from CAPFITOGEN. +# development goal: replace with argument to give alternative download links all_predictors <- FUN.DownCAPFITOGEN( Dir = Dir.Data.Envir, - Force = TRUE, + Force = FALSE, resample_to_match = template_raster ) +names(all_predictors) ## Protected areas database --------------------------------------------------- #' download shapefiles for protected areas to overlay with Complementa tool. @@ -192,11 +194,19 @@ crop_to_native_range <- function( } } -# Apply cropping function -crop_to_native_range( +# load native species range map +range_map <- NULL + +# apply cropping function +if(is.null(range_map)) { + message("global extent") +} else { + message(paste("cropping to native range map: ", names(range_map))) + crop_to_native_range( Dir = Dir.Base, Force = FALSE, - native_range_map = NULL) + native_range_map = range_map) +} # CAPFITOGEN pipeline ========================================================= message(paste("------------------------------", @@ -229,17 +239,17 @@ write.table(Species_ls[["occs"]], ## Variable selection --------------------------------------------------------- message("running variable selection") +# make a new function to get the most extreme monthly means? 
+ # predefine list of variables to keep -predictors_to_keep <- c( - -) +predefined_predictors_to_keep <- NULL # run variable selection based on variable inflation factor usdm::vif predictor_vifs <- vifcor( all_predictors, th = 0.8, # threshold of correlation - keep = NULL, # if wanted, vector of variables to keep no matter what + keep = predefined_predictors_to_keep, # if wanted, vector of variables to keep no matter what size = 5000, # subset size in case of big data (default 5000) method = "pearson" # 'pearson','kendall','spearman' ) @@ -254,17 +264,6 @@ print(variables_to_keep) predictors <- all_predictors[[(variables_to_keep)]] predictors <- raster::stack(predictors) -# get bioclimatic variable names that matches Capfitogen's format -predictor_names <- names(predictors) -bioclim_predictor_names <- predictor_names[grep("BIO", predictor_names)] -bioclim_predictor_codes <- sub("_.*", "", tolower(bioclim_predictor_names)) -bioclim_predictor_codes <- sub("o", "o_", bioclim_predictor_codes) -capfitogen_bioclim_names <- read.delim("Capfitogen/bioclim.txt", - fileEncoding = "latin1") -bioclim_subset <- - capfitogen_bioclim_names[capfitogen_bioclim_names$VARCODE %in% bioclim_predictor_codes, ] -bioclim_predictor_names_capfitogen <- bioclim_subset$VARDESCR - # save variables in CAPFITOGEN folder if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"), @@ -275,18 +274,12 @@ if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { saveRDS(predictors, "Capfitogen/rdatapoints/world/9x9/base9x9.RData") -save(predictors, - file = "Capfitogen/rdatapoints/world/9x9/base9x9.RData") - -# names(predictors[[1:length(bioclim_predictor_names)]]) <- bioclim_predictor_codes -predictor_names_for_saving <- - c(bioclim_predictor_codes, - predictor_names[grep("BIO", predictor_names, invert = TRUE)] - ) +# save(predictors, +# file = "Capfitogen/rdatapoints/world/9x9/base9x9.RData") for (i in 
1:dim(predictors)[3]) { file_name_path = file.path("Capfitogen/rdatamaps/world/9x9", - paste0(predictor_names_for_saving[i],".tif")) + paste0(names(predictors)[i],".tif")) writeRaster(predictors[[i]], file_name_path, overwrite = TRUE) @@ -319,7 +312,7 @@ if (!file.exists(file.path(Dir.Capfitogen.Error,"process_info.txt"))) { file.create(file.path(Dir.Capfitogen.Error,"process_info.txt")) } -# add geophysical variables to list of possible variables +# add new geophysical variables to list of possible variables load(file.path(Dir.Capfitogen, "geophys.RData")) if (nrow(geophys[geophys$VARCODE == "wind_max", ]) < 1) { geophys <- rbind( @@ -337,27 +330,35 @@ if (nrow(geophys[geophys$VARCODE == "wind_max", ]) < 1) { ) save(geophys, file = file.path(Dir.Capfitogen, "geophys.RData")) - # rename geophysical variable files ---- NB! will break when edaphic vars are added... - number_of_geophys_variables <- length(predictor_names[grep("BIO", - predictor_names, - invert = TRUE)]) - for (i in 1:number_of_geophys_variables) { - from_name = predictor_names[grep("BIO", - predictor_names, - invert = TRUE)][i] - - to_name = geophys[geophys$VARDESCR == from_name, "VARCODE"][1] - file.rename( - from = file.path( - "Capfitogen/rdatamaps/world/9x9", - paste0(from_name, ".tif")), - to = file.path("Capfitogen/rdatamaps/world/9x9", - paste0(to_name, ".tif")) - ) - } +# # rename geophysical variable files ---- NB! will break if edaphic vars are added... 
+# number_of_geophys_variables <- length(predictor_names[grep("BIO", +# predictor_names, +# invert = TRUE)]) +# for (i in 1:number_of_geophys_variables) { +# from_name = predictor_names[grep("BIO", +# predictor_names, +# invert = TRUE)][i] +# +# to_name = geophys[geophys$VARDESCR == from_name, "VARCODE"][1] +# file.rename( +# from = file.path( +# "Capfitogen/rdatamaps/world/9x9", +# paste0(from_name, ".tif")), +# to = file.path("Capfitogen/rdatamaps/world/9x9", +# paste0(to_name, ".tif")) +# ) +# } } -rm(geophys) +# find names of bioclimatic, edaphic, and geophysical variables +load(file.path(Dir.Capfitogen, "edaph.RData")) +load(file.path(Dir.Capfitogen, "bioclim.RData")) + +edaphic_variables <- intersect(edaph$VARCODE, names(predictors)) +bioclimatic_variables <- intersect(bioclim$VARCODE, names(predictors)) +geophysical_variables <- intersect(geophys$VARCODE, names(predictors)) + +rm(bioclim, edaph, geophys) ## Clustering and map creation: ELCmapas --------------------------------------- message("setting parameters and running ELC map script (ecogeographic land characterization)") @@ -366,18 +367,18 @@ ruta <- Dir.Capfitogen # path to capfitogen scripts resultados <- Dir.Capfitogen.ELCMap # directory to place results pasaporte <- pasaporte_file_name # species occurrence data -pais <- "world" # global extent - big modifications will be necessary to use different extent +pais <- "world" # global extent - big modifications will be necessary to use different resolution geoqual <- FALSE -totalqual<-30 # Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 +totalqual <- 30 # Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 duplicat <- TRUE # duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted distdup <- 1 # distance threshold in km to remove duplicates from same population resol1 <- "celdas 9x9 km aprox (4.5 arc-min)" # resolution latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude 
variable (Y) as a geophysical variable from 'pasaporte' longitud <- FALSE -bioclimv <- bioclim_predictor_names_capfitogen # -edaphv <- names(geophysical_variables)#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) -geophysv <- names(geophysical_variables) # geophysical variables +bioclimv <- bioclimatic_variables # +edaphv <- edaphic_variables #names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) +geophysv <- geophysical_variables # geophysical variables maxg <- 20 # maximum number of clusters per component metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic diff --git a/submit_capfitogen.sh b/submit_capfitogen.sh index de76951..0c7a26f 100644 --- a/submit_capfitogen.sh +++ b/submit_capfitogen.sh @@ -22,4 +22,6 @@ fi export R_TERRA_MAX_RAM_MB # End of workaround +git submodule update --init + singularity run --bind $PWD cwr_0.5.3.sif "capfitogen_master.R" From 8cb2122419ccc3b78c43e73231e7ec979360e8a2 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 18:31:36 +0200 Subject: [PATCH 56/72] add introduction, links --- README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 5695a3f..82e6e3e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,14 @@ -# uc-CWR +# Use case Crop Wild Relatives (uc-CWR) + +This repository hosts code for a [Biodiversity Digital Twin](https://biodt.eu/) use case: the prototype Digital Twin for [Crop Wild Relatives](https://biodt.eu/use-cases/crop-wild-relatives). 
The prototype Digital Twin can be accessed through a grapical user interface made with R shiny and hosted on Lifewatch: [prototype digital twins GUI](http://app.biodt.lifewatch.eu/) + +> *"The Prototype Biodiversity Digital Twin (pDT) for Crop Wild Relatives is an advanced tool designed to aid in the identification and use of crop wild relatives (CWR) genetic resources to enhance crop resilience against climate-driven stresses"* [BioDT.eu/use-cases/crop-wild-relatives](https://biodt.eu/use-cases/crop-wild-relatives) + +For technical documentation, see a separate [markdown file](technical_documentation.md). Below we also outline quick instructions for running the ModGP and Capfitogen tools in R and on the LUMI supercomputer. The prototype Digital Twin is also presented in a 'Research ideas and outcomes' paper: [Chala et al. 2024](https://doi.org/10.3897/rio.10.e125192). The core functionality of the digital twin is ModGP (Modelling the GermPlasm of interest), but two of Capfitogen's tools have since been added to extend the prototype Digital Twin's usefulness. + +> *"MoDGP leverages species distribution modelling, relying on occurrence data of CWR to produce habitat suitability maps, establish mathematical correlations between adaptive traits, such as tolerance to drought and pathogens and environmental factors and facilitates mapping geographic areas where populations possessing genetic resources for resilience against various biotic and abiotic stresses are potentially growing."* [Chala et al. 2024](https://doi.org/10.3897/rio.10.e125192) + +--------------------------------- ## ModGP on Rstudio @@ -24,17 +34,17 @@ ## CAPFITOGEN -As an addition to ModGP, you can run two of Capfitogen's most useful tools: ELC maps and Complementa maps to visualise overlap with protected areas. -Because a lot of variables will be downloaded, the total memory requirements may be too large for most personal computers. Try with a subset of the data if necessary. 
+As an addition to ModGP, you can run two of [Capfitogen](https://www.capfitogen.net/en/)'s most useful tools: [ecogeographic land characterization (ELC) maps](https://www.capfitogen.net/en/tools/elc-mapas/) and [Complementa](https://www.capfitogen.net/en/tools/complementa/) maps to visualise overlap with protected areas. +Because a lot of variables will be downloaded and processed, the total memory requirements may be too large for most personal computers. Try with a subset of the data if necessary. NB! After cloning this repository, you need to clone Capfitogen (a submodule) as well with `git submodule update --init`. Alternative ways of running the capfitogen capabilities: -- To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. +- To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. After changing the species name, you can run the whole script as a background job if desired. + +- To run on LUMI (assumes access to LUMI and the project): Fetch the container (step 2 from ModGP above), then submit the job for a desired species (e.g. 
Lathyrus): -- To run on LUMI: obtain interactive session: - `srun -p small --nodes=1 --ntasks-per-node=1 --mem=8G -t 4:00:00 --pty bash` - and execute the workflow: - `singularity run --bind $PWD cwr_0.2.0.sif capfitogen_master.R` + sbatch submit_capfitogen_prep_lumi.sh Lathyrus + sbatch submit_capfitogen_exec_lumi.sh Lathyrus From 3ab2962d5fd2f188d50b3cda7796c9722b4552e6 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 18:53:00 +0200 Subject: [PATCH 57/72] add tech doc draft --- technical_documentation.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 technical_documentation.md diff --git a/technical_documentation.md b/technical_documentation.md new file mode 100644 index 0000000..8df08e2 --- /dev/null +++ b/technical_documentation.md @@ -0,0 +1,30 @@ +# Technical documentation use-case Crop Wild Relatives + +> NB! This document is under construction. + +Scientific usefulness, written in R to be run on an hpc for a crop species or genus. + +## ModGP + +Main characteristics, script and workflow structure + +### Inputs + +### Outputs + +## Capfitogen + +Main characteristics, script and workflow structure. + +![Figure: Illustration of scripts and data for running Capfitogen tools](capfitogen_master_illustration.drawio.svg) + +### Inputs + +- Species occurrences, taken from download by ModGP if available +- Environmental variables downloaded from CAPFITOGEN's google drive: 177 bioclimatic, edaphic (soil), and geophysical predictors. These are narrowed down using Variable Inflation Factor. 
+- protected areas, vector map, by the [The World Database on Protected Areas (WDPA)](https://www.protectedplanet.net/en/thematic-areas/wdpa?tab=WDPA) + +### Outputs + +- ELC maps created with CAPFITOGEN +- Complementa analysis created with CAPFITOGEN From 6b4ff8384efb41142d9abe082ef5dcf128b47fcc Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 20:20:35 +0200 Subject: [PATCH 58/72] create prep and exec script drafts --- capfitogen-run_prep.R | 99 +++++++++++++++++++ submit_capfitogen_exec_lumi.sh | 15 +++ ...togen.sh => submit_capfitogen_prep_lumi.sh | 4 +- 3 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 capfitogen-run_prep.R create mode 100644 submit_capfitogen_exec_lumi.sh rename submit_capfitogen.sh => submit_capfitogen_prep_lumi.sh (89%) diff --git a/capfitogen-run_prep.R b/capfitogen-run_prep.R new file mode 100644 index 0000000..62f0e4f --- /dev/null +++ b/capfitogen-run_prep.R @@ -0,0 +1,99 @@ +#' ####################################################################### # +#' PROJECT: [BioDT CWR - CAPFITOGEN] +#' CONTENTS: +#' - Execution of Capfitogen pipeline +#' DEPENDENCIES: +#' - R_scripts directory containing: +#' - "Capfitogen_ +#' - "SHARED-APICredentials.R" +#' - "SHARED-Data.R" +#' AUTHOR: [Eva Lieungh, Erik Kusch] +#' ####################################################################### # + +# PREAMBLE ================================================================ +set.seed(42) # making things reproducibly random +rm(list=ls()) + +# Read species from command-line argument +args = commandArgs(trailingOnly=TRUE) +if (length(args)==0) { + # Default species + SPECIES <- "Lathyrus" +} else { + SPECIES <- args[1] +} +message(sprintf("SPECIES = %s", SPECIES)) + +## Directories ------------------------------------------------------------ +### Define directories in relation to project directory +Dir.Base <- getwd() +Dir.Scripts <- file.path(Dir.Base, "R_scripts") + +## Sourcing 
--------------------------------------------------------------- +source(file.path(Dir.Scripts,"ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +# source(file.path(Dir.Scripts,"ModGP-SDM.R")) # <-- replace with something? +# source(file.path(Dir.Scripts,"ModGP-Outputs.R")) + +## API Credentials -------------------------------------------------------- +try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) +if (!exists("API_User")) { + API_User <- "none@" +} + +# Choose the number of parallel processes +RUNNING_ON_LUMI <- TRUE + +numberOfCores <- strtoi(Sys.getenv("SLURM_CPUS_PER_TASK")) +if (is.na(numberOfCores)) { + numberOfCores <- 1 +} + +message(sprintf("numberOfCores = %d", numberOfCores)) +#' +#' # DATA ==================================================================== +#' ## GBIF Data -------------------------------------------------------------- +#' message("Retrieving GBIF data") +#' ## species of interest +#' Species_ls <- FUN.DownGBIF( +#' species = SPECIES, # which species to pull data for +#' Dir = Dir.Data.GBIF, # where to store the data output on disk +#' Force = FALSE, # do not overwrite already present data +#' Mode = "ModGP", # query download for entire genus +#' parallel = 1 # no speed gain here for parallelising on personal machine +#' ) +#' +#' ## Environmental Data ----------------------------------------------------- +#' message("Retrieving environmental data") +#' BV_ras <- FUN.DownBV(T_Start = 1985, # what year to begin climatology calculation in +#' T_End = 2015, # what year to end climatology calculation in +#' Dir = Dir.Data.Envir, # where to store the data output on disk +#' Force = FALSE # do not overwrite already present data +#' ) +#' +#' ## Posthoc Data ----------------------------------------------------------- +#' message("Retrieving additional covariates") +#' #' For relating SDM outputs to other characteristics of interest to users +#' PH_nutrient <- 
raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") +#' PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") +#' PH_stack <- stack(PH_nutrient, PH_toxicity) +#' PH_stack <- raster::resample(PH_stack, BV_ras[[1]]) +#' PH_stack <- stack(PH_stack, BV_ras$BIO1, BV_ras$BIO12) +#' names(PH_stack) <- c("Nutrient", "Toxicity", "Temperature", "Soil Moisture") +#' FNAME <- file.path(Dir.Data.Envir, "PH_stack") +#' saveObj(PH_stack, file = FNAME) +#' +#' ## SDM Data Preparations -------------------------------------------------- +#' message("Preparing data for SDM workflow") +#' SDMInput_ls <- FUN.PrepSDMData(occ_ls = Species_ls$occs, # list of occurrence data frames per species +#' BV_ras = BV_ras, # bioclimatic rasterstack +#' Dir = Dir.Data.ModGP, # where to store the data output on disk +#' Force = FALSE, # # do not overwrite already present data +#' parallel = numberOfCores # parallelised execution +#' ) +#' +#' # Extract the list of species names +#' species_names <- names(SDMInput_ls) +# +# # Save the species names to a file +# writeLines(species_names, "species_list.txt") diff --git a/submit_capfitogen_exec_lumi.sh b/submit_capfitogen_exec_lumi.sh new file mode 100644 index 0000000..3aa207d --- /dev/null +++ b/submit_capfitogen_exec_lumi.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH -J capfitogen +#SBATCH -o capfitogen-%j.out +#SBATCH --account=project_465000915 +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=128 +#SBATCH --time=10:00:00 +#SBATCH --partition=largemem +##SBATCH --partition=lumid +##SBATCH --partition=small --mem-per-cpu=2G +##SBATCH --partition=standard --exclusive --mem=0 +##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 + +singularity run --bind $PWD cwr_0.5.3.sif "capfitogen_master.R" \ No newline at end of file diff --git a/submit_capfitogen.sh b/submit_capfitogen_prep_lumi.sh similarity index 89% rename from 
submit_capfitogen.sh rename to submit_capfitogen_prep_lumi.sh index 0c7a26f..15d1aa2 100644 --- a/submit_capfitogen.sh +++ b/submit_capfitogen_prep_lumi.sh @@ -5,7 +5,7 @@ #SBATCH --nodes=1 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=8 -#SBATCH --time=48:00:00 +#SBATCH --time=24:00:00 #SBATCH --partition=small --mem=64G ##SBATCH --partition=standard --exclusive --mem=0 ##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 @@ -24,4 +24,4 @@ export R_TERRA_MAX_RAM_MB git submodule update --init -singularity run --bind $PWD cwr_0.5.3.sif "capfitogen_master.R" +singularity run --bind $PWD cwr_0.5.3.sif "capfitogen-run_prep.R" From e55548a579623b5db49549c5c3a4f7efcd35b009 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 21:05:48 +0200 Subject: [PATCH 59/72] delete redundant script --- capfitogen-run_prep.R | 99 ------------------------------------------- 1 file changed, 99 deletions(-) delete mode 100644 capfitogen-run_prep.R diff --git a/capfitogen-run_prep.R b/capfitogen-run_prep.R deleted file mode 100644 index 62f0e4f..0000000 --- a/capfitogen-run_prep.R +++ /dev/null @@ -1,99 +0,0 @@ -#' ####################################################################### # -#' PROJECT: [BioDT CWR - CAPFITOGEN] -#' CONTENTS: -#' - Execution of Capfitogen pipeline -#' DEPENDENCIES: -#' - R_scripts directory containing: -#' - "Capfitogen_ -#' - "SHARED-APICredentials.R" -#' - "SHARED-Data.R" -#' AUTHOR: [Eva Lieungh, Erik Kusch] -#' ####################################################################### # - -# PREAMBLE ================================================================ -set.seed(42) # making things reproducibly random -rm(list=ls()) - -# Read species from command-line argument -args = commandArgs(trailingOnly=TRUE) -if (length(args)==0) { - # Default species - SPECIES <- "Lathyrus" -} else { - SPECIES <- args[1] -} -message(sprintf("SPECIES = %s", SPECIES)) - -## Directories ------------------------------------------------------------ 
-### Define directories in relation to project directory -Dir.Base <- getwd() -Dir.Scripts <- file.path(Dir.Base, "R_scripts") - -## Sourcing --------------------------------------------------------------- -source(file.path(Dir.Scripts,"ModGP-commonlines.R")) -source(file.path(Dir.Scripts,"SHARED-Data.R")) -# source(file.path(Dir.Scripts,"ModGP-SDM.R")) # <-- replace with something? -# source(file.path(Dir.Scripts,"ModGP-Outputs.R")) - -## API Credentials -------------------------------------------------------- -try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) -if (!exists("API_User")) { - API_User <- "none@" -} - -# Choose the number of parallel processes -RUNNING_ON_LUMI <- TRUE - -numberOfCores <- strtoi(Sys.getenv("SLURM_CPUS_PER_TASK")) -if (is.na(numberOfCores)) { - numberOfCores <- 1 -} - -message(sprintf("numberOfCores = %d", numberOfCores)) -#' -#' # DATA ==================================================================== -#' ## GBIF Data -------------------------------------------------------------- -#' message("Retrieving GBIF data") -#' ## species of interest -#' Species_ls <- FUN.DownGBIF( -#' species = SPECIES, # which species to pull data for -#' Dir = Dir.Data.GBIF, # where to store the data output on disk -#' Force = FALSE, # do not overwrite already present data -#' Mode = "ModGP", # query download for entire genus -#' parallel = 1 # no speed gain here for parallelising on personal machine -#' ) -#' -#' ## Environmental Data ----------------------------------------------------- -#' message("Retrieving environmental data") -#' BV_ras <- FUN.DownBV(T_Start = 1985, # what year to begin climatology calculation in -#' T_End = 2015, # what year to end climatology calculation in -#' Dir = Dir.Data.Envir, # where to store the data output on disk -#' Force = FALSE # do not overwrite already present data -#' ) -#' -#' ## Posthoc Data ----------------------------------------------------------- -#' message("Retrieving additional covariates") 
-#' #' For relating SDM outputs to other characteristics of interest to users -#' PH_nutrient <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc") -#' PH_toxicity <- raster("https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc") -#' PH_stack <- stack(PH_nutrient, PH_toxicity) -#' PH_stack <- raster::resample(PH_stack, BV_ras[[1]]) -#' PH_stack <- stack(PH_stack, BV_ras$BIO1, BV_ras$BIO12) -#' names(PH_stack) <- c("Nutrient", "Toxicity", "Temperature", "Soil Moisture") -#' FNAME <- file.path(Dir.Data.Envir, "PH_stack") -#' saveObj(PH_stack, file = FNAME) -#' -#' ## SDM Data Preparations -------------------------------------------------- -#' message("Preparing data for SDM workflow") -#' SDMInput_ls <- FUN.PrepSDMData(occ_ls = Species_ls$occs, # list of occurrence data frames per species -#' BV_ras = BV_ras, # bioclimatic rasterstack -#' Dir = Dir.Data.ModGP, # where to store the data output on disk -#' Force = FALSE, # # do not overwrite already present data -#' parallel = numberOfCores # parallelised execution -#' ) -#' -#' # Extract the list of species names -#' species_names <- names(SDMInput_ls) -# -# # Save the species names to a file -# writeLines(species_names, "species_list.txt") From a422241545fc8178896089dbf89612dc2a12601b Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 21:07:39 +0200 Subject: [PATCH 60/72] update structure --- capfitogen_master.R | 65 ++++++++++++++++------------------ submit_capfitogen_prep_lumi.sh | 6 ++-- technical_documentation.md | 16 +++++++++ 3 files changed, 49 insertions(+), 38 deletions(-) diff --git a/capfitogen_master.R b/capfitogen_master.R index 881a09d..8abb7cd 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -44,43 +44,40 @@ Dir.Scripts <- file.path(Dir.Base, "R_scripts") source(file.path(Dir.Scripts, "ModGP-commonlines.R")) ## API Credentials -------------------------------------------------------- -{# set API 
credentials for access to climate data store (CDS) - try(source(file.path(Dir.R_scripts, "SHARED-APICredentials.R"))) - if (as.character(options("gbif_user")) == "NULL") { - options(gbif_user = rstudioapi::askForPassword("my gbif username")) - } - if (as.character(options("gbif_email")) == "NULL") { - options(gbif_email = rstudioapi::askForPassword("my registred gbif e-mail")) - } - if (as.character(options("gbif_pwd")) == "NULL") { - options(gbif_pwd = rstudioapi::askForPassword("my gbif password")) - } - - if (!exists("API_Key") | - !exists("API_User")) { - # CS API check: if CDS API credentials have not been specified elsewhere - API_User <- - readline(prompt = "Please enter your Climate Data Store API user number and hit ENTER.") - API_Key <- - readline(prompt = "Please enter your Climate Data Store API key number and hit ENTER.") - } # end of CDS API check +# set API credentials for access to climate data store (CDS) +try(source(file.path(Dir.R_scripts, "SHARED-APICredentials.R"))) +if (as.character(options("gbif_user")) == "NULL") { + options(gbif_user = rstudioapi::askForPassword("my gbif username")) } - -## NUMBER OF CORES -{ - if (!exists("numberOfCores")) { - # Core check: if number of cores for parallel processing has not been set yet - numberOfCores <- - as.numeric(readline( - prompt = paste( - "How many cores do you want to allocate to these processes? 
Your machine has", - parallel::detectCores() - ) - )) - } # end of Core check - message(sprintf("numberOfCores = %d", numberOfCores)) +if (as.character(options("gbif_email")) == "NULL") { + options(gbif_email = rstudioapi::askForPassword("my registred gbif e-mail")) +} +if (as.character(options("gbif_pwd")) == "NULL") { + options(gbif_pwd = rstudioapi::askForPassword("my gbif password")) } +if (!exists("API_Key") | + !exists("API_User")) { + # CS API check: if CDS API credentials have not been specified elsewhere + API_User <- + readline(prompt = "Please enter your Climate Data Store API user number and hit ENTER.") + API_Key <- + readline(prompt = "Please enter your Climate Data Store API key number and hit ENTER.") +} # end of CDS API check + +## NUMBER OF CORES +if (!exists("numberOfCores")) { + # Core check: if number of cores for parallel processing has not been set yet + numberOfCores <- + as.numeric(readline( + prompt = paste( + "How many cores do you want to allocate to these processes? 
Your machine has", + parallel::detectCores() + ) + )) +} # end of Core check +message(sprintf("numberOfCores = %d", numberOfCores)) + # DATA ==================================================================== message(paste("------------------------------", " starting data download/load ", diff --git a/submit_capfitogen_prep_lumi.sh b/submit_capfitogen_prep_lumi.sh index 15d1aa2..341b20e 100644 --- a/submit_capfitogen_prep_lumi.sh +++ b/submit_capfitogen_prep_lumi.sh @@ -1,11 +1,11 @@ #!/bin/bash -l -#SBATCH -J CapFitogen +#SBATCH -J capfitogen #SBATCH -o capfitogen-%j.out #SBATCH --account=project_465000915 #SBATCH --nodes=1 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=8 -#SBATCH --time=24:00:00 +#SBATCH --time=48:00:00 #SBATCH --partition=small --mem=64G ##SBATCH --partition=standard --exclusive --mem=0 ##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 @@ -23,5 +23,3 @@ export R_TERRA_MAX_RAM_MB # End of workaround git submodule update --init - -singularity run --bind $PWD cwr_0.5.3.sif "capfitogen-run_prep.R" diff --git a/technical_documentation.md b/technical_documentation.md index 8df08e2..69863fe 100644 --- a/technical_documentation.md +++ b/technical_documentation.md @@ -8,14 +8,30 @@ Scientific usefulness, written in R to be run on an hpc for a crop species or ge Main characteristics, script and workflow structure +|File | Description | +| --- | ----------- | + ### Inputs ### Outputs +---------------------- + ## Capfitogen Main characteristics, script and workflow structure. +> Disclaimer: CAPFITOGEN is separate software that was created by others. See Parra-Quijano et al. 2021, and [CAPFITOGEN.net](https://www.capfitogen.net/en/). + +Some files are shared between ModGP and capfitogen scripts (R_scripts/SHARED-Data.R and ModGP-commonlines.R). These files are specific only to capfitogen code: + +|File | Description | +| --- | ----------- | +| Capfitogen | the Capfitogen code repository. A submodule (repository within repository). 
| +| capfitogen_master.R | Main code for setting up environment, downloading data, and executing capfitogen tools. | +| submit_capfitogen_prep_lumi.sh | bash script to initialise capfitogen submodule and add a workaround fo memory issue. | +| submit_capfitogen_exec_lumi.sh | bash script to execute (run) capfitogen_master.R on LUMI. | + ![Figure: Illustration of scripts and data for running Capfitogen tools](capfitogen_master_illustration.drawio.svg) ### Inputs From 6dc25d7f6a5d1c62c3fbea196bb3d3679f95d13e Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 21:18:23 +0200 Subject: [PATCH 61/72] update illustration --- capfitogen_master_illustration.drawio.svg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capfitogen_master_illustration.drawio.svg b/capfitogen_master_illustration.drawio.svg index a47b7cd..7f3a324 100644 --- a/capfitogen_master_illustration.drawio.svg +++ b/capfitogen_master_illustration.drawio.svg @@ -1,4 +1,4 @@ -
GBIF
GBIF
GitHub -- uc-CRW repository
GitHub -- uc-CRW repository
/R_Scripts
/R_Scripts
/Data/Environment
/Data/Environment
/Data/GBIF
/Data/GBIF
/R_Scripts

ModGP-commonlines.R


ModGP-commonlines.R...
Packages,
directories,
simple functions
Packages,...
SHARED-Data.R
SHARED-Data.R
Define functions to download data:
GBIF (FUN.DownGBIF),
BioClim vars with KrigR (FUN.DownBV),
Edaphic variables (FUN.DownEV),
Geophysical variables (FUN.DownGV)
Define functions to download data:...
ERA5
ERA5
Harmonized World Soil Database
Harmonized World Soil Data...
SoilGrids
SoilGrids
Download CAPFITOGEN scripts
Download CAPFIT...
Set species name
Set species name
run shared scripts
run shared scri...
FUN.DownGBIF()
FUN.DownGBIF()
FUN.DownBV()
FUN.DownBV()
FUN.DownEV()
FUN.DownEV()
run Capfitogen tools
run Capfitogen tools
set parameters for capfitogen
set parameters for ca...
visualising output
visualising output
Variable selection with usdm::vifcor()
Variable selection wi...
capfitogen_master.R
capfitogen_master.R
/Capfitogen-main
/scripts/Tools Herramientas/
ELCmapas.R
/scripts/Tools Herramientas/...
CAPFITOGEN tool for
ecogeographic land characterization (ELC) maps
which reflect adaptive scenarios for
a specific species
CAPFITOGEN tool for...
/scripts/Tools Herramientas/
Complementa.R
/scripts/Tools Herramientas/...
CAPFITOGEN tool for
complementarity analysis between cells or 
protected areas to see the degree of coverage of
current protected area networks in terms of in situ conservation of crop wild relatives
CAPFITOGEN tool for...
\ No newline at end of file +
GBIF
GitHub -- uc-CRW repository
/R_Scripts
/Data/Environment
/Data/GBIF
/R_Scripts

ModGP-commonlines.R


Packages,
directories,
simple functions
SHARED-Data.R
Define functions to download data:
GBIF (FUN.DownGBIF),
CAPFITOGEN's google drive (FUN.DownCAPFITOGEN)
World Database on Protected Areas (FUN.DownWDPA)
CAPFITOGEN
(WorldClim, SoilGrids, HSWD)
World Database on
Protected Areas
CAPFITOGEN submodule
Set species name
run shared scripts
FUN.DownGBIF()
FUN.DownCAPFITOGEN()
FUN.DownWDPA()
run Capfitogen tools
set parameters for capfitogen
visualising output
Variable selection with usdm::vifcor()
capfitogen_master.R
evalieungh/Capfitogen
/scripts/Tools Herramientas/
ELCmapas.R
CAPFITOGEN tool for
ecogeographic land characterization (ELC) maps
which reflect adaptive scenarios for
a specific species
/scripts/Tools Herramientas/
Complementa.R
CAPFITOGEN tool for
complementarity analysis between cells or 
protected areas to see the degree of coverage of
current protected area networks in terms of in situ conservation of crop wild relatives
\ No newline at end of file From 7d5dae3b5e334d8d15f4a459cb3da962a0e469c0 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Wed, 30 Apr 2025 21:23:49 +0200 Subject: [PATCH 62/72] update tech doc --- technical_documentation.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/technical_documentation.md b/technical_documentation.md index 69863fe..56ddab9 100644 --- a/technical_documentation.md +++ b/technical_documentation.md @@ -21,7 +21,7 @@ Main characteristics, script and workflow structure Main characteristics, script and workflow structure. -> Disclaimer: CAPFITOGEN is separate software that was created by others. See Parra-Quijano et al. 2021, and [CAPFITOGEN.net](https://www.capfitogen.net/en/). +> Disclaimer: CAPFITOGEN is separate software that was created by others. It is not owned by BioDT, but this prototype digital twin uses a subset of available scripts and functionality from CAPFITOGEN. See Parra-Quijano et al. 2021, and [CAPFITOGEN.net](https://www.capfitogen.net/en/). Some files are shared between ModGP and capfitogen scripts (R_scripts/SHARED-Data.R and ModGP-commonlines.R). These files are specific only to capfitogen code: @@ -33,6 +33,7 @@ Some files are shared between ModGP and capfitogen scripts (R_scripts/SHARED-Dat | submit_capfitogen_exec_lumi.sh | bash script to execute (run) capfitogen_master.R on LUMI. | ![Figure: Illustration of scripts and data for running Capfitogen tools](capfitogen_master_illustration.drawio.svg) +*Illustration of scripts and data for running the Capfitogen pipeline. The master script connects to data and other scripts through download functions and sourcing. 
To run on LUMI, the master script is executed through bash scripts.* ### Inputs From 155e362a5733b1301f39e3ef96091ad4aac475f6 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:37:25 +0300 Subject: [PATCH 63/72] Reorder packages --- container/Dockerfile | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/container/Dockerfile b/container/Dockerfile index f222998..850b6da 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -56,13 +56,6 @@ RUN . /conda/etc/profile.d/conda.sh && \ conda activate /conda/env && \ conda install -c conda-forge --override-channels \ r-rcpparmadillo=14.2.2-1 \ - r-iterators=1.0.14 \ - r-sp=2.1-3 \ - r-raster=3.6-26 \ - r-rgbif=3.8.0 \ - r-rnaturalearth=1.0.1 \ - r-rnaturalearthdata=1.0.0 \ - r-ncdf4=1.22 \ r-epi=2.47.1 \ r-png=0.1-8 \ r-keyring=1.3.2 \ @@ -83,16 +76,23 @@ RUN . /conda/etc/profile.d/conda.sh && \ r-doParallel=1.0.17 \ r-foreach=1.5.2 \ r-doSNOW=1.0.20 \ - r-automap=1.1-12 \ r-fasterize=1.1.0 \ r-stars=0.6-6 \ + r-automap=1.1-12 \ + r-cowplot=1.1.3 \ r-ggplot2=3.5.1 \ + r-ggpmisc=0.6.1 \ r-ggpubr=0.6.0 \ + r-gridExtra=2.3 \ + r-ncdf4=1.22 \ + r-raster=3.6-26 \ + r-rgbif=3.8.0 \ + r-rnaturalearth=1.0.1 \ + r-rnaturalearthdata=1.0.0 \ + r-sp=2.1-3 \ r-tidyr=1.3.1 \ r-viridis=0.6.5 \ - r-cowplot=1.1.3 \ - r-ggpmisc=0.6.1 \ - r-gridExtra=2.3 \ + r-iterators=1.0.14 \ # Packages for capfitogen r-maptools=1.1-8 \ r-dismo=1.3-16 \ From 1c627e90a011ce3c1da1fe6362d012ffef80c8e2 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:38:32 +0300 Subject: [PATCH 64/72] Add extra packages --- container/Dockerfile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/container/Dockerfile b/container/Dockerfile index 850b6da..4b6629e 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -85,6 +85,7 @@ RUN . 
/conda/etc/profile.d/conda.sh && \ r-ggpubr=0.6.0 \ r-gridExtra=2.3 \ r-ncdf4=1.22 \ + r-maps=3.4.2.1 \ r-raster=3.6-26 \ r-rgbif=3.8.0 \ r-rnaturalearth=1.0.1 \ @@ -93,6 +94,8 @@ RUN . /conda/etc/profile.d/conda.sh && \ r-tidyr=1.3.1 \ r-viridis=0.6.5 \ r-iterators=1.0.14 \ + r-rvest=1.0.4 \ + r-gdalutilities=1.2.5 \ # Packages for capfitogen r-maptools=1.1-8 \ r-dismo=1.3-16 \ From 029e77606f5ae060afa83932fd655c6c3d13b08c Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:39:06 +0300 Subject: [PATCH 65/72] Update R --- container/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/container/Makefile b/container/Makefile index 960d0c1..61a2b27 100644 --- a/container/Makefile +++ b/container/Makefile @@ -1,13 +1,13 @@ IMAGE_ROOT?=ghcr.io/biodt IMAGE=cwr IMAGE_VERSION=0.5.3 -R_VERSION=4.3.2 +R_VERSION=4.3.3 build: Dockerfile docker buildx build --platform linux/amd64 \ --label "org.opencontainers.image.source=https://github.com/BioDT/uc-CWR" \ - --label "org.opencontainers.image.description=CWR environment with R $(R_VERSION)" \ + --label "org.opencontainers.image.description=CWR environment" \ --build-arg R_VERSION=$(R_VERSION) \ -t $(IMAGE_ROOT)/$(IMAGE):$(IMAGE_VERSION) \ . From 549cd0379e33a1333e99f3538e2ad9125fef0088 Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:39:27 +0300 Subject: [PATCH 66/72] Remove unused broken package --- container/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/container/Dockerfile b/container/Dockerfile index 4b6629e..f3e716e 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -143,7 +143,6 @@ RUN . 
/conda/etc/profile.d/conda.sh && \ "blockCV", \ "ecmwfr", \ "giscoR", \ - "tidyterra", \ "geodata", \ "mmap" \ ))' && \ From f2f495c94ef1828cd1df0ae0a8ef8a7b2fd677eb Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:51:30 +0300 Subject: [PATCH 67/72] Update image version --- container/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/container/Makefile b/container/Makefile index 61a2b27..b303bcd 100644 --- a/container/Makefile +++ b/container/Makefile @@ -1,6 +1,6 @@ IMAGE_ROOT?=ghcr.io/biodt IMAGE=cwr -IMAGE_VERSION=0.5.3 +IMAGE_VERSION=0.6.0 R_VERSION=4.3.3 From aa27c277bbc64647c682b120812afa102529bc1f Mon Sep 17 00:00:00 2001 From: Tuomas Rossi Date: Fri, 2 May 2025 11:51:43 +0300 Subject: [PATCH 68/72] Add instructions for building containers --- container/README.md | 73 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 container/README.md diff --git a/container/README.md b/container/README.md new file mode 100644 index 0000000..7f4b6cf --- /dev/null +++ b/container/README.md @@ -0,0 +1,73 @@ +# Building container + +This directory has two main files: +- `Dockerfile`: This is the recipe for building a container image +- `Makefile`: This is a helper file for simplifying building the image (so that one can say `make build` instead of `docker buildx build --platform ... --build-arg ...`) + + +## Prerequisites + +Docker/Podman is needed for building the image on a local computer. Follow these steps on Ubuntu: + +1. Install docker or podman: + + sudo apt install podman-docker + +2. Add the following environment variable to `~/.bashrc` or redefine it always before running build commands: + + export BUILDAH_FORMAT=docker + + +## Workflow for building, publishing, and pulling images + +1. Updating and building a new image on a local computer + + 1. Update `IMAGE_VERSION` variable in `Makefile`. 
+ If this is not done, an existing image with the same version gets overwritten, which is problematic for reproducibility. + Using [semantic versioning](https://semver.org/) is recommended. + + 2. Update `Dockerfile` and/or `Makefile` as needed. + + 3. Build a new image: + + make build + +2. Pushing image to ghcr.io + + 1. Login to GitHub container registry. + Use GitHub username and Personal Access Token with scope 'write:packages' as username and password, respectively. + See [these instructions for creating a token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic). + + docker login ghcr.io + + 2. Push the image to ghcr.io: + + make push + +3. Pulling image to LUMI and converting it to singularity + + 1. Execute the following on LUMI (if image is private, add `--docker-login` and login with a token with scope 'read:packages'): + + singularity pull --disable-cache docker://ghcr.io/biodt/cwr:IMAGE_VERSION + + This creates a file `cwr_IMAGE_VERSION.sif`. + + +## Development workflow without publishing images + +1. Updating and building a new image on a local computer + + - Follow the same steps as above + +2. Converting the image to singularity on a local computer + + 1. Convert the image to singularity (results in a .sif file): + + make singularity + +3. Transferring the image to LUMI + + 1. 
Use scp: + + scp file.sif user@lumi.csc.fi:/scratch/project_xxx/user/ + From 1c868aca121a19e501cf2d22cf990d6c04fcaadc Mon Sep 17 00:00:00 2001 From: evalieungh Date: Fri, 2 May 2025 16:03:01 +0200 Subject: [PATCH 69/72] update container version --- README.md | 5 ++++- submit_capfitogen_exec_lumi.sh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 82e6e3e..1d58077 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,10 @@ Alternative ways of running the capfitogen capabilities: - To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. After changing the species name, you can run the whole script as a background job if desired. -- To run on LUMI (assumes access to LUMI and the project): Fetch the container (step 2 from ModGP above), then submit the job for a desired species (e.g. Lathyrus): +- To run on LUMI (assumes access to LUMI and the project): + +1. Fetch the container: `singularity pull --disable-cache docker://ghcr.io/biodt/cwr:0.6.0` +2. then submit the job for a desired species (e.g. 
Lathyrus): sbatch submit_capfitogen_prep_lumi.sh Lathyrus sbatch submit_capfitogen_exec_lumi.sh Lathyrus diff --git a/submit_capfitogen_exec_lumi.sh b/submit_capfitogen_exec_lumi.sh index 3aa207d..0c9d8cc 100644 --- a/submit_capfitogen_exec_lumi.sh +++ b/submit_capfitogen_exec_lumi.sh @@ -12,4 +12,4 @@ ##SBATCH --partition=standard --exclusive --mem=0 ##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 -singularity run --bind $PWD cwr_0.5.3.sif "capfitogen_master.R" \ No newline at end of file +singularity run --bind $PWD cwr_0.6.0.sif "capfitogen_master.R" \ No newline at end of file From a14a4da568dddd51469c3d27a65e9c72166c7182 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 5 May 2025 08:36:10 +0200 Subject: [PATCH 70/72] fix project number --- submit_capfitogen_exec_lumi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submit_capfitogen_exec_lumi.sh b/submit_capfitogen_exec_lumi.sh index 0c9d8cc..10ddad6 100644 --- a/submit_capfitogen_exec_lumi.sh +++ b/submit_capfitogen_exec_lumi.sh @@ -1,7 +1,7 @@ #!/bin/bash -l #SBATCH -J capfitogen #SBATCH -o capfitogen-%j.out -#SBATCH --account=project_465000915 +#SBATCH --account=project_465001987 #SBATCH --nodes=1 #SBATCH --tasks-per-node=1 #SBATCH --cpus-per-task=128 From 961b99ab73b144d68e77f7c8f1127e182b8b2e2d Mon Sep 17 00:00:00 2001 From: evalieungh Date: Mon, 12 May 2025 13:24:04 +0200 Subject: [PATCH 71/72] add API credentials template --- R_scripts/SHARED-APICredentials-template.R | 23 ++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 R_scripts/SHARED-APICredentials-template.R diff --git a/R_scripts/SHARED-APICredentials-template.R b/R_scripts/SHARED-APICredentials-template.R new file mode 100644 index 0000000..353cd83 --- /dev/null +++ b/R_scripts/SHARED-APICredentials-template.R @@ -0,0 +1,23 @@ +## API Credentials available for Project Partners upon request +message("API Credentials available for Project Partners upon request") + +# CDS +API_User <- 
"yourUserNameHere@gbif.no" # pw: yourSuperSecurePasswordHere +API_Key <- "yourSecretKeyHere" + +# Choose the number of parallel processes +RUNNING_ON_LUMI <- !is.na(strtoi(Sys.getenv("CWR_ON_LUMI"))) +if (RUNNING_ON_LUMI) { + numberOfCores <- strtoi(Sys.getenv("SLURM_NTASKS")) + if (is.na(numberOfCores)) { + numberOfCores <- 1 + } +} else { + numberOfCores <- parallel::detectCores() +} +# numberOfCores <- 1 + +# GBIF +options(gbif_user = "yourUserName") +options(gbif_email = "yourUserNameHere@gbif.no") +options(gbif_pwd = "yourGBIFpasswordHere") From 426af8c63c8c878bedea777ed78a844bf3522434 Mon Sep 17 00:00:00 2001 From: evalieungh Date: Tue, 20 May 2025 09:42:55 +0300 Subject: [PATCH 72/72] updates --- .gitignore | 2 +- .gitmodules | 1 + Data/Environment/README.md | 12 ++- Data/GBIF/Lathyrus angulatus.json | 4 +- R_scripts/SHARED-Data.R | 162 +++++++++++++++--------------- capfitogen_master.R | 18 +++- 6 files changed, 109 insertions(+), 90 deletions(-) diff --git a/.gitignore b/.gitignore index 2e7bc95..4a221e5 100644 --- a/.gitignore +++ b/.gitignore @@ -78,6 +78,6 @@ results/ELCmap/ results/ # Downloads -Capfitogen/* +Capfitogen/ hq diff --git a/.gitmodules b/.gitmodules index 597542c..0026a11 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "Capfitogen"] path = Capfitogen url = https://github.com/evalieungh/Capfitogen.git + ignore = dirty diff --git a/Data/Environment/README.md b/Data/Environment/README.md index 9fbba7b..f8a3f73 100644 --- a/Data/Environment/README.md +++ b/Data/Environment/README.md @@ -2,13 +2,19 @@ downloaded with functions run in capfitogen_master.R and defined in SHARED-Data.R. -## Bioclimatic variables (BV) +## Bioclimatic variables (FUN.DownBV) A set of 19 bioclimatic variables, downloaded and processed with the KrigR package. 
-## Edaphic variables (EV) +## Capfitogen's set of environmental variables (FUN.DownCAPFITOGEN) + +A collection of publicly available environmental data: bioclimatic, edaphic, and geophysical variables from e.g. WorldClim, SoildGrids and other sources collected in a google drive. The function downloads the data and collects it in a NetCDF (.nc) file. + +### Drafted download functions not currently in use: + +**Edaphic variables (EV)** Soil data downloaded from SoilGrids. Each map occupies ~ 5 GB. "SoilGrids is a system for global digital soil mapping that uses state-of-the-art machine learning methods to map the spatial distribution of soil properties across the globe. SoilGrids prediction models are fitted using over 230 000 soil profile observations from the WoSIS database and a series of environmental covariates. Covariates were selected from a pool of over 400 environmental layers from Earth observation derived products and other environmental information including climate, land cover and terrain morphology. The outputs of SoilGrids are global soil property maps at six standard depth intervals (according to the GlobalSoilMap IUSS working group and its specifications) at a spatial resolution of 250 meters. Prediction uncertainty is quantified by the lower and upper limits of a 90% prediction interval. The SoilGrids maps are publicly available under the [CC-BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). Maps of the following soil properties are available: pH, soil organic carbon content, bulk density, coarse fragments content, sand content, silt content, clay content, cation exchange capacity (CEC), total nitrogen as well as soil organic carbon density and soil organic carbon stock." See [SoilGrids FAQ](https://www.isric.org/explore/soilgrids/faq-soilgrids). 
-## Geophysical variables (GV) +**Geophysical variables (GV)** diff --git a/Data/GBIF/Lathyrus angulatus.json b/Data/GBIF/Lathyrus angulatus.json index 0595327..eeecfa3 100644 --- a/Data/GBIF/Lathyrus angulatus.json +++ b/Data/GBIF/Lathyrus angulatus.json @@ -40,7 +40,7 @@ "studySubject": [ ["http://eurovoc.europa.eu/632"] ], - "datePublished": ["2025-01-27 19:07:06"], + "datePublished": ["2025-05-19 18:53:17"], "name": ["Cleaned GBIF occurrence records for species Lathyrus angulatus"], "encodingFormat": ["application/ld+json"], "mainEntity": ["Dataset"], @@ -59,7 +59,7 @@ ["File"] ], "name": ["Lathyrus angulatus.RData"], - "contentSize": [171823], + "contentSize": ["NA"], "encodingFormat": ["application/RData"] }, { diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 85dbcaa..d22a4b1 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -162,86 +162,86 @@ FUN.DownGBIF <- function( ) names(specs_ls) <- GBIF_specs - ## Making list into single data frame when Capfitogen mode is toggled on. 
- # HJ: section below to create a Capfitogen data frame not used - # species data included as the sf file created above - if(Mode == "Capfitogen"){ - message("Making data for Capfitogen mode") - message(FNAME) - specs_ls <- specs_ls[[1]] - ## create capfitogen data frame - CapfitogenColumns <- c("INSTCODE", - "ACCENUMB", - "COLLNUMB", - "COLLCODE", - "COLLNAME", - "COLLINSTADDRESS", - "COLLMISSID", - "GENUS", - "SPECIES", - "SPAUTHOR", - "SUBTAXA", - "SUBTAUTHOR", - "CROPNAME", - "ACCENAME", - "ACQDATE", - "ORIGCTY", - "NAMECTY", - "ADM1", - "ADM2", - "ADM3", - "ADM4", - "COLLSITE", - "DECLATITUDE", - "LATITUDE", - "DECLONGITUDE", - "LONGITUDE", - "COORDUNCERT", - "COORDDATUM", - "GEOREFMETH", - "ELEVATION", - "COLLDATE", - "BREDCODE", - "BREDNAME", - "SAMPSTAT", - "ANCEST", - "COLLSRC", - "DONORCODE", - "DONORNAME", - "DONORNUMB", - "OTHERNUMB", - "DUPLSITE", - "DUPLINSTNAME", - "STORAGE", - "MLSSTAT", - "REMARKS") - CapfitogenData <- data.frame(matrix(data = NA, - nrow = nrow(specs_ls), - ncol = length(CapfitogenColumns))) - colnames(CapfitogenData) <- CapfitogenColumns - ## Create unique rownames for the ACCENUMB - CapfitogenData$ACCENUMB <- seq(from = 1, - to = nrow(CapfitogenData), - by = 1) - ## Add in the species, latitude and longitude (nothing else at this point) - CapfitogenData$SPECIES <- specs_ls$species - CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] - CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] - specs_ls_capfitogen <- CapfitogenData - } - - ### Returning Object to Disk and Environment ---- - ifelse(Mode == "Capfitogen", - occs = specs_ls_capfitogen, - occs = specs_ls) - save_ls <- list(meta = occ_meta, - occs = occs - # json = JSON_ls - ) - - saveObj(save_ls, file = FNAME) - unlink(occ_get) # removes .zip file - + # ## Making list into single data frame when Capfitogen mode is toggled on. 
+ # # HJ: section below to create a Capfitogen data frame not used + # # species data included as the sf file created above + # if(Mode == "Capfitogen"){ + # message("Making data for Capfitogen mode") + # message(FNAME) + # specs_ls <- specs_ls[[1]] + # ## create capfitogen data frame + # CapfitogenColumns <- c("INSTCODE", + # "ACCENUMB", + # "COLLNUMB", + # "COLLCODE", + # "COLLNAME", + # "COLLINSTADDRESS", + # "COLLMISSID", + # "GENUS", + # "SPECIES", + # "SPAUTHOR", + # "SUBTAXA", + # "SUBTAUTHOR", + # "CROPNAME", + # "ACCENAME", + # "ACQDATE", + # "ORIGCTY", + # "NAMECTY", + # "ADM1", + # "ADM2", + # "ADM3", + # "ADM4", + # "COLLSITE", + # "DECLATITUDE", + # "LATITUDE", + # "DECLONGITUDE", + # "LONGITUDE", + # "COORDUNCERT", + # "COORDDATUM", + # "GEOREFMETH", + # "ELEVATION", + # "COLLDATE", + # "BREDCODE", + # "BREDNAME", + # "SAMPSTAT", + # "ANCEST", + # "COLLSRC", + # "DONORCODE", + # "DONORNAME", + # "DONORNUMB", + # "OTHERNUMB", + # "DUPLSITE", + # "DUPLINSTNAME", + # "STORAGE", + # "MLSSTAT", + # "REMARKS") + # CapfitogenData <- data.frame(matrix(data = NA, + # nrow = nrow(specs_ls), + # ncol = length(CapfitogenColumns))) + # colnames(CapfitogenData) <- CapfitogenColumns + # ## Create unique rownames for the ACCENUMB + # CapfitogenData$ACCENUMB <- seq(from = 1, + # to = nrow(CapfitogenData), + # by = 1) + # ## Add in the species, latitude and longitude (nothing else at this point) + # CapfitogenData$SPECIES <- specs_ls$species + # CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] + # CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] + # specs_ls_capfitogen <- CapfitogenData + # } + # + # ### Returning Object to Disk and Environment ---- + # ifelse(Mode == "Capfitogen", + # occs = specs_ls_capfitogen, + # occs = specs_ls) + # save_ls <- list(meta = occ_meta, + # occs = occs + # # json = JSON_ls + # ) + # + # saveObj(save_ls, file = FNAME) + # unlink(occ_get) # removes .zip file + # ### JSON RO-CRATE creation ---- message("Create 
.json RO-crate (research object) metadata") JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") @@ -286,7 +286,7 @@ FUN.DownGBIF <- function( writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) close(con) - save_ls + # save_ls } # BIOCLIMATIC DATA DOWNLOAD -------------------------------------------- diff --git a/capfitogen_master.R b/capfitogen_master.R index 8abb7cd..d194681 100644 --- a/capfitogen_master.R +++ b/capfitogen_master.R @@ -79,9 +79,9 @@ if (!exists("numberOfCores")) { message(sprintf("numberOfCores = %d", numberOfCores)) # DATA ==================================================================== -message(paste("------------------------------", - " starting data download/load ", - "------------------------------", +message(paste("-----------------------------------", + " starting GBIF data download/load ", + "-----------------------------------", sep = "\n")) ## Run SHARED-Data script ------------------------------------------------- @@ -100,6 +100,10 @@ Species_ls <- FUN.DownGBIF( ) ## Environmental Data (CAPFITOGEN) -------------------------------------------- +message(paste("-------------------------------------------------------", + " starting CAPFITOGEN environmental data download/load ", + "-------------------------------------------------------", + sep = "\n")) # make a template raster to resample to template_raster <- rast(nrows = 1800, ncols = 4320, @@ -116,6 +120,10 @@ all_predictors <- FUN.DownCAPFITOGEN( names(all_predictors) ## Protected areas database --------------------------------------------------- +message(paste("----------------------------------------------", + " starting protected areas data download/load ", + "----------------------------------------------", + sep = "\n")) #' download shapefiles for protected areas to overlay with Complementa tool. #' The FUN.DownWDPA function will save the file to a folder, but not load it #' into RStudio as an object. 
@@ -158,6 +166,10 @@ crop_to_native_range <- function( # proceed with cropping if native_range_map is valid if (!is.null(native_range_map)) { + message(paste("----------------------------------------", + " cropping to native range ", + "----------------------------------------", + sep = "\n")) # attempt to load native range map safely tryCatch({ native_range_raster <- rast(native_range_map)