diff --git a/.gitignore b/.gitignore index fabc830..4a221e5 100644 --- a/.gitignore +++ b/.gitignore @@ -54,7 +54,12 @@ rsconnect/ *.png *.txt *.rds +*.tif +*.xlsx +*.zip !Data/*/GlobalAreaCRS.RData +Data/Environment/soil/ +!**/capfitogen_world_data_googledrive_links.csv Logo/* @@ -69,7 +74,10 @@ Logo/* # Job output files *.out +results/ELCmap/ +results/ + +# Downloads +Capfitogen/ -# Downloaded data -CAPFITOGEN3/ hq diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..0026a11 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "Capfitogen"] + path = Capfitogen + url = https://github.com/evalieungh/Capfitogen.git + ignore = dirty diff --git a/Capfitogen b/Capfitogen new file mode 160000 index 0000000..205ec9a --- /dev/null +++ b/Capfitogen @@ -0,0 +1 @@ +Subproject commit 205ec9a9f483fde0190d849eb14211ca4a3fa116 diff --git a/Data/Capfitogen/Placeholder.rtf b/Data/Capfitogen/Placeholder.rtf deleted file mode 100644 index 3486b7c..0000000 --- a/Data/Capfitogen/Placeholder.rtf +++ /dev/null @@ -1,8 +0,0 @@ -{\rtf1\ansi\ansicpg1252\cocoartf2759 -\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fswiss\fcharset0 Helvetica;} -{\colortbl;\red255\green255\blue255;} -{\*\expandedcolortbl;;} -\paperw11900\paperh16840\margl1440\margr1440\vieww11520\viewh8400\viewkind0 -\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 - -\f0\fs24 \cf0 Placeholder to index directory on GitHub} \ No newline at end of file diff --git a/Data/Environment/README.md b/Data/Environment/README.md new file mode 100644 index 0000000..f8a3f73 --- /dev/null +++ b/Data/Environment/README.md @@ -0,0 +1,20 @@ +# Environmental data + +downloaded with functions run in capfitogen_master.R and defined in SHARED-Data.R. + +## Bioclimatic variables (FUN.DownBV) + +A set of 19 bioclimatic variables, downloaded and processed with the KrigR package. 
+ +## Capfitogen's set of environmental variables (FUN.DownCAPFITOGEN) + +A collection of publicly available environmental data: bioclimatic, edaphic, and geophysical variables from e.g. WorldClim, SoildGrids and other sources collected in a google drive. The function downloads the data and collects it in a NetCDF (.nc) file. + +### Drafted download functions not currently in use: + +**Edaphic variables (EV)** + +Soil data downloaded from SoilGrids. Each map occupies ~ 5 GB. "SoilGrids is a system for global digital soil mapping that uses state-of-the-art machine learning methods to map the spatial distribution of soil properties across the globe. SoilGrids prediction models are fitted using over 230 000 soil profile observations from the WoSIS database and a series of environmental covariates. Covariates were selected from a pool of over 400 environmental layers from Earth observation derived products and other environmental information including climate, land cover and terrain morphology. The outputs of SoilGrids are global soil property maps at six standard depth intervals (according to the GlobalSoilMap IUSS working group and its specifications) at a spatial resolution of 250 meters. Prediction uncertainty is quantified by the lower and upper limits of a 90% prediction interval. The SoilGrids maps are publicly available under the [CC-BY 4.0 License](https://creativecommons.org/licenses/by/4.0/). Maps of the following soil properties are available: pH, soil organic carbon content, bulk density, coarse fragments content, sand content, silt content, clay content, cation exchange capacity (CEC), total nitrogen as well as soil organic carbon density and soil organic carbon stock." See [SoilGrids FAQ](https://www.isric.org/explore/soilgrids/faq-soilgrids). 
+ +**Geophysical variables (GV)** + diff --git a/Data/GBIF/Lathyrus angulatus.json b/Data/GBIF/Lathyrus angulatus.json new file mode 100644 index 0000000..eeecfa3 --- /dev/null +++ b/Data/GBIF/Lathyrus angulatus.json @@ -0,0 +1,86 @@ +{ + "@context": [ + ["https://w3id.org/ro/crate/1.1/context"] + ], + "@graph": [ + { + "@type": ["CreativeWork"], + "@id": ["ro-crate-metadata.json"], + "conformsTo": { + "@id": ["https://w3id.org/ro/crate/1.1"] + }, + "about": { + "@id": ["./"] + } + }, + { + "@id": ["./"], + "hasPart": [ + { + "@id": ["Lathyrus angulatus.RData"] + } + ], + "about": [ + { + "@id": ["https://www.gbif.org/species/5356429"] + } + ], + "@type": [ + ["Dataset"] + ], + "creator": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "author": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "license": { + "@id": ["https://creativecommons.org/licenses/by/4.0/"] + }, + "studySubject": [ + ["http://eurovoc.europa.eu/632"] + ], + "datePublished": ["2025-05-19 18:53:17"], + "name": ["Cleaned GBIF occurrence records for species Lathyrus angulatus"], + "encodingFormat": ["application/ld+json"], + "mainEntity": ["Dataset"], + "keywords": [ + ["GBIF"], + ["Occurrence"], + ["Biodiversity"], + ["Observation"], + ["Capfitogen"] + ], + "description": ["Capfitogen input data for Lathyrus angulatus"] + }, + { + "@id": ["Lathyrus angulatus.RData"], + "@type": [ + ["File"] + ], + "name": ["Lathyrus angulatus.RData"], + "contentSize": ["NA"], + "encodingFormat": ["application/RData"] + }, + { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"], + "@type": ["Person", "Organisation"], + "name": ["biodt-cwr", "Erik Kusch"] + }, + { + "@id": ["#action1"], + "@type": ["CreateAction"], + "agent": { + "@id": ["https://orcid.org/0000-0002-4984-7646", "biodt-robot@gbif.no"] + }, + "instrument": { + "@id": ["https://github.com/BioDT/uc-CWR"] + }, + "result": [ + { + "@id": ["./"] + } + ] + } + ] +} 
diff --git a/ModGP-run_exec.R b/ModGP-run_exec.R index 4b5acf0..3f608c2 100644 --- a/ModGP-run_exec.R +++ b/ModGP-run_exec.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) # Choose the number of parallel processes RUNNING_ON_LUMI <- TRUE diff --git a/ModGP-run_prep.R b/ModGP-run_prep.R index ab62e50..0dcb9cb 100644 --- a/ModGP-run_prep.R +++ b/ModGP-run_prep.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) ## API Credentials -------------------------------------------------------- try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) diff --git a/ModGP_MASTER.R b/ModGP_MASTER.R index eaf5f47..4d6d381 100644 --- a/ModGP_MASTER.R +++ b/ModGP_MASTER.R @@ -30,7 +30,11 @@ message(sprintf("SPECIES = %s", SPECIES)) Dir.Base <- getwd() Dir.Scripts <- file.path(Dir.Base, "R_scripts") +## Sourcing --------------------------------------------------------------- source(file.path(Dir.Scripts, "ModGP-commonlines.R")) +source(file.path(Dir.Scripts,"SHARED-Data.R")) +source(file.path(Dir.Scripts,"ModGP-SDM.R")) +source(file.path(Dir.Scripts,"ModGP-Outputs.R")) ## API Credentials -------------------------------------------------------- try(source(file.path(Dir.Scripts, "SHARED-APICredentials.R"))) diff --git a/README.md b/README.md index 70e91d2..1d58077 100644 --- 
a/README.md +++ b/README.md @@ -1,8 +1,18 @@ -# uc-CWR +# Use case Crop Wild Relatives (uc-CWR) + +This repository hosts code for a [Biodiversity Digital Twin](https://biodt.eu/) use case: the prototype Digital Twin for [Crop Wild Relatives](https://biodt.eu/use-cases/crop-wild-relatives). The prototype Digital Twin can be accessed through a grapical user interface made with R shiny and hosted on Lifewatch: [prototype digital twins GUI](http://app.biodt.lifewatch.eu/) + +> *"The Prototype Biodiversity Digital Twin (pDT) for Crop Wild Relatives is an advanced tool designed to aid in the identification and use of crop wild relatives (CWR) genetic resources to enhance crop resilience against climate-driven stresses"* [BioDT.eu/use-cases/crop-wild-relatives](https://biodt.eu/use-cases/crop-wild-relatives) + +For technical documentation, see a separate [markdown file](technical_documentation.md). Below we also outline quick instructions for running the ModGP and Capfitogen tools in R and on the LUMI supercomputer. The prototype Digital Twin is also presented in a 'Research ideas and outcomes' paper: [Chala et al. 2024](https://doi.org/10.3897/rio.10.e125192). The core functionality of the digital twin is ModGP (Modelling the GermPlasm of interest), but two of Capfitogen's tools have since been added to extend the prototype Digital Twin's usefulness. + +> *"MoDGP leverages species distribution modelling, relying on occurrence data of CWR to produce habitat suitability maps, establish mathematical correlations between adaptive traits, such as tolerance to drought and pathogens and environmental factors and facilitates mapping geographic areas where populations possessing genetic resources for resilience against various biotic and abiotic stresses are potentially growing."* [Chala et al. 2024](https://doi.org/10.3897/rio.10.e125192) + +--------------------------------- ## ModGP on Rstudio -1. 
Source `ModGP_MASTER.R` and change `SPECIES` argument at line 19 to execute ModGP pipeline for a specific genus. +1. Source `ModGP_MASTER.R` and change `SPECIES` argument at line 19 to execute ModGP pipeline for a specific genus. NB! ModGP should be run on a supercomputer. The environmental data download has very large interim files (>40GB per year per variable, >200 GB overall), and the distribution modelling also requires a long time to run. ## ModGP on LUMI with Hyperqueue @@ -22,20 +32,22 @@ sbatch submit_modgp_exec_lumi_HQ.sh Lathyrus -## CAPFITOGEN demo +## CAPFITOGEN + +As an addition to ModGP, you can run two of [Capfitogen](https://www.capfitogen.net/en/)'s most useful tools: [ecogeographic land characterization (ELC) maps](https://www.capfitogen.net/en/tools/elc-mapas/) and [Complementa](https://www.capfitogen.net/en/tools/complementa/) maps to visualise overlap with protected areas. +Because a lot of variables will be downloaded and processed, the total memory requirements may be too large for most personal computers. Try with a subset of the data if necessary. + +NB! After cloning this repository, you need to clone Capfitogen (a submodule) as well with `git submodule update --init`. + +Alternative ways of running the capfitogen capabilities: -See [documentation](https://www.capfitogen.net/en). +- To run our version of CAPFITOGEN in [RStudio](https://posit.co/downloads/), open `capfitogen_master.R` and execute the code, changing inputs like species name and other parameters. The script guides you through the whole process. After changing the species name, you can run the whole script as a background job if desired. -1. Download `CAPFITOGEN3.zip` from - [here](https://drive.google.com/file/d/1EJw-XcC1NRVFS7mwzlg1VpQBpRCdfWRd/view?usp=sharing) - and extract it to the project root. +- To run on LUMI (assumes access to LUMI and the project): -2. 
Download `rdatamaps/world/20x20` directory from - [here](https://drive.google.com/drive/folders/19bqG_Z3aFhzrCWQp1yWvMbsLivsCicHh) - and extract it to `CAPFITOGEN3/rdatamaps/world/20x20`. +1. Fetch the container: `singularity pull --disable-cache docker://ghcr.io/biodt/cwr:0.6.0` +2. then submit the job for a desired species (e.g. Lathyrus): -3. Run on LUMI: obtain interactive session: - `srun -p small --nodes=1 --ntasks-per-node=1 --mem=8G -t 4:00:00 --pty bash` - and execute the workflow: - `singularity run --bind $PWD cwr_0.2.0.sif capfitogen.R` + sbatch submit_capfitogen_prep_lumi.sh Lathyrus + sbatch submit_capfitogen_exec_lumi.sh Lathyrus diff --git a/R_scripts/Capfitogen_visualisation.R b/R_scripts/Capfitogen_visualisation.R new file mode 100644 index 0000000..6ecb6b3 --- /dev/null +++ b/R_scripts/Capfitogen_visualisation.R @@ -0,0 +1,97 @@ +############################################################## +#' Visualisation of Capfitogen inputs and output +#' CONTENTS: +#' - Visualisation of input data +#' - World Database on Protected Areas (WDPA) +#' - Visualisation of outputs +#' - ELC maps +#' - +#' DEPENDENCIES: +#' - capfitogen_master.R (to download and create data) +#' - ModGP-commonlines.R (packages, paths) +#' AUTHORS: [Eva Lieungh] +#' ########################################################### + +# Load dependencies ------------------------------------------ +# Define directories in relation to project directory +Dir.Base <- getwd() +Dir.Scripts <- file.path(Dir.Base, "R_scripts") + +# source packages, directories, simple functions (...) 
+source(file.path(Dir.Scripts, "ModGP-commonlines.R")) + +# VISUALISE INPUTS =========================================== + +# WDPA ------------------------------------------------------- + + + + + +# VISUALISE OUTPUTS ========================================== + +# ELC maps --------------------------------------------------- +## quick visualisation ---- +# List all the .tif files in the directory +elc_tif_outputs <- list.files(path = Dir.Results.ELCMap, + pattern = "\\.tif$", + full.names = TRUE) + +# Loop over each .tif file +for (file_path in elc_tif_outputs) { + # Read the raster file + map_i <- rast(file_path) + + # Replace NaN with NA (if they exist) + map_i[is.nan(values(map_i))] <- NA + + # Create a mask to highlight non-zero areas + non_zero_mask <- mask(map_i, !is.na(map_i)) + + # Convert to points to find non-zero values' extent + points <- as.points(non_zero_mask, na.rm = TRUE) + + # If there are any valid points, proceed with cropping + if (!is.null(points) && nrow(points) > 0) { + # Calculate extent directly from the non-empty points + coordinates <- terra::geom(points)[, c("x", "y")] + xmin = min(coordinates[,"x"]) + xmax = max(coordinates[,"x"]) + ymin = min(coordinates[,"y"]) + ymax = max(coordinates[,"y"]) + non_zero_extent <- ext(xmin, xmax, ymin, ymax) + + # Crop the raster using this extent + cropped_map <- crop(map_i, non_zero_extent) + + # Plot the cropped raster + plot(cropped_map, main = basename(file_path)) + } else { + plot(map_i, main = paste(basename(file_path), "(No non-zero values)")) + } +} + + +# Complementa --------------------------------------------------------- + +complementa_map <- rast( + file.path(Dir.Results.Complementa, + "AnalisisCeldas_CellAnalysis/Complementa_map.tif")) +plot(complementa_map) +complementa_map[is.nan(values(complementa_map))] <- NA +non_zero_mask <- mask(complementa_map, + !is.na(complementa_map)) +complementa_points <- as.points(non_zero_mask, na.rm = TRUE) +plot(complementa_points) + +map( + 'world', + 
col = "grey", + fill = TRUE, + bg = "white", + lwd = 0.05, + mar = rep(0, 4), + border = 0, + ylim = c(-80, 80) +) +points(complementa_points) \ No newline at end of file diff --git a/R_scripts/ModGP-commonlines.R b/R_scripts/ModGP-commonlines.R index 441dc67..39a2104 100644 --- a/R_scripts/ModGP-commonlines.R +++ b/R_scripts/ModGP-commonlines.R @@ -1,3 +1,7 @@ +#'########################################################################### +#' + + ## Default flags for runtime environment RUNNING_ON_LUMI <- FALSE RUNNING_ON_DESTINE <- FALSE @@ -10,46 +14,81 @@ install.load.package <- function(x) { } ### CRAN PACKAGES ---- package_vec <- c( - 'cowplot', # grid plotting - 'ggplot2', # ggplot machinery - 'ggpmisc', # table plotting in ggplot environment - 'ggpubr', # t-test comparison in ggplot - 'gridExtra', # ggplot saving in PDF - 'parallel', # parallel runs - 'pbapply', # parallel runs with estimator bar - 'raster', # spatial data - 'remotes', # remote installation - 'rgbif', # GBIF access - 'rnaturalearth', # shapefiles - 'sdm', # SDM machinery - 'sf', # spatial data - 'sp', # spatial data - 'terra', # spatial data - 'tidyr', # gather() - 'usdm', # vifcor() - 'viridis', # colour palette - 'iterators' + 'automap', # automatic interpolation (for KrigR) + 'cowplot', # grid plotting + 'curl', # downloading data + 'ggplot2', # ggplot machinery + 'ggpp', + 'ggpmisc', # table plotting in ggplot environment + 'ggpubr', # t-test comparison in ggplot + 'gridExtra', # ggplot saving in PDF + 'ncdf4', # handling NetCDF files + 'maps', # background maps + 'parallel', # parallel runs + 'pbapply', # parallel runs with estimator bar + 'raster', # spatial data ----------------------- should be replaced by terra + 'remotes', # remote installation + 'rgbif', # GBIF access + 'rnaturalearth', # shapefiles + 'rnaturalearthdata', # needed for FUN.Down.BV() + 'sdm', # SDM machinery + 'sf', # spatial data + 'sp', # spatial data + 'terra', # spatial data + 'tidyr', # gather() + 'usdm', # 
vifcor() + 'viridis', # colour palette + 'bit64', + 'iterators', + 'rvest', # to scrape google drive html + 'gdalUtilities', # to download from SoilGrids (FUN.DownEV) + + # # Capfitogen SelectVar packages + # # HJ: added here from Capfitogen SelectVar script. To do: remove unnecessary ones + # 'dismo', + # 'cluster', + # 'ade4', + # 'labdsv', + # 'mclust', + # 'clustvarsel', + # 'ranger', + + # Capfitogen ECLmapas packages + # HJ: added here from Capfitogen ECLmapas script. To do: remove unnecessary ones + 'flexmix', + 'fpc', + 'vegan', + 'adegenet' #find.clusters ELCmap.R ) + sapply(package_vec, install.load.package) ### NON-CRAN PACKAGES ---- -if(packageVersion("KrigR") < "0.9.6.1"){ # KrigR check - devtools::install_github("https://github.com/ErikKusch/KrigR", ref = "Development") +# check if KrigR is missing or outdated +if(packageVersion("KrigR") < "0.9.6.1" || + "KrigR" %in% rownames(installed.packages()) == FALSE) { + message("installing KrigR from github.com/ErikKusch/KrigR") + devtools::install_github("https://github.com/ErikKusch/KrigR", + ref = "Development") } library(KrigR) -if("mraster" %in% rownames(installed.packages()) == FALSE){ # KrigR check - remotes::install_github("babaknaimi/mraster") +if ("mraster" %in% rownames(installed.packages()) == FALSE) { + # KrigR check + remotes::install_github("babaknaimi/mraster") } library(mraster) -if(!("maxent" %in% unlist(getmethodNames()))){sdm::installAll()} # install methods for sdm package +if (!("maxent" %in% unlist(getmethodNames()))) { + sdm::installAll() +} # install methods for sdm package ## updating package_vec for handling of parallel environments package_vec <- c(package_vec, "KrigR", "mraster") ## Functionality ---------------------------------------------------------- -`%nin%` <- Negate(`%in%`) # a function for negation of %in% function +#' a function for negation of %in% function +`%nin%` <- Negate(`%in%`) #' Progress bar for data loading saveObj <- function(object, file.name){ @@ -78,8 +117,19 
@@ Dir.Data <- file.path(Dir.Base, "Data") Dir.Data.ModGP <- file.path(Dir.Data, "ModGP") Dir.Data.GBIF <- file.path(Dir.Data, "GBIF") Dir.Data.Envir <- file.path(Dir.Data, "Environment") +Dir.Data.Capfitogen <- file.path(Dir.Data, "Capfitogen") Dir.Exports <- file.path(Dir.Base, "Exports") Dir.Exports.ModGP <- file.path(Dir.Exports, "ModGP") +Dir.Exports.Capfitogen <- file.path(Dir.Exports, "Capfitogen") +Dir.R_scripts <- file.path(Dir.Base, "R_scripts") +Dir.Capfitogen <- file.path(Dir.Base, "Capfitogen") +Dir.Capfitogen.WDPA <- file.path(Dir.Capfitogen, "wdpa") +Dir.Capfitogen.ELCMap <- file.path(Dir.Capfitogen, "ELCmapas") +Dir.Capfitogen.Error <- file.path(Dir.Capfitogen, "Error") +Dir.Results <- file.path(Dir.Base, "results") +Dir.Results.Complementa <- file.path(Dir.Results, "Complementa") +Dir.Results.Complementa.Error <- file.path(Dir.Results.Complementa, "Error") + ### Create directories which aren't present yet Dirs <- grep(ls(), pattern = "Dir.", value = TRUE) CreateDir <- sapply(Dirs, function(x){ @@ -87,7 +137,4 @@ CreateDir <- sapply(Dirs, function(x){ if(!dir.exists(x)) dir.create(x)}) rm(Dirs) -## Sourcing --------------------------------------------------------------- -source(file.path(Dir.Scripts,"SHARED-Data.R")) -source(file.path(Dir.Scripts,"ModGP-SDM.R")) -source(file.path(Dir.Scripts,"ModGP-Outputs.R")) +# End of file diff --git a/R_scripts/README.md b/R_scripts/README.md new file mode 100644 index 0000000..891ce9c --- /dev/null +++ b/R_scripts/README.md @@ -0,0 +1 @@ +This directory contains R scripts to be sourced by master scripts in the repository root (i.e. one level up in the folder structure). 
\ No newline at end of file diff --git a/R_scripts/SHARED-APICredentials-template.R b/R_scripts/SHARED-APICredentials-template.R new file mode 100644 index 0000000..353cd83 --- /dev/null +++ b/R_scripts/SHARED-APICredentials-template.R @@ -0,0 +1,23 @@ +## API Credentials available for Project Partners upon request +message("API Credentials available for Project Partners upon request") + +# CDS +API_User <- "yourUserNameHere@gbif.no" # pw: yourSuperSecurePasswordHere +API_Key <- "yourSecretKeyHere" + +# Choose the number of parallel processes +RUNNING_ON_LUMI <- !is.na(strtoi(Sys.getenv("CWR_ON_LUMI"))) +if (RUNNING_ON_LUMI) { + numberOfCores <- strtoi(Sys.getenv("SLURM_NTASKS")) + if (is.na(numberOfCores)) { + numberOfCores <- 1 + } +} else { + numberOfCores <- parallel::detectCores() +} +# numberOfCores <- 1 + +# GBIF +options(gbif_user = "yourUserName") +options(gbif_email = "yourUserNameHere@gbif.no") +options(gbif_pwd = "yourGBIFpasswordHere") diff --git a/R_scripts/SHARED-Data.R b/R_scripts/SHARED-Data.R index 3a11e2f..d22a4b1 100644 --- a/R_scripts/SHARED-Data.R +++ b/R_scripts/SHARED-Data.R @@ -5,37 +5,39 @@ #' - Bioclimatic Variable Climatology creation for qsoil1 and qsoil2 combined #' DEPENDENCIES: #' - None -#' AUTHOR: [Erik Kusch] +#' AUTHORS: [Erik Kusch, Eva Lieungh] #' ####################################################################### # # GBIF DOWNLOAD FUNCTION -------------------------------------------------- -# queries download from GBIF, handles and cleans data, returns SF MULTIPOINT object and GBIF download metadata -FUN.DownGBIF <- function(species = NULL, # species name as character for whose genus data is to be downloaded - Dir = getwd(), # where to store the data - Force = FALSE, # whether the download should be forced despite local data already existing - Mode = "ModGP", # which specification to run, either for whole GENUS of supplied species (ModGP), or for species directly (Capfitogen) - parallel = 1 # an integer, 1 = sequential; 
always defaults to sequential when Mode == "Capfitogen" +#' queries download from GBIF, handles and cleans data, +#' returns SF MULTIPOINT object and GBIF download metadata +FUN.DownGBIF <- function( + species = NULL, # species name as character for whose genus data is to be downloaded + Dir = getwd(), # where to store the data + Force = FALSE, # overwrite existing data? + Mode = "ModGP", # which specification to run, either for whole GENUS of supplied species (ModGP), or for species directly (Capfitogen) + parallel = 1 # an integer, 1 = sequential; always defaults to sequential when Mode == "Capfitogen" ){ ## Preparing species name identifiers input_species <- species - - ## Focussing on Genus-part of the name if Mode is set to ModGP + + ## Focusing on Genus-part of the name if Mode is set to ModGP if(Mode == "ModGP"){ species <- strsplit(input_species, " ")[[1]][1] } - ## Filename and data presence check + ## File name and data presence check FNAME <- file.path(Dir, paste0(species, ".RData")) if(!Force & file.exists(FNAME)){ save_ls <- loadObj(FNAME) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") + message("Data has already been downloaded with these specifications. It has been loaded from the disk. 
\nIf you wish to override the present data, please specify Force = TRUE") return(save_ls) } ## Function start message("Starting GBIF data retrieval") - ## GBIF ID Query ---- - ## GBIF query + + ## GBIF taxa Query ---- if(Mode == "ModGP"){ message(paste("## Resolving", species, "at genus level")) RankGBIF <- "genus" @@ -45,7 +47,8 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g RankGBIF <- "species" } GBIF_match <- name_backbone(name = species, - rank = RankGBIF, kingdom = "plante") + rank = RankGBIF, + kingdom = "plante") ## Extracting taxonkey tax_ID <- ifelse(GBIF_match$rank != toupper(RankGBIF), NA, @@ -68,8 +71,10 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g pred("occurrenceStatus", "PRESENT"), format = "SIMPLE_CSV") curlopts <- list(http_version = 2) # needed on Mac to avoid HTTP issues in next line (see here: https://github.com/ropensci/rgbif/issues/579) - occ_meta <- occ_download_wait(occ_down, status_ping = 30, - curlopts = list(), quiet = FALSE) # wait for download to finish + occ_meta <- occ_download_wait(occ_down, + status_ping = 30, + curlopts = list(), + quiet = FALSE) # wait for download to finish occ_get <- occ_download_get(occ_down, path = Dir) # download data curlopts <- list(http_version = 1) # resetting this to not affect other functions @@ -79,14 +84,20 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g ## Manipulating GBIF Data ---- ### Resolving Common Issues ---- - message("Resolving Common Data Issues") + message("Resolving Common Data Issues. Removing occurrences") ## removing bases of record that may not be linked to coordinates properly - occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", "MATERIAL_CITATION"), ] + message("... 
that may not be linked to coordinates properly") + occ_occ <- occ_occ[occ_occ$basisOfRecord %nin% c("PRESERVED_SPECIMEN", + "MATERIAL_CITATION"), ] ## removing highly uncertain locations, i.e., anything more than 1km in uncertainty + message("... with >1km uncertainty") occ_occ <- occ_occ[occ_occ$coordinateUncertaintyInMeters <= 1000, ] ## removing rounded coordinates - occ_occ <- occ_occ[-grep(occ_occ$issue, pattern = "COORDINATE_ROUNDED"), ] + message("... with rounded coordinates") + occ_occ <- occ_occ[-grep(occ_occ$issue, + pattern = "COORDINATE_ROUNDED"), ] ## removing empty species rows + message("... with empty species rows") occ_occ <- occ_occ[occ_occ$species != "" & !is.na(occ_occ$species), ] ### Parallel Set-Up ---- @@ -98,191 +109,656 @@ FUN.DownGBIF <- function(species = NULL, # species name as character for whose g parallel <- parallel::makeCluster(parallel) on.exit(stopCluster(parallel)) print("R Objects loading to cluster") - parallel::clusterExport(parallel, varlist = c( - "package_vec", "install.load.package", - "occ_occ" - ), envir = environment()) + parallel::clusterExport(parallel, + varlist = c( + "package_vec", + "install.load.package", + "occ_occ"), + envir = environment()) print("R Packages loading on cluster") - clusterpacks <- clusterCall(parallel, function() sapply(package_vec, install.load.package)) + clusterpacks <- clusterCall(parallel, + function() sapply(package_vec, + install.load.package)) } ### Making SF for species ---- message("Extracting species-level data into MULTIPOINT objects") GBIF_specs <- unique(occ_occ$species) - ## Making a list of spatialfeatures MULTIPOINT objects denoting unique locations of presence per species - specs_ls <- pblapply(GBIF_specs, - cl = parallel, - FUN = function(x){ - spec_df <- occ_occ[occ_occ$species == x, ] - spec_uniloca <- occ_occ[occ_occ$species == x, c("species", "decimalLatitude", "decimalLongitude")] - spec_df <- spec_df[!duplicated(spec_uniloca), - c("gbifID", "datasetKey", 
"occurrenceID", "species", "scientificName", "speciesKey", - "decimalLatitude", "decimalLongitude", "coordinateUncertaintyInMeters", - "eventDate", "basisOfRecord", "recordNumber", "issue") - ] - spec_df$presence <- 1 - st_as_sf(spec_df, coords = c("decimalLongitude", "decimalLatitude")) - }) + ##' Making a list of spatialfeatures MULTIPOINT objects + ##' denoting unique locations of presence per species + specs_ls <- pblapply( + GBIF_specs, + cl = parallel, + FUN = function(x) { + ## dataframe of occurrences + spec_df <- occ_occ[occ_occ$species == x,] + ## unique locations + spec_uniloca <- occ_occ[occ_occ$species == x, + c("species", + "decimalLatitude", + "decimalLongitude")] + ## remove duplicates (of populations) + spec_df <- spec_df[!duplicated(spec_uniloca), + c("gbifID", + "datasetKey", + "occurrenceID", + "species", + "scientificName", + "speciesKey", + "decimalLatitude", + "decimalLongitude", + "coordinateUncertaintyInMeters", + "eventDate", + "basisOfRecord", + "recordNumber", + "issue" + )] + spec_df$presence <- 1 + st_as_sf(spec_df, + coords = c("decimalLongitude", + "decimalLatitude")) + } + ) names(specs_ls) <- GBIF_specs - ## Making list into single data frame when Capfitogen mode is toggled on. 
- if(Mode == "Capfitogen"){ - specs_ls <- specs_ls[[1]] - ## create capfitogen data frame - CapfitogenColumns <- c("INSTCODE", "ACCENUMB", "COLLNUMB", "COLLCODE", "COLLNAME", "COLLINSTADDRESS", "COLLMISSID", "GENUS", "SPECIES", "SPAUTHOR", "SUBTAXA", "SUBTAUTHOR", "CROPNAME", "ACCENAME", "ACQDATE", "ORIGCTY", "NAMECTY", "ADM1", "ADM2", "ADM3", "ADM4", "COLLSITE", "DECLATITUDE", "LATITUDE", "DECLONGITUDE", "LONGITUDE", "COORDUNCERT", "COORDDATUM", "GEOREFMETH", "ELEVATION", "COLLDATE", "BREDCODE", "BREDNAME", "SAMPSTAT", "ANCEST", "COLLSRC", "DONORCODE", "DONORNAME", "DONORNUMB", "OTHERNUMB", "DUPLSITE", "DUPLINSTNAME", "STORAGE", "MLSSTAT", "REMARKS") - CapfitogenData <- data.frame(matrix(data = NA, nrow = nrow(specs_ls), ncol = length(CapfitogenColumns))) - colnames(CapfitogenData) <- CapfitogenColumns - ## Create unique rownames for the ACCENUMB - CapfitogenData$ACCENUMB <- seq(from = 1, to = nrow(CapfitogenData), by = 1) - ## Add in the species, latitude and longitude (nothing else at this point) - CapfitogenData$SPECIES <- specs_ls$species - CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] - CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] - specs_ls <- CapfitogenData - } - - ### Returning Object to Disk and Environment ---- - save_ls <- list(meta = occ_meta, - occs = specs_ls - # , - # json = JSON_ls - ) - - saveObj(save_ls, file = FNAME) - unlink(occ_get) # removes .zip file - + # ## Making list into single data frame when Capfitogen mode is toggled on. 
+ # # HJ: section below to create a Capfitogen data frame not used + # # species data included as the sf file created above + # if(Mode == "Capfitogen"){ + # message("Making data for Capfitogen mode") + # message(FNAME) + # specs_ls <- specs_ls[[1]] + # ## create capfitogen data frame + # CapfitogenColumns <- c("INSTCODE", + # "ACCENUMB", + # "COLLNUMB", + # "COLLCODE", + # "COLLNAME", + # "COLLINSTADDRESS", + # "COLLMISSID", + # "GENUS", + # "SPECIES", + # "SPAUTHOR", + # "SUBTAXA", + # "SUBTAUTHOR", + # "CROPNAME", + # "ACCENAME", + # "ACQDATE", + # "ORIGCTY", + # "NAMECTY", + # "ADM1", + # "ADM2", + # "ADM3", + # "ADM4", + # "COLLSITE", + # "DECLATITUDE", + # "LATITUDE", + # "DECLONGITUDE", + # "LONGITUDE", + # "COORDUNCERT", + # "COORDDATUM", + # "GEOREFMETH", + # "ELEVATION", + # "COLLDATE", + # "BREDCODE", + # "BREDNAME", + # "SAMPSTAT", + # "ANCEST", + # "COLLSRC", + # "DONORCODE", + # "DONORNAME", + # "DONORNUMB", + # "OTHERNUMB", + # "DUPLSITE", + # "DUPLINSTNAME", + # "STORAGE", + # "MLSSTAT", + # "REMARKS") + # CapfitogenData <- data.frame(matrix(data = NA, + # nrow = nrow(specs_ls), + # ncol = length(CapfitogenColumns))) + # colnames(CapfitogenData) <- CapfitogenColumns + # ## Create unique rownames for the ACCENUMB + # CapfitogenData$ACCENUMB <- seq(from = 1, + # to = nrow(CapfitogenData), + # by = 1) + # ## Add in the species, latitude and longitude (nothing else at this point) + # CapfitogenData$SPECIES <- specs_ls$species + # CapfitogenData$DECLATITUDE <- st_coordinates(specs_ls)[,"Y"] + # CapfitogenData$DECLONGITUDE <- st_coordinates(specs_ls)[,"X"] + # specs_ls_capfitogen <- CapfitogenData + # } + # + # ### Returning Object to Disk and Environment ---- + # ifelse(Mode == "Capfitogen", + # occs = specs_ls_capfitogen, + # occs = specs_ls) + # save_ls <- list(meta = occ_meta, + # occs = occs + # # json = JSON_ls + # ) + # + # saveObj(save_ls, file = FNAME) + # unlink(occ_get) # removes .zip file + # ### JSON RO-CRATE creation ---- + message("Create 
.json RO-crate (research object) metadata") JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- paste0("https://www.gbif.org/species/", tax_ID) # gbif ID - JSON_ls$`@graph`[[2]]$creator$`@id` <- c(JSON_ls$`@graph`[[2]]$creator$`@id`, as.character(options("gbif_email"))) - JSON_ls$`@graph`[[2]]$author$`@id` <- c(JSON_ls$`@graph`[[2]]$author$`@id`, as.character(options("gbif_email"))) + JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- paste0("https://www.gbif.org/species/", + tax_ID) # gbif taxon ID + JSON_ls$`@graph`[[2]]$creator$`@id` <- c(JSON_ls$`@graph`[[2]]$creator$`@id`, + as.character(options("gbif_email"))) + JSON_ls$`@graph`[[2]]$author$`@id` <- c(JSON_ls$`@graph`[[2]]$author$`@id`, + as.character(options("gbif_email"))) JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() - JSON_ls$`@graph`[[2]]$name <- paste("Cleaned GBIF occurrence records for", RankGBIF, species) - JSON_ls$`@graph`[[2]]$keywords <- list("GBIF", "Occurrence", "Biodiversity", "Observation", Mode) - JSON_ls$`@graph`[[2]]$description <- paste(Mode, "input data for", species) + JSON_ls$`@graph`[[2]]$name <- paste("Cleaned GBIF occurrence records for", + RankGBIF, species) + JSON_ls$`@graph`[[2]]$keywords <- list("GBIF", + "Occurrence", + "Biodiversity", + "Observation", + Mode) + JSON_ls$`@graph`[[2]]$description <- paste(Mode, + "input data for", + species) JSON_ls$`@graph`[[3]]$name <- basename(FNAME) JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) JSON_ls$`@graph`[[3]]$encodingFormat <- "application/RData" JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) + + JSON_ls$`@graph`[[4]]$name <- c(as.character(options("gbif_user")), + JSON_ls$`@graph`[[4]]$name) + JSON_ls$`@graph`[[4]]$`@id` <- c(JSON_ls$`@graph`[[4]]$`@id`, + as.character(options("gbif_email"))) + JSON_ls$`@graph`[[4]]$`@type` <- c(JSON_ls$`@graph`[[4]]$`@type`, + "Organisation") - JSON_ls$`@graph`[[4]]$name <- 
c(as.character(options("gbif_user")), JSON_ls$`@graph`[[4]]$name) - JSON_ls$`@graph`[[4]]$`@id` <- c(JSON_ls$`@graph`[[4]]$`@id`, as.character(options("gbif_email"))) - JSON_ls$`@graph`[[4]]$`@type` <- c(JSON_ls$`@graph`[[4]]$`@type`, "Organisation") - - JSON_ls$`@graph`[[5]]$agent$`@id` <- c(JSON_ls$`@graph`[[5]]$agent$`@id`, as.character(options("gbif_email"))) + JSON_ls$`@graph`[[5]]$agent$`@id` <- c(JSON_ls$`@graph`[[5]]$agent$`@id`, + as.character(options("gbif_email"))) + JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://github.com/BioDT/uc-CWR" con <- file(file.path(Dir, paste0(species, ".json"))) writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) close(con) - save_ls + # save_ls } -# BIOCLIMATIC VARIABLE DOWNLOAD -------------------------------------------- -#' queries and downloads and computes bioclimatic variables at global extent from ERA5-Land, Water availability is based on soil moisture level 1 (0-7cm) and 2 (7-28cm) -FUN.DownBV <- function(T_Start = 1970, # what year to begin climatology calculation in - T_End = 2000, # what year to end climatology calculation in - Dir = getwd(), # where to store the data output on disk - Force = FALSE # do not overwrite already present data - ){ - FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) - - if(!Force & file.exists(FNAME)){ - BV_ras <- stack(FNAME) - names(BV_ras) <- paste0("BIO", 1:19) - message("Data has already been downloaded with these specifications previously. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE") - return(BV_ras) - } - - ### Raw soil moisture level data ---- - #' We download raw soil moisture data for layers 1 (0-7cm) and 2 (7-28cm) separately. 
These are then summed up and used in the bioclimatic variable computation of KrigR - FNAME_RAW <- file.path(Dir, paste(tools::file_path_sans_ext(basename(FNAME)), "volumetric_soil_water_layer_1", "RAW.nc", sep = "_")) - if(!file.exists(FNAME_RAW)) { - #### Downloading ---- - Qsoil1_ras <- CDownloadS( - Variable = "volumetric_soil_water_layer_1", # could also be total_precipitation - DataSet = "reanalysis-era5-land-monthly-means", - Type = "monthly_averaged_reanalysis", - DateStart = paste0(T_Start, "-01-01 00:00"), - DateStop = paste0(T_End, "-12-31 23:00"), - TResolution = "month", - Dir = Dir, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - FileName = "Qsoil1", - API_User = API_User, - API_Key = API_Key - ) - - Qsoil2_ras <- CDownloadS( - Variable = "volumetric_soil_water_layer_2", # could also be total_precipitation - DataSet = "reanalysis-era5-land-monthly-means", - Type = "monthly_averaged_reanalysis", - DateStart = paste0(T_Start, "-01-01 00:00"), - DateStop = paste0(T_End, "-12-31 23:00"), - TResolution = "month", - Dir = Dir, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - FileName = "Qsoil2", - API_User = API_User, - API_Key = API_Key - ) - - #### Combining ---- - QSoilCombin_ras <- Qsoil1_ras + Qsoil2_ras - - #### Saving ---- - terra::metags(QSoilCombin_ras) <- terra::metags(Qsoil1_ras) - QSoilCombin_ras <- KrigR:::Meta.NC( - NC = QSoilCombin_ras, - FName = FNAME_RAW, - Attrs = terra::metags(QSoilCombin_ras), Write = TRUE, - Compression = 9 - ) - - ### Deleting unnecessary files ---- - unlink(list.files(Dir, pattern = "Qsoil", full.names = TRUE)) - } - - ### Bioclimatic data ---- - BV_ras <- BioClim( - Temperature_Var = "2m_temperature", - Temperature_DataSet = "reanalysis-era5-land", - Temperature_Type = NA, - Water_Var = "volumetric_soil_water_layer_1", # could also be total_precipitation - Water_DataSet = "reanalysis-era5-land-monthly-means", - Water_Type = "monthly_averaged_reanalysis", - Y_start = T_Start, 
Y_end = T_End, - Extent = ne_countries(type = "countries", scale = "medium")[,1], - Dir = Dir, FileName = basename(FNAME), - FileExtension = ".nc", Compression = 9, # file storing - API_User = API_User, - API_Key = API_Key - ) - - ### JSON RO-CRATE creation ---- - JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") - - JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) - JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" - JSON_ls$`@graph`[[2]]$datePublished <- Sys.time() # tail(file.info(FNAME)$ctime) - JSON_ls$`@graph`[[2]]$name <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", "ECMWF", "Bioclimatic Variables", "Soil Moisture") - JSON_ls$`@graph`[[2]]$description <- "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." - - JSON_ls$`@graph`[[3]]$name <- basename(FNAME) - JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) - JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) - - JSON_ls$`@graph`[[5]]$instrument$`@id` <- "https://doi.org/10.1088/1748-9326/ac48b3" - - con <- file(file.path(Dir, paste0(tools::file_path_sans_ext(basename(FNAME)), ".json"))) - writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), con) - close(con) +# BIOCLIMATIC DATA DOWNLOAD -------------------------------------------- +#' queries, downloads, and computes bioclimatic variables +#' at global extent from ERA5-Land. 
Water availability is based on +#' soil moisture level 1 (0-7cm) and 2 (7-28cm) +FUN.DownBV <- function( + T_Start = 1970, # what year to begin climatology calculation in + T_End = 2000, # what year to end climatology calculation in + Dir = getwd(), # where to store the data output on disk + Force = FALSE # do not overwrite already present data +) { + # Workaround for Dir, 1/2 + # original_wd = getwd() + # setwd(Dir) + # End of workaround + + FNAME <- file.path(Dir, paste0("BV_", T_Start, "-", T_End, ".nc")) + + if (!Force & file.exists(FNAME)) { + BV_ras <- stack(FNAME) + names(BV_ras) <- paste0("BIO", 1:19) + message( + "Data has already been downloaded with these specifications. + It has been loaded from the disk. If you wish to override + the present data, please specify Force = TRUE" + ) + return(BV_ras) + } + + ### Raw soil moisture level data ---- + #' We download raw soil moisture data for + #' layers 1 (0-7cm) and 2 (7-28cm) separately. + #' These are then summed up and used in the + #' bioclimatic variable computation of KrigR + FNAME_RAW <- + file.path( + Dir, + paste( + tools::file_path_sans_ext(basename(FNAME)), + "volumetric_soil_water_layer_1", + "RAW.nc", + sep = "_" + ) + ) + if (!file.exists(FNAME_RAW)) { + #### Downloading ---- + Qsoil1_ras <- CDownloadS( + Variable = "volumetric_soil_water_layer_1", # could also be total_precipitation + DataSet = "reanalysis-era5-land-monthly-means", + Type = "monthly_averaged_reanalysis", + DateStart = paste0(T_Start, "-01-01 00:00"), + DateStop = paste0(T_End, "-12-31 23:00"), + TResolution = "month", + Dir = Dir, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + FileName = "Qsoil1", + API_User = API_User, + API_Key = API_Key + ) + + Qsoil2_ras <- CDownloadS( + Variable = "volumetric_soil_water_layer_2", + # could also be total_precipitation + DataSet = "reanalysis-era5-land-monthly-means", + Type = "monthly_averaged_reanalysis", + DateStart = paste0(T_Start, "-01-01 00:00"), + DateStop = 
paste0(T_End, "-12-31 23:00"), + TResolution = "month", + Dir = Dir, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + FileName = "Qsoil2", + API_User = API_User, + API_Key = API_Key + ) + + #### Combining ---- + QSoilCombin_ras <- Qsoil1_ras + Qsoil2_ras + + #### Saving ---- + terra::metags(QSoilCombin_ras) <- terra::metags(Qsoil1_ras) + QSoilCombin_ras <- KrigR:::Meta.NC( + NC = QSoilCombin_ras, + FName = FNAME_RAW, + Attrs = terra::metags(QSoilCombin_ras), + Write = TRUE, + Compression = 9 + ) + + ### Deleting unnecessary files ---- + unlink(list.files(Dir, pattern = "Qsoil", full.names = TRUE)) + } + + ### Bioclimatic data ---- + BV_ras <- BioClim( + Temperature_Var = "2m_temperature", + Temperature_DataSet = "reanalysis-era5-land", + Temperature_Type = NA, + Water_Var = "volumetric_soil_water_layer_1", # could also be total_precipitation + Water_DataSet = "reanalysis-era5-land-monthly-means", + Water_Type = "monthly_averaged_reanalysis", + Y_start = T_Start, + Y_end = T_End, + Extent = ne_countries(type = "countries", scale = "medium")[, 1], + Dir = Dir, + FileName = basename(FNAME), + FileExtension = ".nc", + Compression = 9, + API_User = API_User, + API_Key = API_Key + ) + + ### JSON RO-CRATE creation ---- + JSON_ls <- jsonlite::read_json("ro-crate-metadata.json") + + JSON_ls$`@graph`[[2]]$hasPart[[1]]$`@id` <- basename(FNAME) + JSON_ls$`@graph`[[2]]$about[[1]]$`@id` <- + "https://cds.climate.copernicus.eu/cdsapp#!/dataset/reanalysis-era5-land" + JSON_ls$`@graph`[[2]]$datePublished <- + Sys.time() # tail(file.info(FNAME)$ctime) + JSON_ls$`@graph`[[2]]$name <- + "Bioclimatic data obtained from ERA5-Land. Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + JSON_ls$`@graph`[[2]]$keywords <- list("ERA5-Land", + "ECMWF", + "Bioclimatic Variables", + "Soil Moisture") + JSON_ls$`@graph`[[2]]$description <- + "Bioclimatic data obtained from ERA5-Land. 
Water avialbility is denoted via the sum of soil moisture layer 1 and 2." + + JSON_ls$`@graph`[[3]]$name <- basename(FNAME) + JSON_ls$`@graph`[[3]]$contentSize <- file.size(FNAME) + JSON_ls$`@graph`[[3]]$`@id` <- basename(FNAME) + + JSON_ls$`@graph`[[5]]$instrument$`@id` <- + "https://doi.org/10.1088/1748-9326/ac48b3" + + con <- file(file.path(Dir, + paste0( + tools::file_path_sans_ext(basename(FNAME)), + ".json" + ))) + writeLines(jsonlite::toJSON(JSON_ls, pretty = TRUE), + con) + close(con) + + message( + paste0( + "ERA5 citation:\nCopernicus Climate Change Service, Climate Data Store, (2024): ERA5-land post-processed daily-statistics from 1950 to present. Copernicus Climate Change Service (C3S) Climate Data Store (CDS), DOI: 10.24381/cds.e9c9c792 ", + "(Accessed on ", Sys.Date(),")") + ) + + # Workaround for Dir, 2/2 + # setwd(original_wd) + # End of workaround + + BV_ras +} + +# CAPFITOGEN DATA DOWNLOAD ---------------------------------------------------- +## Define a function for downloading data from google drives ---- +download_from_google_drive <- function( + file_id, + dest_path) { + require(curl) + + # Temporary file to capture cookies + tmp_cookie <- tempfile() + + # First request to get the "confirm" token (if needed) + url1 <- paste0("https://drive.google.com/uc?export=download&id=", file_id) + + h <- curl::new_handle() + curl::handle_setopt(h, cookiefile = tmp_cookie, cookiejar = tmp_cookie) + + con <- curl::curl(url1, "rb", handle = h) + page <- readLines(con, warn = FALSE) + close(con) + + # Look for confirm token + confirm <- regmatches(page, regexpr("confirm=([0-9A-Za-z_]+)", page)) + confirm <- sub("confirm=", "", confirm) + + if (length(confirm) > 0) { + # If token found, make second request with confirmation + url2 <- paste0("https://drive.google.com/uc?export=download&confirm=", confirm, "&id=", file_id) + + curl_download(url2, destfile = dest_path, handle = h, mode = "wb") + } else { + # If no token, download directly + curl_download(url1, 
destfile = dest_path, handle = h, mode = "wb") + } - BV_ras + unlink(tmp_cookie) } + +## Define download function for the standard data from CAPFITOGEN (world) ---- +FUN.DownCAPFITOGEN <- + function(Dir = getwd(), + Force = FALSE, + resample_to_match = NULL) { + # define a file name + FNAME <- file.path("Data/Environment/capfitogen.nc") + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + capfitogen_rasters <- rast(FNAME) + message( + paste( + FNAME, + "exists already. It has been loaded from the disk. + If you wish to override the present data, please specify Force = TRUE" + ) + ) + return(capfitogen_rasters) + } + + # if Force=TRUE or the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + ### download data from Capfitogen Google drive ---- + # https://drive.google.com/drive/folders/1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u + message("Start downloading 10x10 data from Capfitogen google drive.") + + # scrape Capfitogen's google drive to get direct download links (rdatamaps/world/10x10) + folder_id <- "1Xxt1ztTkLITAUbTyJzjePs2CpfETwF_u" + embedded_url <- paste0( + "https://drive.google.com/embeddedfolderview?id=", + folder_id, "#list" + ) + + # Scrape the page + page <- read_html(embedded_url) + + # Extract tags (these contain filenames + links) + file_links <- page %>% + html_nodes("a") %>% + html_attr("href") + file_names <- page %>% + html_nodes("a") %>% + html_text() + + tif_files <- data.frame(name = file_names, link = file_links) + + # Extract file ID from link + tif_files$file_id <- substr(tif_files$link, + start = 33, stop = 65 + ) + tif_files$download_url <- paste0( + "https://drive.google.com/uc?export=download&id=", tif_files$file_id) + + # create directory to store tiff files + dir.create( + path = paste0(Dir, "/capfitogen"), + showWarnings = FALSE + ) + + for (i in 1:nrow(tif_files)) { + file_name <- tif_files$name[i] + file_id <- tif_files$file_id[i] + message("Downloading: ", file_name) + + 
dest_path <- file.path(Dir, "capfitogen", file_name) + + if (!file.exists(dest_path)) { + download_from_google_drive(file_id, dest_path) + } + } + + # list the downloaded files + file_list <- list.files(paste0(Dir.Data.Envir, "/capfitogen")) + + ### read in and format rasters one by one from file name ---- + rasters <- NULL + for (i in 1:length(file_list)) { # length(file_list) + file_path_i <- file.path(Dir.Data.Envir, "capfitogen", file_list[i]) + raster_i <- rast(file_path_i) + # rename raster + names(raster_i) <- tif_files$name[i] + + ### resample ---- + # if provided, resample to match another raster object's origin and resolution + if (is.null(resample_to_match)) { + message("No resampling.") + } else if (inherits(resample_to_match, "SpatRaster")) { + # read in the raster to use as template to match + resample_to_match <- rast(resample_to_match) + + # project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + + terra::crs(raster_i) <- projection_to_match + + # resample + message(paste( + "Resampling", names(raster_i), + "to match", names(resample_to_match) + )) + + tryCatch(raster_i <- terra::resample(raster_i, resample_to_match)) + } else { + stop("Invalid input for resample_to_match. Must be a SpatRaster or NULL.") + } + + message(paste("adding raster", names(raster_i))) + rasters <- c(rasters, raster_i) + } + + # convert list of rasters to a stack + rasters <- rast(rasters) + + # remove .tif extension from raster name + names(rasters) <- sub("\\.tif$", "", names(rasters)) + + ## save rasters ---- + message(paste0("saving as netCDF:", FNAME)) + terra::writeCDF(rasters, + varname = names(rasters), + filename = FNAME, + overwrite = TRUE) + return(rasters) + } + } + +# WORLD DATABASE ON PROTECTED AREAS --------------------------------------- +#' UNEP-WCMC and IUCN (2025), Protected Planet: +#' The World Database on Protected Areas (WDPA) [Online], +#' Cambridge, UK: UNEP-WCMC and IUCN. 
Available at: www.protectedplanet.net. +#' https://www.protectedplanet.net/en/thematic-areas/wdpa +FUN.DownWDPA <- function(wdpa_url = FALSE, + wdpa_destination = file.path(Dir.Capfitogen.WDPA, "WDPA_Feb2025_Public_shp.zip"), + Force = FALSE) { + # get the current month and year (from system time) + MmmYYYY <- format(Sys.Date(), "%b%Y") + + # set a path to wdpa shapefiles + wdpa_path <- file.path(Dir.Capfitogen.WDPA, "wdpa") + + # define the file name of global wdpa shapefile to be created + FNAME <- file.path(wdpa_path, "global_wdpa_polygons.gpkg") + + # check if the final wdpa file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + message(paste0("A global wdpa file with polygons exists already: ", FNAME)) + } else { + if (!file.exists(wdpa_destination)) { + # check if a wdpa url is provided + if (wdpa_url == FALSE) { + message("please provide a valid url for download. + Download links change monthly, and follow the format + https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_Apr2025_Public_shp.zip. 
+ See https://www.protectedplanet.net/en/thematic-areas/wdpa?tab=WDPA") + } else { + # start download + message("downloading zipped WDPA shapefiles, ca 4GB") + # set long timeout to avoid interrupting download + options(timeout = 10000) + # download the zipped files + download.file( + url = wdpa_url, + destfile = wdpa_destination, + cacheOK = FALSE + ) + } + + # unzip files + message(paste("unzipping WDPA shapefiles to", Dir.Capfitogen.WDPA)) + unzip(zipfile = wdpa_destination, exdir = Dir.Capfitogen.WDPA) + + # unzip split shapefile downloads + message("unzipping shapefiles split in download") + + shapefile_names <- c( + paste0("WDPA_", MmmYYYY, "_Public_shp-points.cpg"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.dbf"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.prj"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.shp"), + paste0("WDPA_", MmmYYYY, "_Public_shp-points.shx"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.cpg"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.dbf"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.prj"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.shp"), + paste0("WDPA_", MmmYYYY, "_Public_shp-polygons.shx") + ) + + shapefile_paths <- file.path(wdpa_path, shapefile_names) + + # loop over zip directories with parts of the global data (numbered 0, 1, 2) + for (i in 0:2) { + # define name of the current directory to be unzipped + zipfilename <- + file.path( + Dir.Capfitogen.WDPA, + paste0("WDPA_", MmmYYYY, "_Public_shp_", i, ".zip") + ) + + # unzip the directory containing shapefiles + unzip(zipfile = zipfilename, exdir = wdpa_path) + message(paste0("unzipped ", zipfilename, "\nto ", wdpa_path)) + + # rename shapefiles with numbers to prevent overwriting them + new_shapefile_names <- file.path(wdpa_path, paste0(i, "_", shapefile_names)) + file.rename(from = shapefile_paths, to = new_shapefile_names) + } + + # delete unnecessary files + message("deleting redundant files (translations etc.)") + files_to_keep <- c( + 
wdpa_path, + wdpa_destination, + file.path( + Dir.Capfitogen.WDPA, + paste0("WDPA_sources_", MmmYYYY, ".csv") + ) + ) + + files_to_delete <- + list.files(Dir.Capfitogen.WDPA, + full.names = TRUE + )[list.files(Dir.Capfitogen.WDPA, + full.names = TRUE + ) %nin% files_to_keep] + file.remove(files_to_delete, recursive = TRUE) + + # prepare list of shapefiles to be combined + wdpa_polygon_shapefiles <- + # list polygon shapefiles in WDPA directory + substr( + unique(sub( + "\\..*", "", + list.files(wdpa_path)[grep( + pattern = "polygon", + x = shapefile_names + )] + )), + 3, 34 + ) + + shapefile_list <- list() + + for (i in 0:2) { + # read in all the polygon shapefile layers + layer_name <- paste0(i, "_", wdpa_polygon_shapefiles) + shapefile_list[[i + 1]] <- + read_sf(dsn = wdpa_path, layer = layer_name) + } + + # merge parts into one global shapefile + message("combining parts of the WDPA shapefile. This can take a while ---") + wdpa <- do.call(rbind, shapefile_list) + message("Complete WDPA successfully combined.") + + # wdpa$WDPAID <- as.character(wdpa$WDPAID) + # wdpa$text_field <- iconv(wdpa$text_field, to = "ASCII//TRANSLIT") + + # save complete wdpa + message("save as GeoPackage") + st_write(wdpa, file.path(wdpa_path, "global_wdpa_polygons.gpkg")) + message(paste0( + "global WDPA saved as: ", + file.path(wdpa_path, "global_wdpa_polygons.gpkg") + )) + + message("save as shapefile") + # st_write(wdpa, FNAME) + st_write( + wdpa, + "global_wdpa_polygons.shp", + layer_options = "ENCODING=UTF-8", + field_type = c(WDPAID = "Character") + ) + message("global WDPA saved as global_wdpa_polygons.shp") + } + } +} + +# end diff --git a/R_scripts/legacy-data-download.R b/R_scripts/legacy-data-download.R new file mode 100644 index 0000000..053a6e2 --- /dev/null +++ b/R_scripts/legacy-data-download.R @@ -0,0 +1,343 @@ +# Legacy code for downloading custom edaphic (soil) and geophysical data + +# by Eva Lieungh + +# this scripts defines functions in the same style as in 
SHARED-Data.R. +# Example use: +# +# ### Edaphic data ------ +# ## NB! each file at 250x250m is ~20GB... +# message("Downloading new or loading existing edaphic/soil variables") +# +# edaphic_variables <- FUN.DownEV( +# Dir = Dir.Data.Envir, +# target_resolution = c(250, 250), +# Force = FALSE, +# resample_to_match = bioclim_variables[[1]] +# ) +# +# ### Geophysical data ------ +# message("Downloading new or loading existing geophysical variables") +# geophysical_variables <- FUN.DownGV( +# Dir = Dir.Data.Envir, +# Force = FALSE, +# resample_to_match = bioclim_variables[[1]] +# ) + + +# EDAPHIC DATA DOWNLOAD ------------------------------------------------------- +# NB! Works for some variables, but the data set is incomplete. +# This function should be modified to add or replace data for capfitogen. +FUN.DownEV <- + function(Dir = getwd(), # where to store the data output on disk + target_resolution = c(250, 250), + Force = FALSE, # do not overwrite already present data, + resample_to_match = FALSE) { + # define a file name + FNAME <- file.path(Dir, "edaphic.nc") + message(FNAME) + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + EV_ras <- rast(FNAME) + message( + "Data has already been downloaded. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + ) + return(EV_ras) + } + + # if Force=TRUE or the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + ## download data from SoilGrids ---- + message("Start downloading data from SoilGrids: files.isric.org/soilgrids/latest/data/") + soilGrids_url = "/vsicurl?max_retry=3&retry_delay=1&list_dir=no&url=https://files.isric.org/soilgrids/latest/data/" + + #' overview of datasets: https://www.isric.org/explore/soilgrids/faq-soilgrids#What_do_the_filename_codes_mean + #' NB! Each global map occupies circa 20 GB for 250x20m resolution! + #' It takes a while to download. 
+ #' In addition, https://files.isric.org/soilgrids/latest/data/wrb/ + #' has maps of soil types, as estimated probability of occurrence per type. + #' MostProbable.vrt has the most probable soil type per gridcell. + #' Soil salinity: https://data.isric.org/geonetwork/srv/eng/catalog.search#/metadata/c59d0162-a258-4210-af80-777d7929c512 + + SoilGrids_variables_in <- + c("bdod/bdod_0-5cm_mean", # Bulk density of the fine earth fraction, cg/cm³ + "cec/cec_0-5cm_mean", # Cation Exchange Capacity of the soil, mmol(c)/kg + "cfvo/cfvo_0-5cm_mean", # Volumetric fraction of coarse fragments (> 2 mm) cm3/dm3 (vol‰) + "silt/silt_0-5cm_mean", # Proportion of silt particles (≥ 0.002 mm and ≤ 0.05/0.063 mm) in the fine earth fraction g/kg + "clay/clay_0-5cm_mean", # Proportion of clay particles (< 0.002 mm) in the fine earth fraction g/kg + "sand/sand_0-5cm_mean", # Proportion of sand particles (> 0.05/0.063 mm) in the fine earth fraction g/kg + "nitrogen/nitrogen_0-5cm_mean", # Total nitrogen (N) cg/kg + "phh2o/phh2o_0-5cm_mean", # Soil pH pHx10 + "ocd/ocd_0-5cm_mean", # Organic carbon density hg/m³ + "ocs/ocs_0-30cm_mean", # Organic carbon stocks t/ha + "soc/soc_0-5cm_mean") # Soil organic carbon content in the fine earth fraction dg/kg + + SoilGrids_variables <- sub(".*/", "", SoilGrids_variables_in) + + soilGrids_data <- NULL + + for (i in 1:length(SoilGrids_variables_in)) { + variable_name = SoilGrids_variables[i] + + message(SoilGrids_variables[i]) + + path_to_downloaded_file <- paste0(Dir.Data.Envir, "/", + SoilGrids_variables[i], ".tif") + + # if variable is not downloaded already, ... + ifelse( + !file.exists(path_to_downloaded_file), + # download it, ... 
+ downloaded_variable <- gdalUtilities::gdal_translate( + src_dataset = paste0(soilGrids_url, + SoilGrids_variables_in[i], ".vrt"), + dst_dataset = path_to_downloaded_file, + tr = target_resolution # target resolution + ), + # or else load it from file + downloaded_variable <- path_to_downloaded_file + ) + + ## load variable as raster + downloaded_raster <- rast(downloaded_variable) + plot(downloaded_raster, main = SoilGrids_variables[i]) + + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(downloaded_raster) <- projection_to_match + + ## resample + downloaded_raster <- + terra::resample(downloaded_raster, + resample_to_match) + } + + soilGrids_data <- c(soilGrids_data, downloaded_raster) + + } + + ## HSWD downloads ---- + ## download additional rasters from HSWD + message("Downloading data from HSWD (harmonised world soil database) via fao.org") + + path_to_PH_nutrient = file.path(Dir, "HSWD_PH_nutrient.tif") + if (!file.exists(path_to_PH_nutrient)) { + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq1.asc", + destfile = path_to_PH_nutrient) + } + PH_nutrient <- rast(path_to_PH_nutrient) + + path_to_PH_toxicity = file.path(Dir, "HSWD_PH_toxicity.tif") + if (missing(path_to_PH_toxicity)) { + message("downloading HSWD PH toxicity") + download.file(url = "https://www.fao.org/fileadmin/user_upload/soils/docs/HWSD/Soil_Quality_data/sq6.asc", + destfile = path_to_PH_toxicity) + } + PH_toxicity <- rast(path_to_PH_toxicity) + + ### resample ---- + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + 
message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(PH_nutrient) <- projection_to_match + terra::crs(PH_toxicity) <- projection_to_match + + ## resample + PH_nutrient <- + terra::resample(PH_nutrient, + resample_to_match) + + PH_toxicity <- + terra::resample(PH_toxicity, + resample_to_match) + + } + + ### combine and rename rasters ---- + EV_rasters <- rast(c(soilGrids_data, + PH_nutrient, PH_toxicity)) + + names(EV_rasters) <- c(SoilGrids_variables, + "Nutrient", "Toxicity") + + ### Saving ---- + message(paste0("saving as netCDF:", FNAME)) + terra::writeCDF(EV_rasters, + filename = FNAME, + overwrite = FALSE) + + EV_rasters + + } + } + +# GEOPHYSICAL DATA DOWNLOAD -------------------------------------------------- +#' define a function to load existing data or download them to +#' the given directory. +FUN.DownGV <- + function(Dir = getwd(),# where to store the data output on disk + Force = FALSE,# do not overwrite already present data, + resample_to_match = FALSE) { + # define a file name + FNAME <- file.path(Dir, "geophysical.nc") + + # check if file already exists and whether to overwrite + if (!Force & file.exists(FNAME)) { + GV_raster <- rast(FNAME) + names(GV_raster) <- c("elevation", + "mean_wind_speed_of_windiest_month") + message( + "Data has already been downloaded with these specifications. It has been loaded from the disk. If you wish to override the present data, please specify Force = TRUE" + ) + + return(GV_raster) + + } + + # if the file doesn't already exist: + if (Force | !file.exists(FNAME)) { + message("downloading or loading geophysical data") + + ## Download digital elevation model (DEM) ------ + ##' Fick, S.E. and R.J. Hijmans, 2017. 
WorldClim 2 + ##' https://doi.org/10.1002/joc.5086 + message("- digital elevation model") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_elev.tif"))) { + worldclim_dem_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_elev.zip" + temp <- tempfile() + download.file(worldclim_dem_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + } + dem <- rast(paste0(Dir, "/wc2.1_2.5m_elev.tif")) + names(dem) <- "Elevacion" + + ## Download wind speed ------ + ##' WorldClim 2 + message("- mean wind speed of windiest month (annual max of monthly means)") + if (!file.exists(paste0(Dir, "/wc2.1_2.5m_wind_max.tif"))) { + worldclim_wind_url = "https://geodata.ucdavis.edu/climate/worldclim/2_1/base/wc2.1_2.5m_wind.zip" + temp <- tempfile() + download.file(worldclim_wind_url, temp) + unzip(zipfile = temp, + exdir = Dir) + unlink(temp) + + ## read monthly wind speed and find annual max of monthly means + month_numbers = c(paste0("0", 2:9), as.character(10:12)) + wind_stack <- rast(paste0(Dir, "/wc2.1_2.5m_wind_01.tif")) + for (i in month_numbers) { + raster_i = rast(paste0(Dir, "/wc2.1_2.5m_wind_", i, ".tif")) + wind_stack <- c(wind_stack, raster_i) + } + max_wind <- terra::app(wind_stack, max) + writeRaster(max_wind, + filename = paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + } + wind <- rast(paste0(Dir, "/wc2.1_2.5m_wind_max.tif")) + names(wind) <- "mean_wind_speed_of_windiest_month" + + ## Resample ------ + ## if provided, resample to match another raster object's origin and resolution + if (!missing(resample_to_match)) { + message(paste0("resampling raster to match ", names(resample_to_match))) + resample_to_match <- rast(resample_to_match) + + ## project downloaded rasters to match resample_to_match file + projection_to_match <- terra::crs(resample_to_match) + terra::crs(dem) <- projection_to_match + terra::crs(wind) <- projection_to_match + message("projected to match input") + + ## resample + dem <- terra::resample(dem, + resample_to_match) + 
message("dem successfully resampled") + + wind <- terra::resample(wind, + resample_to_match) + message("wind successfully resampled") + } + + ### combine rasters + geophysical_rasters <- c(dem, wind) + + ## Saving ---- + message("saving as NetCDF") + terra::writeCDF(geophysical_rasters, + filename = FNAME, + overwrite = TRUE) + + geophysical_rasters + } + } + +# WGS84 = EPSG:4326 +## Download digital elevation model (DEM) from +##' Jarvis A., H.I. Reuter, A. Nelson, E. Guevara, 2008, Hole-filled +##' seamless SRTM data V4, International Centre for Tropical Agriculture +##' (CIAT), available from http://srtm.csi.cgiar.org. +#dem <- rast("https://srtm.csi.cgiar.org/wp-content/uploads/files/250m/tiles250m.jpg") + + +# Bioclim data from ModGP, cut out from capfitogen_master.R ------------------- +### Bioclomatic data ------ +##' 19 BioClim variables +##' FUN.DownBV uses KrigR to download ERA5 data from Climate Data Store (CDS) +##' is each file of each variable >20GB? +##' Will this download Global Multi-resolution Terrain Elevation Data +##' (GMTED2010) as well? +##' Temporal coverage: January 1950 to present ? +##' https://cds.climate.copernicus.eu/datasets/derived-era5-land-daily-statistics?tab=overview +message("Downloading new or loading existing 19 BioClim bioclimatic variables") + +# Check for existing BioClim data file +existing_bioclim_file <- file.path(Dir.Data.Envir, "BV_1985-2015.nc") +if (file.exists(existing_bioclim_file)) { + message("Using existing BioClim data") + bioclim_variables <- terra::rast(existing_bioclim_file) +} else { + bioclim_variables <- FUN.DownBV( + T_Start = 1999, # what year to begin climatology calculation? + T_End = 1999, # what year to end climatology calculation? 
+ Dir = Dir.Data.Envir, # where to store the data output on disk + Force = FALSE # do not overwrite already present data + ) +} + +# make sure the BioClim data have the correct names +BioClim_names <- c( + # see https://www.worldclim.org/data/bioclim.html + "BIO1_Annual_Mean_Temperature", + "BIO2_Mean_Diurnal_Range", + "BIO3_Isothermality", + "BIO4_Temperature_Seasonality", + "BIO5_Max_Temperature_of_Warmest_Month", + "BIO6_Min_Temperature_of_Coldest_Month", + "BIO7_Temperature_Annual_Range", + "BIO8_Mean_Temperature_of_Wettest_Quarter", + "BIO9_Mean_Temperature_of_Driest_Quarter", + "BIO10_Mean_Temperature_of_Warmest_Quarter", + "BIO11_Mean_Temperature_of_Coldest_Quarter", + "BIO12_Annual_Precipitation", + "BIO13_Precipitation_of_Wettest_Month", + "BIO14_Precipitation_of_Driest_Month", + "BIO15_Precipitation_Seasonality", + "BIO16_Precipitation_of_Wettest_Quarter", + "BIO17_Precipitation_of_Driest_Quarter", + "BIO18_Precipitation_of_Warmest_Quarter", + "BIO19_Precipitation_of_Coldest_Quarter") +names(bioclim_variables) <- BioClim_names + diff --git a/capfitogen.R b/capfitogen.R deleted file mode 100644 index 3a9a83f..0000000 --- a/capfitogen.R +++ /dev/null @@ -1,62 +0,0 @@ -# Main input file (pasaporte): -# LathyrusData-ForCapfitogen_27oct2023.txt (by Carrie) -# Filter only one species for testing: -# head -n 1 LathyrusData-ForCapfitogen_27oct2023.txt > LathyrusData-ForCapfitogen_27oct2023_niger_only.txt -# grep "Lathyrus niger" LathyrusData-ForCapfitogen_27oct2023.txt >> LathyrusData-ForCapfitogen_27oct2023_niger_only.txt - -# Global options -pasaporte_file <- "LathyrusData-ForCapfitogen_27oct2023_niger_only.txt" -country <- "World" -# resolution <- "Celdas 1x1 km aprox (30 arc-seg)" -# resolution <- "Celdas 5x5 km aprox (2.5 arc-min)" -resolution <- "celdas 20x20 km aprox (10 arc-min)" - -# Paths -results_dpath <- file.path(getwd(), "Resultados") -root_dpath <- file.path(getwd(), "CAPFITOGEN3") -param_dpath <- file.path(root_dpath, "scripts", "Parameters 
scripts (English)") -tools_dpath <- file.path(root_dpath, "scripts", "Tools Herramientas") - -dir.create(results_dpath) - -# We execute SelecVar and ELCMapas modules in order -# The structure of each module execution is: -# - execute the corresponding parameters file for default settings -# - override relevant settings (ruta etc.) using variables defined above -# - execute the correspoding analysis script (unless done already) -# Note! Scripts write to a common log file: CAPFITOGEN3/Error/process_info.txt - -#### SelecVar ############################# -message("SelecVar") -source(file.path(param_dpath, "Parameters_SelecVar_2021.R")) -file.copy(file.path(getwd(), pasaporte_file), file.path(root_dpath, "Pasaporte", pasaporte_file), overwrite=TRUE) -ruta <- root_dpath -pasaporte <- pasaporte_file -geoqual <- FALSE -pais <- country -resol1 <- resolution -resultados <- file.path(results_dpath, "SelecVar") -dir.create(resultados) -if (file.exists(file.path(resultados, "SelectedVariables_edaphic.xls"))) { - message("- skipping") -} else { - message("- executing") - source(file.path(tools_dpath, "SelectVar.R")) - # Prevent crosstalk with the next step - rm(geophys) -} - -#### ELCmapas ############################# -message("ELCmapas") -source(file.path(param_dpath, "Parameters_ELCmapas_2021.R")) -ruta <- root_dpath -pais <- country -resol1 <- resolution -resultados <- file.path(results_dpath, "ELCmapas") -dir.create(resultados) -if (file.exists(file.path(resultados, "Producto.RData"))) { - message("- skipping") -} else { - message("- executing") - source(file.path(tools_dpath, "ELCmapas.R")) -} diff --git a/capfitogen_master.R b/capfitogen_master.R new file mode 100644 index 0000000..d194681 --- /dev/null +++ b/capfitogen_master.R @@ -0,0 +1,448 @@ +#' ####################################################################### # +#' PROJECT: [BioDT CWR - Capfitogen] +#' CONTENTS: +#' - Loading/installing packages +#' - Execution of data pipeline +#' - Species occurrence 
download from GBIF +#' - Environmental data load/download +#' - Data formatting and parameter definition for CAPFITOGEN +#' - Execution of CAPFITOGEN tools +#' - 'ELC maps' +#' - 'Complementa' +#' - Visualisation of outputs +#' DEPENDENCIES: +#' - Capfitogen submodule (from https://github.com/evalieungh/Capfitogen) +#' - R_Scripts directory containing: +#' - "ModGP-commonlines.R" +#' - "SHARED-APICredentials.R" -- NB! internal to project, ask for access +#' - "SHARED-Data.R" +#' AUTHORS: [Eva Lieungh, Erik Kusch, Heli Juottonen, Desalegn Chala] +#' Capfitogen credit: Parra-Quijano et al. 2021, +#' https://repositorio.unal.edu.co/handle/unal/85787 +#' ####################################################################### # + +# PREAMBLE ================================================================ +set.seed(42) # making things reproducibly random +rm(list=ls()) # clean environment +gc() + +# Read species from command-line argument +args = commandArgs(trailingOnly=TRUE) +if (length(args)==0) { + # Default species + SPECIES <- "Lathyrus angulatus" +} else { + SPECIES <- args[1] +} +message(sprintf("SPECIES = %s", SPECIES)) + +# Define directories in relation to project directory +Dir.Base <- getwd() +Dir.Scripts <- file.path(Dir.Base, "R_scripts") + +# source packages, directories, simple functions (...) 
+source(file.path(Dir.Scripts, "ModGP-commonlines.R")) + +## API Credentials -------------------------------------------------------- +# set API credentials for access to climate data store (CDS) +try(source(file.path(Dir.R_scripts, "SHARED-APICredentials.R"))) +if (as.character(options("gbif_user")) == "NULL") { + options(gbif_user = rstudioapi::askForPassword("my gbif username")) +} +if (as.character(options("gbif_email")) == "NULL") { + options(gbif_email = rstudioapi::askForPassword("my registred gbif e-mail")) +} +if (as.character(options("gbif_pwd")) == "NULL") { + options(gbif_pwd = rstudioapi::askForPassword("my gbif password")) +} + +if (!exists("API_Key") | + !exists("API_User")) { + # CS API check: if CDS API credentials have not been specified elsewhere + API_User <- + readline(prompt = "Please enter your Climate Data Store API user number and hit ENTER.") + API_Key <- + readline(prompt = "Please enter your Climate Data Store API key number and hit ENTER.") +} # end of CDS API check + +## NUMBER OF CORES +if (!exists("numberOfCores")) { + # Core check: if number of cores for parallel processing has not been set yet + numberOfCores <- + as.numeric(readline( + prompt = paste( + "How many cores do you want to allocate to these processes? 
Your machine has", + parallel::detectCores() + ) + )) +} # end of Core check +message(sprintf("numberOfCores = %d", numberOfCores)) + +# DATA ==================================================================== +message(paste("-----------------------------------", + " starting GBIF data download/load ", + "-----------------------------------", + sep = "\n")) + +## Run SHARED-Data script ------------------------------------------------- +## defines functions used to download data +source(file.path(Dir.R_scripts, "SHARED-Data.R")) + +## GBIF Data -------------------------------------------------------------- +message("Downloading new or loading existing GBIF data") +## species of interest +Species_ls <- FUN.DownGBIF( + species = SPECIES, # which species to pull data for + Dir = Dir.Data.GBIF, # where to store the data output on disk + Force = FALSE, # overwrite (TRUE) already present data or not (FALSE) + Mode = "Capfitogen", # query download for one species + parallel = 1 # no speed gain here for parallelising on personal machine +) + +## Environmental Data (CAPFITOGEN) -------------------------------------------- +message(paste("-------------------------------------------------------", + " starting CAPFITOGEN environmental data download/load ", + "-------------------------------------------------------", + sep = "\n")) +# make a template raster to resample to +template_raster <- rast(nrows = 1800, + ncols = 4320, + nlyr = 1) +crs(template_raster) <- "epsg:4326" # WGS84 - World Geodetic System 1984 + +# download the default data from CAPFITOGEN. 
+# development goal: replace with argument to give alternative download links +all_predictors <- FUN.DownCAPFITOGEN( + Dir = Dir.Data.Envir, + Force = FALSE, + resample_to_match = template_raster +) +names(all_predictors) + +## Protected areas database --------------------------------------------------- +message(paste("----------------------------------------------", + " starting protected areas data download/load ", + "----------------------------------------------", + sep = "\n")) +#' download shapefiles for protected areas to overlay with Complementa tool. +#' The FUN.DownWDPA function will save the file to a folder, but not load it +#' into RStudio as an object. +MmmYYYY <- format(Sys.Date(), "%b%Y") +FUN.DownWDPA( + # download from url: + wdpa_url = paste0( + "https://d1gam3xoknrgr2.cloudfront.net/current/WDPA_", + MmmYYYY, "_Public_shp.zip"), + # save the downloaded zipfile as: + wdpa_destination = file.path(Dir.Capfitogen.WDPA, + paste0("WDPA_", + MmmYYYY, + "_Public_shp.zip")), + # do not overwrite existing data + Force = FALSE) + +## Crop extents --------------------------------------------------------------- +# if supplied, crop all the data to a map of native species range. +# Define function +crop_to_native_range <- function( + Dir = getwd(), + Force = TRUE, + native_range_map = NULL) { + + # define file names + FNAME_env <- file.path("Data/Environment/capfitogen_cropped.nc") + FNAME_wdpa <- file.path("Capfitogen/wdpa/wdpa_cropped.gpkg") + + # check if cropped data already exists and whether to overwrite + if (!Force & file.exists(FNAME_env) & file.exists(FNAME_wdpa)) { + message(paste0(FNAME_env, " and ", FNAME_wdpa, + " exist already. They have been loaded from the disk. 
", + "If you wish to override the present data, ", + "please specify Force = TRUE")) + capfitogen_cropped <- rast(FNAME_env) + wdpa_cropped <- read_sf(FNAME_wdpa) + return(list(env = capfitogen_cropped, wdpa = wdpa_cropped)) + } + + # proceed with cropping if native_range_map is valid + if (!is.null(native_range_map)) { + message(paste("----------------------------------------", + " cropping to native range ", + "----------------------------------------", + sep = "\n")) + # attempt to load native range map safely + tryCatch({ + native_range_raster <- rast(native_range_map) + native_range_extent <- ext(native_range_raster) + + if (is.null(native_range_extent) || + any(is.na(c(xmin(native_range_extent), xmax(native_range_extent), + ymin(native_range_extent), ymax(native_range_extent))))) { + stop("native_range_map does not have a valid extent.") + } + + # crop and save the environmental data + message("cropping environmental data") + all_predictors_cropped <- terra::crop(all_predictors, native_range_extent) + writeCDF(all_predictors_cropped, FNAME_env) + + # crop and save protected areas (wdpa) + message("cropping protected areas") + wdpa <- read_sf(file.path(Dir, "wdpa", "global_wdpa_polygons.gpkg")) + wdpa_cropped <- terra::crop(wdpa, native_range_extent) + st_write(wdpa_cropped, FNAME_wdpa) + + return(list(env = all_predictors_cropped, wdpa = wdpa_cropped)) + + }, error = function(e) { + stop("Error reading or processing native_range_map: ", e$message) + }) + + } else { + stop("native_range_map must be provided when Force is TRUE or data does not exist.") + } +} + +# load native species range map +range_map <- NULL + +# apply cropping function +if(is.null(range_map)) { + message("global extent") +} else { + message(paste("cropping to native range map: ", names(range_map))) + crop_to_native_range( + Dir = Dir.Base, + Force = FALSE, + native_range_map = range_map) +} + +# CAPFITOGEN pipeline ========================================================= 
+message(paste("------------------------------", + " starting Capfitogen pipeline ", + "------------------------------", + sep = "\n")) + +### Format GBIF data ---- +# need a data frame named 'puntos' = points with occurrence points +puntos <- data.frame(POINTID = 1:length(Species_ls[["occs"]][["DECLATITUDE"]]), + POINT_X = Species_ls[["occs"]][["DECLONGITUDE"]], + POINT_Y = Species_ls[["occs"]][["DECLATITUDE"]]) + +### create 'pasaporte' ---- +message("create Pasaporte file") +#' Capfitogen uses a file named "pasaporte" with species/taxa occurrences. +#' The file uses Darwincore names, so it should be OK to use the occurrences +#' from the GBIF download directly. +#' Place it in the right folder so Capfitogen can find it: +pasaporte_file_name = paste0(sub(pattern = " ", + replacement = "_", + SPECIES), + ".txt") + +write.table(Species_ls[["occs"]], + file.path("Capfitogen/Pasaporte", + pasaporte_file_name), + sep = "\t",) + +## Variable selection --------------------------------------------------------- +message("running variable selection") + +# make a new function to get the most extreme monthly means? 
+ +# predefine list of variables to keep +predefined_predictors_to_keep <- NULL + +# run variable selection based on variable inflation factor usdm::vif +predictor_vifs <- + vifcor( + all_predictors, + th = 0.8, # threshold of correlation + keep = predefined_predictors_to_keep, # if wanted, vector of variables to keep no matter what + size = 5000, # subset size in case of big data (default 5000) + method = "pearson" # 'pearson','kendall','spearman' + ) + +# check which variables are kept +variables_to_keep <- + names(all_predictors)[names(all_predictors) %nin% predictor_vifs@excluded] +message("variables kept after excluding the most correlated ones:") +print(variables_to_keep) + +# subset variables to exclude highly correlated ones +predictors <- all_predictors[[(variables_to_keep)]] +predictors <- raster::stack(predictors) + +# save variables in CAPFITOGEN folder +if (!dir.exists(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"))) { + dir.create(file.path(Dir.Capfitogen, "rdatapoints/world/9x9"), + recursive = TRUE) + dir.create(file.path(Dir.Capfitogen, "rdatamaps/world/9x9"), + recursive = TRUE) +} + +saveRDS(predictors, + "Capfitogen/rdatapoints/world/9x9/base9x9.RData") +# save(predictors, +# file = "Capfitogen/rdatapoints/world/9x9/base9x9.RData") + +for (i in 1:dim(predictors)[3]) { + file_name_path = file.path("Capfitogen/rdatamaps/world/9x9", + paste0(names(predictors)[i],".tif")) + writeRaster(predictors[[i]], + file_name_path, + overwrite = TRUE) +} + +## Modify Capfitogen possible values ------------------------------------------ + +# add a line with our data resolution to list of possible values +load(file.path(Dir.Capfitogen, "resol.RData")) +if (nrow(resol[resol$resol == "9x9",]) < 1) { + line = paste("\"celdas 9x9 km aprox (4.5 arc-min)\"", "\"9x9\"", 0.075, + sep = "\t") + write(line, + file = file.path(Dir.Capfitogen, "resol.txt"), + append = TRUE) + + resol <- rbind( + resol, + data.frame( + resolucion = "celdas 9x9 km aprox (4.5 arc-min)", + 
resol = "9x9", + resoldec = 0.075)) + + save(resol, file = file.path(Dir.Capfitogen, "resol.RData")) +} +rm(resol) + +# create template file for logging script processes +if (!file.exists(file.path(Dir.Capfitogen.Error,"process_info.txt"))) { + file.create(file.path(Dir.Capfitogen.Error,"process_info.txt")) +} + +# add new geophysical variables to list of possible variables +load(file.path(Dir.Capfitogen, "geophys.RData")) +if (nrow(geophys[geophys$VARCODE == "wind_max", ]) < 1) { + geophys <- rbind( + geophys, + data.frame( + VARID = 145, + VARCODE = "wind_max", + VARDESCR_EN = "mean wind speed of windiest month (annual max of monthly means)", + VARDESCR = "mean_wind_speed_of_windiest_month", + VARUNIDAD = "ms-1", + VARFUENTE = "Derivada de Worldclim", + VARMODULO = "Geofisico/Geophysic", + FUENTELINK = "http://worldclim.org" + ) + ) + save(geophys, file = file.path(Dir.Capfitogen, "geophys.RData")) + +# # rename geophysical variable files ---- NB! will break if edaphic vars are added... 
+# number_of_geophys_variables <- length(predictor_names[grep("BIO", +# predictor_names, +# invert = TRUE)]) +# for (i in 1:number_of_geophys_variables) { +# from_name = predictor_names[grep("BIO", +# predictor_names, +# invert = TRUE)][i] +# +# to_name = geophys[geophys$VARDESCR == from_name, "VARCODE"][1] +# file.rename( +# from = file.path( +# "Capfitogen/rdatamaps/world/9x9", +# paste0(from_name, ".tif")), +# to = file.path("Capfitogen/rdatamaps/world/9x9", +# paste0(to_name, ".tif")) +# ) +# } +} + +# find names of bioclimatic, edaphic, and geophysical variables +load(file.path(Dir.Capfitogen, "edaph.RData")) +load(file.path(Dir.Capfitogen, "bioclim.RData")) + +edaphic_variables <- intersect(edaph$VARCODE, names(predictors)) +bioclimatic_variables <- intersect(bioclim$VARCODE, names(predictors)) +geophysical_variables <- intersect(geophys$VARCODE, names(predictors)) + +rm(bioclim, edaph, geophys) + +## Clustering and map creation: ELCmapas --------------------------------------- +message("setting parameters and running ELC map script (ecogeographic land characterization)") +### Parameters for ELC maps ---- +ruta <- Dir.Capfitogen # path to capfitogen scripts +resultados <- Dir.Capfitogen.ELCMap # directory to place results +pasaporte <- pasaporte_file_name # species occurrence data + +pais <- "world" # global extent - big modifications will be necessary to use different resolution +geoqual <- FALSE +totalqual <- 30 # Only applies if GEOQUAL=TRUE, must be a value between 0 and 100 +duplicat <- TRUE # duplicat=TRUE indicates that records of the same GENUS/SPECIES/SUBTAXA will be deleted +distdup <- 1 # distance threshold in km to remove duplicates from same population +resol1 <- "celdas 9x9 km aprox (4.5 arc-min)" # resolution +latitud <- FALSE #Only applies if ecogeo=TRUE; whether to use latitude variable (Y) as a geophysical variable from 'pasaporte' +longitud <- FALSE + +bioclimv <- bioclimatic_variables # +edaphv <- edaphic_variables 
#names(edaphic_variables) # edaphic variables (defaults from SOILGRIDS) +geophysv <- geophysical_variables # geophysical variables + +maxg <- 20 # maximum number of clusters per component +metodo <- "kmeansbic" # clustering algorithm type. Options: medoides, elbow, calinski, ssi, bic +iterat <- 10 # if metodo="Calinski" or "ssi", the number of iterations to calculate the optimal number of clusters. + +# run the script +message("Clustering and creating maps") +source(file.path(Dir.Capfitogen, + "scripts/Tools Herramientas/ELCmapas.R")) +setwd(Dir.Base) + +## Overlaying conservation maps "Complementa" --------------------------------- +#' create template file for logging script processes +if (!file.exists(file.path(Dir.Results.Complementa.Error,"process_info.txt"))) { + file.create(file.path(Dir.Results.Complementa.Error,"process_info.txt")) +} + +### parameters for Complementa ---- +{ +resultados <- Dir.Results.Complementa +pasaporte <- pasaporte_file_name + +gaptype <- FALSE # Note: Representa tool a prerequisite of gaptype=TRUE +gaptresh <- 4 #Only applies if gaptype=TRUE +gapna <- "exclude" #Only applies if gaptype=TRUE + +celdas <- TRUE # Note: If celdas=TRUE, a complementarity analysis will be run by cells (grid) +resol1 <- "celdas 10x10 km aprox (5 arc-min)" #Only applies if celdas=TRUE +nceldas <- 10 #Only applies if celdas=TRUE, number of cells in a ranking (from most to least important in terms of taxa richness accumulation) + +areas <- TRUE # If areas=TRUE, a complementary analysis will be run per protected areas (polygons), which can come from a world database (WDPA) or from a shapefile provided by the user. If areas=TRUE, at least one of the following two options (or both), WDPA or propio, must be TRUE, otherwise it may cause errors. 
+WDPA <- FALSE #Only applies if areas=TRUE +propio <- TRUE # =own, alternative user defined file instead of WDPA +nombre <- "global_wdpa_polygons.shp" #Only applies if propio=TRUE, name of alternative shapefile +campo <- "objectid" #Only applies if propio=TRUE, in campo you must specify the column of the shapefile table that contains the identifier code (ID) of each object (polygon) in the map of protected areas that the user provides through the shapefile. The name of the column must be inserted as it appears in the shapefile table, otherwise errors are generated +nareas <- 5 # the number of protected areas where the points from the passport table coordinates fall, areas organized in a ranking (from most to least important in terms of accumulation of taxa richness) that will be analyzed in detail. It can generate a problem or error if nareas is a very large number and the passport table has few records, or few different species, or all the points are highly concentrated spatially. +coveran <- TRUE # if coveran=TRUE a coverage analysis will be generated for the network of protected areas and a folder called CoverageAnalysis should appear in the results within the resultados para areas folder + +niveltax <- "species"# At which taxonomic level the complementarity analysis is going to run (3 options: "genus", "species" or "subtaxa"). Take into account the following: If "genus" is selected, , in the GENUS column of the passport table there must be at least two different genera, or the same for "species" (SPECIES column) or "subtaxa" (SUBTAXA column)... if there are only NA values or there is only one value in the target column, it can generate errors. +datanatax <- FALSE # whether the NA values in genus, species or subtaxa will be taken into account as a different value. Any TRUE or FALSE option does not usually generate problems or errors. 
+ +mapaelcf <- TRUE # Note: Will an ELC map from a previous execution of the ELCmapas tool be used as an additional factor for classifying the taxonomic ranks for the complementarity analysis? +mapaelc <- "mapa_elc_world.grd" #Only applies if mapaelcf=TRUE, mapaelc must contain the name of the ELC map obtained by previously using the ELCmapas tool (.grd and .gri files that must always be in the CAPFITOGEN3/ELCmapas folder) +datanaelc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in NA zones on the ELC map will be taken into account or not (FALSE) +data0elc <- FALSE # Only applies if mapaelcf=TRUE, indicates whether (TRUE) the records that fall in category 0 on the ELC map will be taken into account or not (FALSE) +} + +# run the script +message("running Capfitogen Complementa tool for conservation areas") +setwd(Dir.Base) +source(file.path(Dir.Capfitogen, + "/scripts/Tools Herramientas/Complementa.R")) + +setwd(Dir.Base) + +message(" - - - end of capfitogen script - - - ") + +# end diff --git a/capfitogen_master_illustration.drawio.svg b/capfitogen_master_illustration.drawio.svg new file mode 100644 index 0000000..7f3a324 --- /dev/null +++ b/capfitogen_master_illustration.drawio.svg @@ -0,0 +1,4 @@ + + + +
GBIF
GitHub -- uc-CRW repository
/R_Scripts
/Data/Environment
/Data/GBIF
/R_Scripts

ModGP-commonlines.R


Packages,
directories,
simple functions
SHARED-Data.R
Define functions to download data:
GBIF (FUN.DownGBIF),
CAPFITOGEN's google drive (FUN.DownCAPFITOGEN)
World Database on Protected Areas (FUN.DownWDPA)
CAPFITOGEN
(WorldClim, SoilGrids, HWSD)
World Database on
Protected Areas
CAPFITOGEN submodule
Set species name
run shared scripts
FUN.DownGBIF()
FUN.DownCAPFITOGEN()
FUN.DownWDPA()
run Capfitogen tools
set parameters for capfitogen
visualising output
Variable selection with usdm::vifcor()
capfitogen_master.R
evalieungh/Capfitogen
/scripts/Tools Herramientas/
ELCmapas.R
CAPFITOGEN tool for
ecogeographic land characterization (ELC) maps
which reflect adaptive scenarios for
a specific species
/scripts/Tools Herramientas/
Complementa.R
CAPFITOGEN tool for
complementarity analysis between cells or 
protected areas to see the degree of coverage of
current protected area networks in terms of in situ conservation of crop wild relatives
\ No newline at end of file diff --git a/container/Dockerfile b/container/Dockerfile index f222998..f3e716e 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -56,13 +56,6 @@ RUN . /conda/etc/profile.d/conda.sh && \ conda activate /conda/env && \ conda install -c conda-forge --override-channels \ r-rcpparmadillo=14.2.2-1 \ - r-iterators=1.0.14 \ - r-sp=2.1-3 \ - r-raster=3.6-26 \ - r-rgbif=3.8.0 \ - r-rnaturalearth=1.0.1 \ - r-rnaturalearthdata=1.0.0 \ - r-ncdf4=1.22 \ r-epi=2.47.1 \ r-png=0.1-8 \ r-keyring=1.3.2 \ @@ -83,16 +76,26 @@ RUN . /conda/etc/profile.d/conda.sh && \ r-doParallel=1.0.17 \ r-foreach=1.5.2 \ r-doSNOW=1.0.20 \ - r-automap=1.1-12 \ r-fasterize=1.1.0 \ r-stars=0.6-6 \ + r-automap=1.1-12 \ + r-cowplot=1.1.3 \ r-ggplot2=3.5.1 \ + r-ggpmisc=0.6.1 \ r-ggpubr=0.6.0 \ + r-gridExtra=2.3 \ + r-ncdf4=1.22 \ + r-maps=3.4.2.1 \ + r-raster=3.6-26 \ + r-rgbif=3.8.0 \ + r-rnaturalearth=1.0.1 \ + r-rnaturalearthdata=1.0.0 \ + r-sp=2.1-3 \ r-tidyr=1.3.1 \ r-viridis=0.6.5 \ - r-cowplot=1.1.3 \ - r-ggpmisc=0.6.1 \ - r-gridExtra=2.3 \ + r-iterators=1.0.14 \ + r-rvest=1.0.4 \ + r-gdalutilities=1.2.5 \ # Packages for capfitogen r-maptools=1.1-8 \ r-dismo=1.3-16 \ @@ -140,7 +143,6 @@ RUN . /conda/etc/profile.d/conda.sh && \ "blockCV", \ "ecmwfr", \ "giscoR", \ - "tidyterra", \ "geodata", \ "mmap" \ ))' && \ diff --git a/container/Makefile b/container/Makefile index 960d0c1..b303bcd 100644 --- a/container/Makefile +++ b/container/Makefile @@ -1,13 +1,13 @@ IMAGE_ROOT?=ghcr.io/biodt IMAGE=cwr -IMAGE_VERSION=0.5.3 -R_VERSION=4.3.2 +IMAGE_VERSION=0.6.0 +R_VERSION=4.3.3 build: Dockerfile docker buildx build --platform linux/amd64 \ --label "org.opencontainers.image.source=https://github.com/BioDT/uc-CWR" \ - --label "org.opencontainers.image.description=CWR environment with R $(R_VERSION)" \ + --label "org.opencontainers.image.description=CWR environment" \ --build-arg R_VERSION=$(R_VERSION) \ -t $(IMAGE_ROOT)/$(IMAGE):$(IMAGE_VERSION) \ . 
diff --git a/container/README.md b/container/README.md new file mode 100644 index 0000000..7f4b6cf --- /dev/null +++ b/container/README.md @@ -0,0 +1,73 @@ +# Building container + +This directory has two main files: +- `Dockerfile`: This is the recipe for building a container image +- `Makefile`: This is a helper file for simplifying building the image (so that one can say `make build` instead of `docker buildx build --platform ... --build-arg ...`) + + +## Prerequisites + +Docker/Podman is needed for building the image on a local computer. Follow these steps on Ubuntu: + +1. Install docker or podman: + + sudo apt install podman-docker + +2. Add the following environment variable to `~/.bashrc` or redefine it always before running build commands: + + export BUILDAH_FORMAT=docker + + +## Workflow for building, publishing, and pulling images + +1. Updating and building a new image on a local computer + + 1. Update `IMAGE_VERSION` variable in `Makefile`. + If this is not done, an existing image with the same version gets overwritten, which is problematic for reproducibility. + Using [semantic versioning](https://semver.org/) is recommended. + + 2. Update `Dockerfile` and/or `Makefile` as needed. + + 3. Build a new image: + + make build + +2. Pushing image to ghcr.io + + 1. Login to GitHub container registry. + Use GitHub username and Personal Access Token with scope 'write:packages' as username and password, respectively. + See [these instructions for creating a token](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token#creating-a-personal-access-token-classic). + + docker login ghcr.io + + 2. Push the image to ghcr.io: + + make push + +3. Pulling image to LUMI and converting it to singularity + + 1. 
Execute the following on LUMI (if image is private, add `--docker-login` and login with a token with scope 'read:packages'): + + singularity pull --disable-cache docker://ghcr.io/biodt/cwr:IMAGE_VERSION + + This creates a file `cwr_IMAGE_VERSION.sif`. + + +## Development workflow without publishing images + +1. Updating and building a new image on a local computer + + - Follow the same steps as above + +2. Converting the image to singularity on a local computer + + 1. Convert the image to singularity (results in a .sif file): + + make singularity + +3. Transferring the image to LUMI + + 1. Use scp: + + scp file.sif user@lumi.csc.fi:/scratch/project_xxx/user/ + diff --git a/submit_capfitogen_exec_lumi.sh b/submit_capfitogen_exec_lumi.sh new file mode 100644 index 0000000..10ddad6 --- /dev/null +++ b/submit_capfitogen_exec_lumi.sh @@ -0,0 +1,15 @@ +#!/bin/bash -l +#SBATCH -J capfitogen +#SBATCH -o capfitogen-%j.out +#SBATCH --account=project_465001987 +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=128 +#SBATCH --time=10:00:00 +#SBATCH --partition=largemem +##SBATCH --partition=lumid +##SBATCH --partition=small --mem-per-cpu=2G +##SBATCH --partition=standard --exclusive --mem=0 +##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 + +singularity run --bind $PWD cwr_0.6.0.sif "capfitogen_master.R" \ No newline at end of file diff --git a/submit_capfitogen_prep_lumi.sh b/submit_capfitogen_prep_lumi.sh new file mode 100644 index 0000000..341b20e --- /dev/null +++ b/submit_capfitogen_prep_lumi.sh @@ -0,0 +1,25 @@ +#!/bin/bash -l +#SBATCH -J capfitogen +#SBATCH -o capfitogen-%j.out +#SBATCH --account=project_465000915 +#SBATCH --nodes=1 +#SBATCH --tasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --time=48:00:00 +#SBATCH --partition=small --mem=64G +##SBATCH --partition=standard --exclusive --mem=0 +##SBATCH --partition=debug --exclusive --mem=0 --time=0:30:00 + +SPECIES="${1:-Lathyrus}" + +# Workaround for memory issues in terra +# Max 
memory for terra +R_TERRA_MAX_RAM_MB=$((100 * 1024)) +# Limit max memory further if available memory is less than above +if [[ ${SLURM_MEM_PER_NODE:-0} > 0 ]]; then + R_TERRA_MAX_RAM_MB=$(( $R_TERRA_MAX_RAM_MB < $SLURM_MEM_PER_NODE ? $R_TERRA_MAX_RAM_MB : $SLURM_MEM_PER_NODE )) +fi +export R_TERRA_MAX_RAM_MB +# End of workaround + +git submodule update --init diff --git a/technical_documentation.md b/technical_documentation.md new file mode 100644 index 0000000..56ddab9 --- /dev/null +++ b/technical_documentation.md @@ -0,0 +1,47 @@ +# Technical documentation use-case Crop Wild Relatives + +> NB! This document is under construction. + +Scientific usefulness, written in R to be run on an HPC for a crop species or genus. + +## ModGP + +Main characteristics, script and workflow structure + +|File | Description | +| --- | ----------- | + +### Inputs + +### Outputs + +---------------------- + +## Capfitogen + +Main characteristics, script and workflow structure. + +> Disclaimer: CAPFITOGEN is separate software that was created by others. It is not owned by BioDT, but this prototype digital twin uses a subset of available scripts and functionality from CAPFITOGEN. See Parra-Quijano et al. 2021, and [CAPFITOGEN.net](https://www.capfitogen.net/en/). + +Some files are shared between ModGP and capfitogen scripts (R_scripts/SHARED-Data.R and ModGP-commonlines.R). These files are specific only to capfitogen code: + +|File | Description | +| --- | ----------- | +| Capfitogen | the Capfitogen code repository. A submodule (repository within repository). | +| capfitogen_master.R | Main code for setting up environment, downloading data, and executing capfitogen tools. | +| submit_capfitogen_prep_lumi.sh | bash script to initialise capfitogen submodule and add a workaround for a memory issue. | +| submit_capfitogen_exec_lumi.sh | bash script to execute (run) capfitogen_master.R on LUMI. 
| + +![Figure: Illustration of scripts and data for running Capfitogen tools](capfitogen_master_illustration.drawio.svg) +*Illustration of scripts and data for running the Capfitogen pipeline. The master script connects to data and other scripts through download functions and sourcing. To run on LUMI, the master script is executed through bash scripts.* + +### Inputs + +- Species occurrences, taken from download by ModGP if available +- Environmental variables downloaded from CAPFITOGEN's google drive: 177 bioclimatic, edaphic (soil), and geophysical predictors. These are narrowed down using Variable Inflation Factor. +- protected areas, vector map, by the [The World Database on Protected Areas (WDPA)](https://www.protectedplanet.net/en/thematic-areas/wdpa?tab=WDPA) + +### Outputs + +- ELC maps created with CAPFITOGEN +- Complementa analysis created with CAPFITOGEN