diff --git a/.Rbuildignore b/.Rbuildignore index c89cbfa..afad9ea 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -6,4 +6,7 @@ TODO* ^README.md ^README_files$ ^\.github$ -^vignettes/jss.bst \ No newline at end of file +^vignettes/jss.bst +^vignettes/NSGEVCompDet.Rnw +^vignettes/NSGEVCompDet.R +^vignettes/NSGEVCompDet.pdf \ No newline at end of file diff --git a/DESCRIPTION b/DESCRIPTION index e7a19ec..be4ce46 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -2,20 +2,19 @@ Package: NSGEV Type: Package Title: Non-Stationary GEV Time Series Version: 0.2.0 -Date: 2024-12-02 +Date: 2025-08-20 Authors@R: c(person(given = "Yves", family = "Deville", role = c("cre", "aut"), email = "deville.yves@alpestat.com", comment = c(ORCID = "0000-0002-1233-488X"))) -Maintainer: Yves Deville Description: Utility functions for Non-Stationary time series with GEV margins. License: GPL-3 LazyData: yes Imports: utils, methods, splines, numDeriv, nloptr, data.table, reshape2, tidyr, forecast, scales, grDevices Depends: ismev, extRemes, nieve, ggplot2 -Suggests: parallel, doParallel, foreach, testthat, rmarkdown, knitr, png, deSolve +Suggests: parallel, doParallel, foreach, testthat, rmarkdown, knitr, png, deSolve, htmltools, plotly Encoding: UTF-8 URL: https://github.com/IRSN/NSGEV/ BugReports: https://github.com/IRSN/NSGEV/issues/ diff --git a/NAMESPACE b/NAMESPACE index fefb816..abc4d7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -13,6 +13,7 @@ S3method(autolayer,quantMax.TVGEV) S3method(autoplot,TVGEV) S3method(autoplot,bfts) S3method(autoplot,bts) +S3method(autoplot,influence.TVGEV) S3method(autoplot,predict.TVGEV) S3method(autoplot,quantMax.TVGEV) S3method(autoplot,resid.TVGEV) @@ -24,6 +25,7 @@ S3method(coef,TVGEV) S3method(confint,TVGEV) S3method(density,NSGEV) S3method(density,TVGEV) +S3method(influence,TVGEV) S3method(lines,bts) S3method(logLik,TVGEV) S3method(mean,TVGEV) @@ -34,6 +36,7 @@ S3method(plot,NSGEV) S3method(plot,TVGEV) S3method(plot,bfts) S3method(plot,bts) 
+S3method(plot,influence.TVGEV) S3method(plot,predict.TVGEV) S3method(plot,resid.TVGEV) S3method(predict,NSGEV) @@ -41,6 +44,7 @@ S3method(predict,TVGEV) S3method(print,NSGEV) S3method(print,TVGEV) S3method(print,bts) +S3method(print,influence.TVGEV) S3method(print,profLik.TVGEV) S3method(print,summary.NSGEV) S3method(print,summary.TVGEV) @@ -99,6 +103,7 @@ export(selectDate) export(testNumDeriv) export(transFormula) import("extRemes", except = c("qqplot", "qqnorm")) +import(data.table) import(ggplot2) import(ismev) import(nloptr) @@ -106,13 +111,18 @@ import(numDeriv) import(splines) importFrom(forecast,autolayer) importFrom(forecast,autoplot) +importFrom(grDevices,adjustcolor) importFrom(grDevices,col2rgb) +importFrom(grDevices,palette) importFrom(graphics,abline) +importFrom(graphics,grid) +importFrom(graphics,legend) importFrom(graphics,lines) importFrom(graphics,matlines) importFrom(graphics,matplot) importFrom(graphics,mtext) importFrom(graphics,par) +importFrom(graphics,rug) importFrom(graphics,text) importFrom(methods,is) importFrom(nieve,dGEV) @@ -130,6 +140,7 @@ importFrom(stats,confint) importFrom(stats,density) importFrom(stats,deriv) importFrom(stats,df) +importFrom(stats,influence) importFrom(stats,lm) importFrom(stats,logLik) importFrom(stats,model.frame) diff --git a/NEWS.md b/NEWS.md index 4875924..31bda1f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,6 +2,43 @@ **NSGEV** Package News ====================== +# New in version 0.2.0 (branch `influence`) + +## Enhancements + +- Experimental changes on `TVGEV` to allow the use of Time Series + covariates along with functions of time. CAUTION This may have + created bugs!!! + +- New vignette *Timeseries Covariates with NSGEV*. DRAFT version. + +- A warning is cast when the `design` argument of `TVGEV` is a call + using variables not in `names(data)`. These variables may or may not + be found in the parent environment. 
+ +## Bug fixes + +- The generalised residuals computed with `"gumbel"` were actually + following the *reversed* Gumbel, not the Gumbel. New (experimental) + choice `"gev4"` for the target distribution. + +- When `TVGEV` was used with `optim = "nloptr"`, the choice `parTrack + = TRUE` did not work as expected. + +## Changes + +- The (non-exported) `MLE.TVGEV` function now has a `trace` argument + which is set to the value of `trace` argument of `TVGEV`. When + `estim = "nloptr"` and a value `> 2` is given the values of the + parameter and of the objective are traced. + +- The Hessian of the negalive log-likelihood is now exact and no longer + computed with `optimHessian`. + +- The `.Rnw` vignettes, such as the technical vignette `CompDet.Rnw` + should no longer be built. Only the pdf will be shipped with the + package tarball. + # New in version 0.2.0 ## Enhancements diff --git a/R/TVGEV.R b/R/TVGEV.R index 5b509f1..9210d6c 100644 --- a/R/TVGEV.R +++ b/R/TVGEV.R @@ -1,41 +1,46 @@ - ##***************************************************************************** -##' Compute the GEV parameters for the marginal distributions of a -##' \code{TVGEV} model. +##' @description Compute the GEV parameters for the marginal +##' distributions of a \code{TVGEV} model. ##' ##' @title Compute the Matrix of GEV Parameters from the Vector of -##' Model Parameters 'psi' and the Data +##' Model Parameters 'psi' and the Data ##' ##' @param model The \code{TVGEV} object. ##' ##' @param psi A vector of parameters for \code{model}. By default the -##' vector of estimates in \code{psi} will be used. +##' vector of estimates in \code{psi} will be used. ##' ##' @param date A vector that can be coerced to the \code{"Date"} -##' class. Each element will correspond to a row of the matrix of GEV -##' parameters. +##' class. Each element will correspond to a row of the matrix of +##' GEV parameters. 
If \code{model} contains TS covariables, this +##' must instead be a data frame containing the suitable +##' covariables, along with a column with name \code{model$name} +##' and class \code{Date}. Id \code{date} is \code{NULL}, the date +##' and possibly the TS covariables are extracted from +##' \code{model} and correspond to the information used in the +##' fit. ##' ##' @param deriv Logical. If \code{TRUE} the returned object has an -##' attribute names \code{"gradient"} which is the gradient -##' array. +##' attribute names \code{"gradient"} which is the gradient array. ##' ##' @param checkNames Logical. If \code{TRUE} it will be checked that -##' the names of \code{psi} match the \code{parNames} element of -##' \code{model}. +##' the names of \code{psi} match the \code{parNames} element of +##' \code{model}. ##' ##' @param \dots Not used yet. ##' -##' @return A matrix with \code{length(date)} rows and \code{3} colums. -##' The columns contain the location, the scale and the shape GEV parameters -##' in that order. +##' @return A matrix with \code{length(date)} rows and \code{3} +##' colums. The columns contain the location, the scale and the +##' shape GEV parameters in that order. ##' -##' @seealso The \code{\link{GEV}} for the GEV probability functions. +##' @seealso The \code{\link[nieve]{GEV}} for the GEV probability functions. ##' ##' @note The \code{deriv} formal argument is mainly here to maintain -##' a compatibility with the \code{NSGEV} method. However, the -##' derivatives w.r.t. the parameter vector \code{psi} are obtained -##' simply by using the \code{X} elements of the object \code{model}. +##' a compatibility with the \code{NSGEV} method. However, the +##' derivatives w.r.t. the parameter vector \code{psi} are +##' obtained simply by using the \code{X} elements of the object +##' \code{model}. 
##' ##' @method psi2theta TVGEV ##' @@ -58,23 +63,33 @@ psi2theta.TVGEV <- function(model, psi = NULL, date = NULL, stop("'psi' must have named elements with ", "the names 'parNames(model)'") } - } else { - names(psi) <- model$parNames - } - - ## is date is NULL, we use the design matrices in the model + } else names(psi) <- model$parNames + + ## ======================================================================== + ## is 'date' is 'NULL', we use the design matrices in the model + ## ======================================================================== + if (is.null(date)) { X <- model$X n <- model$n ind <- model$ind fDate <- model$fDate } else { - n <- length(date) + date <- as.data.frame(date) + if (ncol(date) == 1) { + names(date) <- model$date + } else if (is.null(date[[model$date]])) { + stop("one must pass a data frame ", + "with a '", model$date, "'column to the 'modelMatrices'", + "function.") + } + n <- nrow(date) + fDate <- format(date[[model$date]]) + if (!all(model$isCst)) { L <- modelMatrices.TVGEV(model, date = date) X <- L$X } else X <- NULL - fDate <- format(date) } theta <- array(NA, dim = c(n, 3L), @@ -185,7 +200,7 @@ parIni.TVGEV <- function(object, y = NULL) { ##' psi0 = NULL, estim = c("optim", "nloptr"), ##' coefLower = if (estim != "nloptr") numeric(0) else c("xi_0" = -0.9), ##' coefUpper = if (estim != "nloptr") numeric(0) else c("xi_0" = 2.0), -##' parTrack = FALSE) +##' parTrack = FALSE, trace = 0) ##' ##' @param object A (possibly incomplete) \code{TVGEV} object. ##' @@ -196,14 +211,24 @@ parIni.TVGEV <- function(object, y = NULL) { ##' @param estim Character giving the optimisation to be used. ##' ##' @param coefLower,coefUpper Numeric vectors or \code{NULL} giving -##' bounds on the parameters. These are used overwrite the "normal" -##' values which are \code{-Inf} and \code{Inf}. These arguments are -##' only used when \code{estim} is \code{"nloptr"}. +##' bounds on the parameters. 
These are used overwrite the +##' "normal" values which are \code{-Inf} and \code{Inf}. These +##' arguments are only used when \code{estim} is \code{"nloptr"}. +##' These vectors must have \emph{suitably named} elements. It +##' is not necessary to give values for all the elements: the +##' "normal" infinite values will continue to hold for the +##' parameters for which the bounds are not given. When the scale +##' parameter is constant it may help to give a positive value for +##' the element \code{"sigma_0"} of \code{coefLower}. When the +##' shape is constant it may help to give bounds such as such as +##' \code{-0.5} and \code{0.5} for the element \code{"xi_0"}. ##' ##' @param parTrack \code{Logical}. If \code{TRUE}, all parameters at ##' which an evaluation of the log-Likelihood will be stored and ##' returned. ##' +##' @param trace Integer level of verbosity. +##' ##' @return A list with elements that can be copied into those ##' of a \code{TVGEV} object. ##' @@ -230,7 +255,8 @@ MLE.TVGEV <- function(object, estim = c("optim", "nloptr"), coefLower = if (estim != "nloptr") numeric(0) else c("xi_0" = -0.9), coefUpper = if (estim != "nloptr") numeric(0) else c("xi_0" = 2.0), - parTrack = FALSE) { + parTrack = FALSE, + trace = 0) { parNames.GEV <- c("loc", "scale", "shape") @@ -367,7 +393,7 @@ MLE.TVGEV <- function(object, if (parTrack) { negLogLikFun1 <- function(psi, deriv = FALSE, hessian = FALSE, object) { - trackEnv$psi <- c(trackEnv$psi, psi) + trackEnv$psi <- c(trackEnv$psi, psi) negLogLikFun(psi = psi, deriv = deriv, hessian = hessian, object = object) } } else { @@ -377,12 +403,16 @@ MLE.TVGEV <- function(object, cvg <- TRUE if (estim == "optim") { - + if (trace) { + cat("o Mininimisation of the negative log-likelihood\n", + " using `stats::optim`.\n") + } res$fit <- try(optim(par = psi0, fn = negLogLikFun1, deriv = FALSE, hessian = FALSE, - control = list(maxit = 1000), + control = list(maxit = 1000, + trace = pmax(trace - 1, 0)), ## hessian = TRUE, 
object = object)) @@ -394,18 +424,25 @@ MLE.TVGEV <- function(object, res$negLogLik <- res$fit$value } else { cvg <- FALSE + cat(" The optimisation failed to converge.", + " Hints: give `psi0`, try using `estim = \"nloptr\",", + " give lower and upper bounds.\n") } } } else if (estim == "nloptr") { - + if (trace) { + cat("o Minimisation of the negative log-likelihood", + " using `nloptr::nloptr`.\n") + } opts <- list("algorithm" = "NLOPT_LD_LBFGS", "xtol_rel" = 1.0e-8, "xtol_abs" = 1.0e-8, "ftol_abs" = 1e-5, - "maxeval" = 1000, "print_level" = 0, + "maxeval" = 1000, + "print_level" = pmax(trace - 1, 0), "check_derivatives" = FALSE) - + ## XXX caution! this works when the shape is constant only!!! p <- object$p @@ -440,7 +477,7 @@ MLE.TVGEV <- function(object, } res$fit <- try(nloptr(x0 = psi0, - eval_f = negLogLikFun, + eval_f = negLogLikFun1, lb = lb, ub = ub, deriv = TRUE, @@ -461,6 +498,11 @@ MLE.TVGEV <- function(object, res$negLogLik <- res$fit$objective } else { cvg <- FALSE + if (trace) { + cat(" The optimisation failed to converge.", + " Hints: try using `estim = \"optim\",", + " give `psi0`, give lower and upper bounds.\n") + } } } @@ -481,21 +523,17 @@ MLE.TVGEV <- function(object, res$theta <- psi2theta(object, psi = psiHat, checkNames = FALSE) ## compute Hessian. 
- ## res$hessian <- hessian(func = negLogLikFun, - ## x = res$estimate, deriv = FALSE, - ## object = object) - - res$hessian <- optimHess(par = res$estimate, - fn = negLogLikFun, - deriv = FALSE, - object = object) + res$hessian <- negLogLikFun(psi = res$estimate, + deriv = TRUE, hessian = TRUE, + object = object)$hessian vcov <- try(solve(res$hessian), silent = TRUE) - if (!inherits(vcov, "try-error")) { - rownames(vcov) <- colnames(vcov) <- object$parNames - res$vcov <- vcov - res$sd <- sqrt(diag(vcov)) + rownames(vcov) <- colnames(vcov) <- object$parNames + res$vcov <- vcov + res$sd <- sqrt(diag(vcov)) + } else { + warning("The (exact) Hessian is not numerically invertible") } } @@ -733,16 +771,27 @@ bs.TVGEV <- function(object, ##' modelMatrices.TVGEV <- function(object, date = NULL, ...) { - if (is.null(date)) { return(list(dfAll = object$dfAll, X = object$X)) } trm <- object$terms - date <- as.Date(date) - data <- data.frame(date) - colnames(data) <- object$date - parNames.GEV <- c("loc", "scale", "shape") + if (length(object$TSVars) == 0) { + ## One column data frame + data <- data.frame(date) + colnames(data) <- object$date + data[[object$date]] <- as.Date(data[[object$date]]) + } else { + data <- as.data.frame(date) + date <- data[[object$date]] + if (is.null(data[[object$date]])) { + stop("'model' uses TSVars hence one must pass a data frame ", + "with a '", object$date, "'column to the 'modelMatrices'", + "function.") + } + } + parNames.GEV <- c("loc", "scale", "shape") + ## Caution: here, 'dfAll' does not embed the response if (!is.null(object$design)) { if (is.call(object$design)) { @@ -750,10 +799,13 @@ modelMatrices.TVGEV <- function(object, date = NULL, ...) 
{ } else { dfAll <- object$design } - } else dfAll <- object$data + dfAll[[object$date]] <- NULL + dfAll <- data.frame(data, dfAll) + } else dfAll <- data fDate <- format(date) - rownames(dfAll) <- fDate + ## XXX + ## rownames(dfAll) <- fDate X <- list() @@ -763,7 +815,10 @@ modelMatrices.TVGEV <- function(object, date = NULL, ...) { mf <- model.frame(trm[[nm]], dfAll, na.action = na.pass) X[[nm]] <- model.matrix(trm[[nm]], mf) ## X[[nm]] <- model.matrix(object$terms[[nm]], data = dfAll) - rownames(X[[nm]]) <- fDate + ## XXX + if (length(object$TSVars == 0)) { + rownames(X[[nm]]) <- fDate + } } list(dfAll = dfAll, X = X) @@ -984,20 +1039,81 @@ TVGEV <- function(data, ## ======================================================================= ## Build a data frame 'dfAll' containing ALL the required variables. + ## + ## - 'desVars' contains the "design variables" that are functions of the + ## date coming from the design e.g. 't1' + ## + ## - 'TSVars' constains the variables that are not design + ## variables, that are not the Date or the Response and appear + ## in `allVars`. 
+ ## ## ======================================================================= tv[["design"]] <- mc[["design"]] + allVars <- union(union(all.vars(loc), all.vars(scale)), all.vars(shape)) + desVars <- setdiff(allVars, names(data)) + if (!is.null(tv$design)) { + if (is.call(tv$design)) { dfAll <- as.data.frame(eval(tv$design, envir = data)) - dfAll <- data.frame(data[ , response], dfAll) - colnames(dfAll) <- c(response, colnames(dfAll)[-1]) + ## dfAll <- data.frame(data[ , response], dfAll) + ## colnames(dfAll) <- c(response, colnames(dfAll)[-1]) + desVarsRaw <- all.vars(tv$design) + if (length(desVarsRaw)) { + ind <- is.na(match(desVarsRaw, names(data))) + if (any(ind)) { + warning("'design' uses variables not in `data`: ", + paste(sprintf("`%s`", desVarsRaw[ind]), + collapse = ", "), + ".\n This may be a source of problems in scoping, ", + "because these variables are\n at the best found ", + "in an environment differing from `data`.") + } + } } else { dfAll <- tv$design } - } else dfAll <- tv$data + notFound <- setdiff(desVars, names(dfAll)) + if (length(notFound)) { + notFound <- paste(paste0("'", notFound, "'"), collapse = ", ") + stop("Required variables not in 'data' nor in the design: ", + notFound) + } + ## avoid the duplication of the date column. 
+ dfAll[[date]] <- NULL + dup <- intersect(names(dfAll), names(data)) + if (any(dup)) { + dup <- paste(paste0("'", dup, "'"), collapse = ", ") + stop("Variable names both in design and data: ", + dup) + } + dfAll <- data.frame(data, dfAll) + } else { + dfAll <- tv$data + notFound <- setdiff(allVars, names(dfAll)) + if (length(notFound)) { + notFound <- paste(paste0("'", notFound, "'"), collapse = ", ") + stop("Required variables not in 'data' nor in the design: ", + notFound) + } + } + tv$TSVars <- setdiff(allVars, union(union(date, response), desVars)) + if (trace > 1) { + cat("o Analysis of formulas\n") + cat(" - All Variables: ", + paste(paste0("'", allVars, "'"), collapse = ", "), + "\n") + cat(" - Design Variables: ", + paste(paste0("'", desVars, "'"), collapse = ", "), + "\n") + cat(" - TS Covariates: ", + paste(paste0("'", tv$TSVars, "'"), collapse = ", "), + "\n") + } + tv$fDate <- format(data[ , date]) rownames(dfAll) <- tv$fDate @@ -1074,7 +1190,8 @@ TVGEV <- function(data, estim = estim, coefLower = coefLower, coefUpper = coefUpper, - parTrack = parTrack) + parTrack = parTrack, + trace = trace) ## copy for (nm1 in names(res)) { diff --git a/R/influenceTVGEV.R b/R/influenceTVGEV.R new file mode 100644 index 0000000..44e527d --- /dev/null +++ b/R/influenceTVGEV.R @@ -0,0 +1,279 @@ +##' This function provides some indications on the influence of a +##' specific observation used in the fit of a \code{TVGEV} object. For +##' several candidate values of the chosen observation, a \code{TVGEV} +##' is re-fitted with the observed value replaced by the candidate +##' value, and the quantity of interest as defined by \code{what} is +##' computed. +##' +##' By suitably defining the function \code{what}, one can trace e.g., +##' the estimate of a quantile with given probability, the +##' corresponding upper confidence limit, the upper end-point of the +##' distribution, ... and more. 
+##' +##' @title Diagnostic of Influence for \code{TVGEV} Objects +##' +##' @param model A \code{TVGEV} object. +##' +##' @param what A function with one argument \code{object} defining or +##' extracting the (numeric) quantity for which the influence is +##' to be analysed. So, \code{object} must be a \code{TVGEV} +##' object. See \bold{Examples}. This function can return numeric +##' vector in which case several influence curves will be +##' computed. It is then wise to make the function return a vector +##' with named elements. +##' +##' @param which Identifies the observation which influence will be +##' analysed. Can be: an integer giving the observation, the +##' character \code{"min"}, the character \code{"max"}, a +##' character that can be coerced to a date, or an object with +##' class \code{"Date"} that identifies the observation. This +##' argument must define an observation existing in the data frame +##' used to create the object. So it can not be used e.g. to +##' define a future date. +##' +##' @param how Character specifying the type of (finite-sample) +##' influence curve to use. For now only the only possibility is +##' \code{"replace"}, see Section 2.1 of the book by Hampel et al +##' (1996). The chosen observation is replaced by a value \eqn{x} +##' and the quantity of interest is computed for each value of +##' \eqn{x}. +##' +##' @param ... Not used. +##' +##' @return A named list of vectors. The element \code{obsValue} +##' contains the candidate values for the chosen observation and +##' \code{statValue} contains the corresponding values of the +##' specific statistic defined with \code{what}. +##' +##' @note For \code{TVGEV} models with a positive GEV shape, the +##' smallest observation may have a strong influence on the +##' result, even if the model is stationary. +##' +##' @importFrom stats influence +##' @export +##' @method influence TVGEV +##' +##' @references +##' +##' Hampel, F.R., Ronchetti, E.M., Rousseeuw, P.J. 
and Stahel +##' W.A. (1996) \emph{Robust Statistics: The Approach Based on +##' Influence Functions}. Wiley. +##' +##' @examples +##' example(TVGEV) +##' influ <- influence(res2, which = "min") +##' autoplot(influ) +##' influ <- influence(res2, which = as.Date("2015-01-01")) +##' autoplot(influ) +##' RL_2050 <- function(model) { +##' c("RL_2050(100)" = quantile(model, prob = 0.99, date = "2050-01-01")[1]) +##' } +##' influence(res2, what = RL_2050) |> autoplot() +##' influence(res2, what = RL_2050, which = "2003-01-01") |> autoplot() +##' RLs_2050 <- function(model) { +##' res <- c(quantile(model, prob = 0.99, date = "2050-01-01")[1], +##' quantile(model, prob = 0.999, date = "2050-01-01")[1]) +##' names(res) <- paste0("RL", c(100, 1000)) +##' res +##' } +##' influ <- influence(res2, what = RLs_2050, which = "2003-01-01") +##' autoplot(influ) +influence.TVGEV <- function(model, + what = function(model) coef(model)["xi_0"], + which = "min", + how = "replace", + ...) { + + y <- model$data[ , model$response] + Date <- model$data[ , model$date] + n <- model$n + ind <- (1:n)[!is.na(y)] + if (is.numeric(which)) { + if (abs(which - round(which)) > 0.01 || which < 1 || which > n) { + stop("When 'which' is numeric, it must be an integer between ", + "1 and `nobs(model)`") + } + } else if (is.character(which)) { + if (which == "min") { + ii <- which.min(y[ind]) + which <- ind[ii] + } else if (which == "max") { + ii <- which.max(y[ind]) + which <- ind[ii] + } else { + which <- as.Date(which) + which <- (1:n)[Date == which] + } + } else if (inherits(which, "Date")) { + which <- (1:n)[Date == which] + } + + rr <- range(y, na.rm = TRUE) + rr <- rr + c(-0.1, 0.1) * diff(rr) + + yGrid <- seq(from = rr[1], to = rr[2], length = 30) + yGrid <- c(yGrid, y[which]) + res0 <- do.call(what, list(model = model)) + nres <- length(res0) + nms <- names(res0) + res <- matrix(NA_real_, nrow = length(yGrid), ncol = nres) + names(res) <- nms + + for (iy in seq_along(yGrid)) { + .df <- model$data 
+ .df[which, model$response] <- yGrid[iy] + calli <- model$call + calli[["data"]] <- .df + tr <- try(fiti <- eval(calli)) + resi <- do.call(what, list(model = fiti)) + res[iy, ] <- resi + } + + if (nres == 1) res <- as.vector(res) + + res <- list(obsValue = yGrid, + statValue = res, + which = which, + what = what, + date = Date, + y = y, + names = nms) + + class(res) <- "influence.TVGEV" + res + +} + +##' @method print influence.TVGEV +##' @export +##' +print.influence.TVGEV <- function(x, ...) { + cat("Finite-sample influence function\n") + cat(sprintf("Observation number: %d (%s)\n", + x$which, x$date[x$which])) + cat(sprintf("what: %s\n", paste(x$names, collapse = ", "))) +} + +##' +##' @title Plot Method for \code{influence.TVGEV} Object +##' +##' @param x A \code{TVGEV} object. +##' +##' @param y Not used. +##' +##' @param ... Not used. +##' +##' @return Nothing. +##' +##' @note This method \code{autoplot} +##' +##' @seealso \code{\link{autoplot.TVGEV}}. +##' +##' @importFrom grDevices palette adjustcolor +##' @importFrom graphics matplot grid rug legend +##' @export +##' @method plot influence.TVGEV +##' +plot.influence.TVGEV <- function(x, y = NULL, ...) { + Pch <- c(16, 18, 21, 24, 22) + nc <- ifelse(is.matrix(sv <- x$statValue), ncol(sv), 1) + warning("Consider using the `autoplot` method instead") + grDevices::palette("Tableau") + matplot(x$obsValue, x$statValue, + pch = Pch[1:nc], + main = sprintf("Influence of obs. number %d, (%s)", + x$which, format(x$date[x$which])), + xlab = "obs. candidate value", ylab = "stat. value") + grid() + abline(v = x$y[x$which], + col = grDevices::adjustcolor( "gray", alpha.f = 0.7)) + rug(x$y) + legend("topleft", + pch = Pch[1:nc], + col = 1:nc, + legend = x$names) + grDevices::palette("default") +} + + +##' The influence curve is obtained by replacing the chosen +##' observation by a candidate value and then re-fitting the same +##' model and computing the statistic of interest: coefficient +##' estimate, quantile, ... 
The rug shows the \eqn{y_b} and the +##' vertical line shows the observation chosen for the analysis. +##' +##' @title Create a \code{ggplot} for a \code{influence.TVGEV} Object +##' +##' @param object An object with class \code{"influence.TVGEV"} as +##' created bye the \code{influnce} method for the \code{"TVGEV"} +##' class. +##' +##' @param ... Not used yet. +##' +##' @return An object inheriting grom \code{"ggplot"} showing the +##' (finite-sample) influence function for the observation and the +##' statistic that have been chosen. +##' +##' @export +##' @method autoplot influence.TVGEV +##' +##' @examples +##' library(ismev) +##' data(venice) +##' df <- data.frame(Date = as.Date(paste0(venice[ , "Year"], "-01-01")), +##' Sealevel = venice[ , "r1"] / 100) +##' fit0 <- TVGEV(data = df, date = "Date", response = "Sealevel") +##' autoplot(fit0) +##' RL_2050 <- function(model) { +##' c("RL_2050(100)" = quantile(model, prob = 0.99, date = "2050-01-01")[1]) +##' } +##' autoplot(fit0) +##' influence(fit0, what = RL_2050) |> autoplot() +##' ## fit with a linear time trend +##' fit1 <- TVGEV(data = df, date = "Date", response = "Sealevel", +##' design = polynomX(Date, degree = 1), loc = ~t1) +##' autoplot(fit1) +##' summary(fit1) +##' influence(fit1) |> autoplot() +##' influence(fit1, what = RL_2050) |> autoplot() +##' influence(fit1, which = "max", what = RL_2050) |> autoplot() +##' ## Influence curve for the estimated slope +##' slope <- function(model) { +##' c("slope\n(cm/100 yr)" = 100 * unname(coef(model)["mu_t1"])) +##' } +##' influence(fit1, which = "max", what = slope) |> autoplot() +##' influence(fit1, which = "min", what = slope) |> autoplot() +autoplot.influence.TVGEV <- function(object, ...) 
{ + + obsValue <- statValue <- name <- y <- NULL + + if (is.matrix(m <- object$statValue) && ncol(m) > 1) { + df <- list() + for (j in 1:ncol(object$statValue)) { + df[[j]] <- data.frame(obsValue = object$obsValue, + statValue = object$statValue[ , j], + name = object$names[j]) + } + df <- data.table::rbindlist(df) + } else { + df <- data.frame(obsValue = object$obsValue, + statValue = object$statValue, + name = object$names) + } + g <- ggplot(data = df) + + geom_point(mapping = aes(x = obsValue, + y = statValue, + group = name, + colour = name, + shape = name), size = 1.5) + + xlab("obs. value") + ylab("stat. value") + + geom_vline(xintercept = object$y[object$which], + colour = adjustcolor( "gray", alpha.f = 0.9)) + + ggtitle(sprintf("Influence curve of obs. number %d, (%s)", + object$which, format(object$date[object$which]))) + + geom_rug(data = data.frame(y = object$y), mapping = aes(x = y)) + + labs(colour = "stat.", shape = "stat.") + + scale_colour_manual(values = c("orangered", "springgreen3", "steelblue3")) + g + +} diff --git a/R/miscUtils.R b/R/miscUtils.R index 8c7ed3b..6e60426 100644 --- a/R/miscUtils.R +++ b/R/miscUtils.R @@ -308,3 +308,86 @@ testNumDeriv <- function(der, derNum, PREC = 0.05, (max(abs(der - derNum)) < small / PREC) || (max(abs(der - derNum) / (abs(der) + 1e-9)) < small / PREC) } + +##' +##' @title Check New Data for Prediction +##' +##' @param object A \code{TSGEV} object. +##' +##' @param newdata An atomic vector or a data frame providing the data +##' for a prediction. +##' +##' @param design Not used yet. +##' +##' @return A data frame with the variables required for prediction. +##' This must always contain a column with is name given by +##' \code{object$date} and with class \code{"Date"}. If +##' \code{object} contains \code{TSVars}, the corresponding +##' variables must be provided as well, but with no check on their +##' class. 
+##' +##' @keywords internal +##' +## @examples +## df <- within(TXMax_Dijon, Date <- as.Date(sprintf("%4d-01-01", Year))) +## fit <- TVGEV(data = df, response = "TXMax", date = "Date", +## design = breaksX(date = Date, breaks = "1970-01-01", degree = 1), +## loc = ~ t1 + t1_1970) +## try(checkNewdata.TVGEV(fit, data.frame(date = "2026-01-20"))) +## checkNewdata.TVGEV(fit, data.frame(Date = "2026-01-20")) +## +## \dontrun{ +## data(fremantle) +## df <- within(fremantle, Date <- as.Date(paste0(Year, "-01-01"))) +## fit <- TVGEV(data = df, response = "SeaLevel", date = "Date", +## design = polynomX(date = Date, degree = 1), +## loc = ~ t1 + SOI, trace = 2) +## try(checkNewdata.TVGEV(fit, data.frame(Date = "2026-01-20", x = 2))) +## checkNewdata.TVGEV(fit, data.frame(Date = "2026-01-20", SOI = 2.1)) +## } +## +checkNewdata.TVGEV <- function(object, newdata = NULL, design = FALSE) { + + + if (length(object$TSVars)) { + if (is.null(newdata)) { + newdata <- object$data + fDate <- object$fDate + } + newdata <- as.data.frame(newdata) + requiredVars <- c(object$date, object$TSVars) + notFound <- setdiff(requiredVars, names(newdata)) + if (length(notFound)) { + stop("Required variables not found: ", + paste(notFound, collapse = ", ")) + } + } else { + if (is.null(newdata)) { + newdata <- object$data[[object$date]] + fDate <- object$fDate + } + if (is.atomic(newdata)) { + newdata <- data.frame(as.Date(newdata)) + colnames(newdata) <- object$date + fDate <- format(newdata[[object$date]]) + } else { + newdata <- as.data.frame(newdata) + if (ncol(newdata) == 1) { + if (names(newdata) != object$date) + stop("Bad column name in 'newdata'.", + " Expecting '", object$date, "'.") + newdata[[object$date]] <- as.Date(newdata[[object$date]]) + } else { + if (is.na(m <- match(object$date, names(newdata)))) + stop("Bad column names in 'newdata'.", + " Expecting to find '", object$date, "'.") + newdata[[object$date]] <- as.Date(newdata[[object$date]]) + newdata <- newdata[, m, drop = 
FALSE] + } + fDate <- format(newdata[[object$date]]) + } + + } + + newdata +} diff --git a/R/predictTVGEV.R b/R/predictTVGEV.R index 96afd6c..0f67405 100644 --- a/R/predictTVGEV.R +++ b/R/predictTVGEV.R @@ -47,7 +47,7 @@ ##' ##' @param trace Integer level of verbosity. ##' -##' @param ... Arguments to be passed to \code{bs}. +##' @param ... Arguments to be passed to the \code{bs} method. ##' ##' @return A data frame or an array with the RL and confidence limits ##' for each combination of \emph{date}, \emph{period} and @@ -78,6 +78,7 @@ ##' ##' @importFrom utils getS3method ##' @importFrom nieve qGEV +##' @import data.table ##' @importFrom reshape2 melt ##' @importFrom stats predict qnorm ##' @method predict TVGEV @@ -99,6 +100,8 @@ predict.TVGEV <- function(object, trace = 1L, ...) { + Date <- Period <- NULL ## Avoid 'NOTE' in `R CMD check` + dots <- match.call(expand.dots = FALSE)$... confintMethod <- match.arg(confintMethod) @@ -117,7 +120,6 @@ predict.TVGEV <- function(object, } } - if (!all(object$isCst)) { if (is.null(newdate)) { message("Since 'object' is really time-varying, the Return Levels\n", @@ -152,17 +154,20 @@ predict.TVGEV <- function(object, nLevel <- length(level) if (is.null(newdate)) { - newdate <- selectDate(object$data[ , object$date]) + if (length(object$TSVars) == 0) { + newdate <- selectDate(object$data[ , object$date]) + } else { + stop("Since `object` uses TSVars, a data frame must be given ", + "with the columns '", object$date, "' and ", + paste(paste0("'", object$TSVars, "'"), collapse = ", "), ".") + + } } - - ## OLD CODE working with else - ## X <- object$X - ## n <- object$n - ## ind <- object$ind - ## newdate <- object$data[ , object$date] - ## fDate <- object$fDate - - n <- length(newdate) + ## newdate <- as.data.frame(newdate) + ## names(newdate) <- object$date + ## newdate <- data.frame(Id = 1:nrow(newdate), newdate) + newdate <- checkNewdata.TVGEV(object, newdata = newdate) + n <- nrow(newdate) ## special case for non TV model 
if (!all(object$isCst)) { @@ -170,7 +175,13 @@ predict.TVGEV <- function(object, X <- L$X } else X <- NULL - fDate <- format(newdate) + ## Attempt to make rownames. Not successful and maybe not useful... + fDate <- format(newdate[[object$date]]) + for (nm in object$TSVars) { + fDate <- paste0(fDate, ", nm = ", format(newdate[[nm]])) + } + ## ... so by-pass + fDate <- 1:n theta <- psi2theta(object, date = newdate, deriv = TRUE) @@ -186,7 +197,6 @@ predict.TVGEV <- function(object, scale = theta[ , 2], shape = theta[ , 3]) } - } else if (method == "delta") { @@ -398,7 +408,7 @@ predict.TVGEV <- function(object, theta <- psi2theta(model = object, psi = psi, - date = newdate[iDate], + date = newdate[iDate, ], deriv = TRUE, checkNames = FALSE) RL <- nieve::qGEV(prob, theta[ , 1], theta[ , 2], @@ -560,7 +570,8 @@ predict.TVGEV <- function(object, if (trace && (optNum > 1)) { cat(" \n") } - + + ## XXX remove 'try' resOpt <- try(nloptr::nloptr(x0 = psi0, eval_f = f, eval_g_ineq = g, @@ -655,48 +666,94 @@ predict.TVGEV <- function(object, if (out == "data.frame") { - if (requireNamespace("reshape2", quietly = TRUE)) { + if (method == "none") { + RLDT <- RL + newdate <- data.frame(Id = 1:nrow(newdate), newdate) + newdateDT <- as.data.table(newdate) + dim(RLDT) <- c(dim(RL), 1) + dimnames(RLDT) <- c(dimnames(RL), "Drop" = 1) + RLDT <- as.data.table(RLDT) + RLDT[["Drop"]] <- NULL + RLDT[ , Date := as.integer(Date)] + RLDT[ , Period := as.numeric(Period)] + setnames(RLDT, c("Date", "Period", "value"), c("Id", "Period", "Quant")) + RLDT <- RLDT[newdateDT, on = "Id"] + RLDT[["Id"]] <- NULL + RL <- as.data.frame(RLDT) + } else { + newdate <- data.frame(Id = 1:nrow(newdate), newdate) + RLDT <- as.data.table(RL) + newdateDT <- as.data.table(newdate) + RLDT[["Id"]] <- as.integer(RLDT[["Date"]]) + RLDT[["Date"]] <- NULL - if (method == "none") { - RL <- melt(RL, value.name = "Quant", varnames = c("Date", "Period")) - RL$Date <- as.Date(RL$Date) - } else { - ## 
==================================================================== - ## UGGLY CODE: there must be a simpler and more - ## efficient way of doing that. The problem is that we - ## want to drop the " Type" dimension but not the - ## "Level" dimension even when it has no extension - ## ==================================================================== - - df <- list() - for (nm in c("Quant", "L", "U")) { - RL1 <- RL[ , , nm, , drop = FALSE] - df[[nm]] <- melt(RL1, - value.name = nm, - varnames = c("Date", "Period", "Type", "Level")) - } - RL <- data.frame(df[["Quant"]][, c("Date", "Period", "Level", "Quant")], - L = df[["L"]]$L, U = df[["U"]]$U) - - RL$Date <- as.Date(RL$Date) - ind <- with(RL, order(Date, Level, Period)) - RL <- RL[ind, , drop = FALSE] - } + ## RLDT[["Id"]] <- as.integer(RLDT[[object$date]]) + ## Merge + RLDT <- RLDT[newdateDT, on = "Id"] + RLDT <- dcast(RLDT, ... ~ Type) - } else { - stop("the package 'reshape2' could not be used") + RL <- as.data.frame(RLDT) + RL$Period <- as.numeric(RL$Period) + RLDT[["Id"]] <- NULL } class(RL) <- c("predict.TVGEV", "data.frame") + } ## use fDate to display information in title? attr(RL, "title") <- "Conditional Return Levels" attr(RL, "diagno") <- diagno attr(RL, "type") <- "conditional" + attr(RL, "TSVars") <- object$TSVars return(RL) - + + ## STEP TO GET RID of the 'reshape2' package + ## ##' @importFrom reshape2 melt + + ## if (requireNamespace("reshape2", quietly = TRUE)) { + + ## if (method == "none") { + ## RL <- melt(RL, value.name = "Quant", varnames = c("Date", "Period")) + ## RL$Date <- as.Date(RL$Date) + ## } else { + ## ## ==================================================================== + ## ## UGGLY CODE: there must be a simpler and more + ## ## efficient way of doing that. 
The problem is that we + ## ## want to drop the " Type" dimension but not the + ## ## "Level" dimension even when it has no extension + ## ## ==================================================================== + ## + ## df <- list() + ## for (nm in c("Quant", "L", "U")) { + ## RL1 <- RL[ , , nm, , drop = FALSE] + ## df[[nm]] <- melt(RL1, + ## value.name = nm, + ## varnames = c("Date", "Period", "Type", "Level")) + ## } + ## RL <- data.frame(df[["Quant"]][, c("Date", "Period", "Level", "Quant")], + ## L = df[["L"]]$L, U = df[["U"]]$U) + ## + ## RL$Date <- as.Date(RL$Date) + ## ind <- with(RL, order(Date, Level, Period)) + ## RL <- RL[ind, , drop = FALSE] + ## } + ## + ## } else { + ## stop("the package 'reshape2' could not be used") + ## } + ## + ## class(RL) <- c("predict.TVGEV", "data.frame") + ## } + + ## ## use fDate to display information in title? + ## attr(RL, "title") <- "Conditional Return Levels" + ## attr(RL, "diagno") <- diagno + ## attr(RL, "type") <- "conditional" + + ## return(RL) + } @@ -896,6 +953,7 @@ autoplot.predict.TVGEV <- function(object, bw = TRUE, ... ) { } confLev <- attr(object, "confLevel") + TSVars <- attr(object, "TSVars") if (nd <- length(unique(object$Date)) > 6L) { stop(nd, "dates found in 'object'. The maximum allowed is 6") @@ -908,27 +966,27 @@ autoplot.predict.TVGEV <- function(object, bw = TRUE, ... 
) { if (!is.null(object$L) && !is.null(object$U)) { g1 <- g1 + geom_ribbon(mapping = aes(x = Period, ymin = L, ymax = U, group = Level, - ## colour = Level, - fill = Level), + ## colour = Level, + fill = Level), ## group = type, alpha = 0.2) - if (bw) { - g1 <- g1 + - geom_line(mapping = aes(x = Period, y = L, group = Level, - linetype = Level), - colour = "gray20", - alpha = 0.8) + if (bw) { + g1 <- g1 + + geom_line(mapping = aes(x = Period, y = L, group = Level, + linetype = Level), + colour = "gray20", + alpha = 0.8) g1 <- g1 + geom_line(mapping = aes(x = Period, y = U, group = Level, linetype = Level), colour = "gray20", alpha = 0.8) - } else { - g1 <- g1 + geom_line(mapping = aes(x = Period, y = L, group = Level), - alpha = 0.2) - g1 <- g1 + geom_line(mapping = aes(x = Period, y = U, group = Level), - alpha = 0.2) - } + } else { + g1 <- g1 + geom_line(mapping = aes(x = Period, y = L, group = Level), + alpha = 0.2) + g1 <- g1 + geom_line(mapping = aes(x = Period, y = U, group = Level), + alpha = 0.2) + } } @@ -958,8 +1016,10 @@ autoplot.predict.TVGEV <- function(object, bw = TRUE, ... ) { g1 <- g1 + xlab("Period") + ylab("Quantile") - if (attr(object, "type") == "conditional") { - g1 <- g1 + facet_wrap( ~ Date) + if (attr(object, "type") == "conditional") { + fm <- as.formula(paste("~", paste(c(" Date", TSVars), collapse = "+"))) + print(fm) + g1 <- g1 + facet_wrap(fm, labeller = label_both) } g1 <- g1 + ggtitle(attr(object, "title")) diff --git a/R/predictUncond.R b/R/predictUncond.R index 73d068c..8871ffd 100644 --- a/R/predictUncond.R +++ b/R/predictUncond.R @@ -136,6 +136,10 @@ predictUncond <- function(object, trace = 0, ...) { + if (length(object$TSVars)) { + stop("'object' includes TSVars. 
It can not be used for unconditional", + " prediction") + } probL <- (1 - level) / 2 probU <- 1 - probL @@ -210,7 +214,7 @@ predictUncond <- function(object, } if (confintMethod == "none") { - + RL <- array(NA, dim = c(np, 1), dimnames = list("period" = period, "Quant")) diff --git a/R/residTVGEV.R b/R/residTVGEV.R index cfd24ef..00cd477 100644 --- a/R/residTVGEV.R +++ b/R/residTVGEV.R @@ -12,23 +12,25 @@ ##' @param object A \code{TVGEV} object. ##' ##' @param type The approximate distribution wanted. The choices -##' \code{c("unif", "exp", "gumbel")} correspond to the standard -##' uniform, the standard exponential and the standard Gumbel -##' distributions. Partial matching is allowed. +##' \code{c("gumbel", "exp", "unif", "gev4")} correspond to the +##' following distributions: standard Gumbel, the standard +##' exponential, the standard uniform, and the GEV with shape +##' \eqn{-0.25} distributions. Partial matching is allowed. ##' ##' @param ... Not used yet. ##' ##' @return A vector of generalised residuals which should ##' \emph{approximately} be independent and \emph{approximately} ##' follow the target distribution: standard Gumbel, standard -##' exponential or standard uniform, depending on the value of +##' exponential or standard uniform, ... depending on the value of ##' \code{type}. ##' ##' @note The upper 95\% quantile of the standard Gumbel and the ##' standard exponential is close to \eqn{3} which can be used to ##' gauge "large residuals". Using \code{type = "gumbel"} seems ##' better to diagnose abnormally small residuals as may result -##' from abnormally small block maxima. +##' from abnormally small block maxima. The generalised residuals +##' have no physical dimension. ##' ##' @references Cox, D.R. and Snell E.J. (1968) "A General Definition ##' of Residuals". \emph{JRSS Ser. B}, \bold{30}(2), pp. 
248-275, @@ -63,26 +65,37 @@ ##' plot(mu, e, type = "p", pch = 16, col = "darkcyan", ##' main = "generalised residuals against 'loc'") residuals.TVGEV <- function(object, - type = c("gumbel", "exp", "unif"), + type = c("gumbel", "exp", "unif", "gev4"), ...) { type <- match.arg(type) Y <- object$data[ , object$response] theta <- psi2theta(model = object, psi = NULL) + ## e <- nieve::pGEV(Y, loc = theta[, 1L], scale = theta[, 2L], + ## shape = theta[, 3L], lower.tail = FALSE) + + ## Apply the (fitted) GEV distribution) to transform to uniform + ## in an increasing fashion then apply the target quantile function e <- nieve::pGEV(Y, loc = theta[, 1L], scale = theta[, 2L], - shape = theta[, 3L], lower.tail = FALSE) + shape = theta[, 3L], lower.tail = TRUE) lims95 <- c(0.025, 0.975) if (type == "exp") { - e <- -log(e) + ## e <- -log(e) + ## lims95 <- -log(1 - lims95) + e <- -log(1 - e) lims95 <- -log(1 - lims95) } else if (type == "gumbel") { - e <- log(-log(e)) - lims95 <- log(-log(1 - lims95)) + ## e <- log(-log(e)) + ## lims95 <- log(-log(1 - lims95)) + e <- -log(-log(e)) + lims95 <- -log(-log(lims95)) + } else if (type == "gev4") { + e <- nieve::qGEV(e, loc = 0, scale = 1, shape = -0.25) + lims95 <- nieve::qGEV(lims95, loc = 0, scale = 1, shape = -0.25) } - names(e) <- rownames(theta) attr(e, "date") <- object$data[ , object$date] attr(e, "type") <- type diff --git a/R/statsTVGEV.R b/R/statsTVGEV.R index 04c19e1..20757ac 100644 --- a/R/statsTVGEV.R +++ b/R/statsTVGEV.R @@ -35,6 +35,10 @@ quantile.TVGEV <- function(x, probs = c(0.90, 0.95, 0.99), date = NULL, psi = NULL, ...) 
{ + + if (length(x$TSVars)) { + warning("'x' includes TSVars") + } ## control (from quantile.default) if (any(is.na(probs))) stop ("NA not allowed yet in 'probs'") @@ -42,17 +46,17 @@ quantile.TVGEV <- function(x, probs = c(0.90, 0.95, 0.99), eps <- 100 * .Machine$double.eps if (any(probs < -eps | probs > 1 + eps)) stop("'probs' outside [0,1]") - - if (is.null(date)) date <- x$fDate + if (is.null(psi)) psi <- x$estimate - - n <- length(date) - theta <- psi2theta(model = x, psi = psi, date = date) + newdate <- checkNewdata.TVGEV(object = x, newdata = date) + fDate <- format(newdate[[x$date]]) + theta <- psi2theta(model = x, psi = psi, date = newdate) + n <- nrow(theta) fProbs <- formatPerc(probs) quant <- array(NA, dim = c(n, length(probs)), - dimnames = list(date = format(date), + dimnames = list(date = fDate, paste("Q", fProbs, sep = ""))) for (i in seq_along(probs)) { @@ -61,7 +65,7 @@ quantile.TVGEV <- function(x, probs = c(0.90, 0.95, 0.99), shape = theta[ , 3L]) } - rownames(quant) <- attr(quant, "date") <- format(date) + attr(quant, "date") <- fDate attr(quant, "collabels") <- fProbs attr(quant, "label") <- "quantile" class(quant) <- c("bts", "matrix") @@ -106,7 +110,7 @@ quantile.TVGEV <- function(x, probs = c(0.90, 0.95, 0.99), ##' functions are plotted with the x-y axes flipped in order to enhance ##' the time-varying feature of the model. ##' -##' @seealso \code{\link{GEV}} for the density and cdf of the GEV +##' @seealso \code{\link[nieve]{GEV}} for the density and cdf of the GEV ##' distribution, \code{\link{plot.predict.TVGEV}} for the Return Level ##' plot. ##' @@ -125,33 +129,38 @@ density.TVGEV <- function(x, xValue = NULL, date = NULL, psi = NULL, log = FALSE, ...) 
{ - - if (is.null(date)) { - date <- selectDate(x$fDate) + + if (length(x$TSVars)) { + warning("'x' includes TSVars") } - if (is.null(psi)) psi <- x$estimate + + ## avoid using two many dates + if (is.null(date)) date <- selectDate(x$fDate) + newdate <- checkNewdata.TVGEV(object = x, newdata = date) + fDate <- format(newdate[[x$date]]) + n <- nrow(newdate) - n <- length(date) - theta <- psi2theta(model = x, psi = psi, date = date) + if (is.null(psi)) psi <- x$estimate + theta <- psi2theta(model = x, psi = psi, date = newdate) if (is.null(xValue)) { - q <- quantile(x, probs = c(0.001, 0.999), date = date) + q <- quantile(x, probs = c(0.001, 0.999), date = newdate) rq <- range(q) xValue <- seq(from = rq[1L], to = rq[2L], length.out = 200) } fxValue <- format(xValue) dens <- array(NA, dim = c(n, length(xValue)), - dimnames = list(date = format(date), - paste("d", fxValue, sep = ""))) + dimnames = list(date = fDate, + paste("d", fxValue, sep = ""))) for (i in seq_along(xValue)) { dens[ , i] <- nieve::dGEV(xValue[i], loc = theta[ , 1L], scale = theta[, 2L], shape = theta[, 3L], ...) } - + attr(dens, "x") <- xValue - rownames(dens) <- attr(dens, "date") <- format(date) + rownames(dens) <- attr(dens, "date") <- fDate attr(dens, "collabels") <- fxValue attr(dens, "label") <- "density" class(dens) <- c("bfts", "bts", "matrix") @@ -175,17 +184,23 @@ cdf.TVGEV <- function(x, date = NULL, psi = NULL, log = FALSE, ...) 
{ - - if (is.null(date)) { - date <- selectDate(x$fDate) + + if (length(x$TSVars)) { + warning("'object' includes TSVars") } - if (is.null(psi)) psi <- x$estimate + + ## avoid using two many dates + if (is.null(date)) date <- selectDate(x$fDate) + newdate <- checkNewdata.TVGEV(object = x, newdata = date) + fDate <- format(newdate[[x$date]]) + n <- nrow(newdate) - n <- length(date) - theta <- psi2theta(model = x, psi = psi, date = date) + if (is.null(psi)) psi <- x$estimate + + theta <- psi2theta(model = x, psi = psi, date = newdate) if (is.null(qValue)) { - q <- quantile(x, probs = c(0.005, 0.995), date = date) + q <- quantile(x, probs = c(0.005, 0.995), date = newdate) rq <- range(q) qValue <- seq(from = rq[1L], to = rq[2L], length.out = 200) } @@ -202,7 +217,7 @@ cdf.TVGEV <- function(x, } attr(cdf, "x") <- qValue - rownames(cdf) <- attr(cdf, "date") <- format(date) + rownames(cdf) <- attr(cdf, "date") <- fDate attr(cdf, "collabels") <- fqValue attr(cdf, "label") <- "cdf" class(cdf) <- c("bfts", "bts", "matrix") @@ -247,19 +262,26 @@ mean.TVGEV <- function(x, date = NULL, psi = NULL, ...) 
{ + if (length(x$TSVars)) { + warning("'x' includes TSVars") + } + ## Euler-Mascheroni constant gam <- - digamma(1) - if (is.null(date)) date <- x$fDate if (is.null(psi)) psi <- x$estimate - - n <- length(date) - theta <- psi2theta(model = x, psi = psi, date = date) + + newdate <- checkNewdata.TVGEV(object = x, newdata = date) + fDate <- format(newdate[[x$date]]) + n <- nrow(newdate) + + theta <- psi2theta(model = x, psi = psi, date = newdate) E <- theta[ , 1, drop = FALSE] + gam * theta[ , 2, drop = FALSE] colnames(E) <- "expectation" ind <- theta[ , 3] != 0.0 E[ind, 1L] <- theta[ , 1] + theta[ind, 2] * (gamma(1 - theta[ind, 3]) - 1.0 ) / theta[ind, 3] - rownames(E) <- attr(E, "date") <- format(date) + + rownames(E) <- attr(E, "date") <- fDate attr(E, "collabels") <- "expectation" attr(E, "label") <- "expectation" class(E) <- c("bts", "matrix") @@ -281,8 +303,10 @@ moment.TVGEV <- function(x, which = "variance", date = NULL, psi = NULL, ...) { - - + if (length(x$TSVars)) { + warning("'x' includes TSVars") + } + if (which != "variance") { stop("'which' can only be \"variance\" for now.") } @@ -290,14 +314,16 @@ moment.TVGEV <- function(x, ## Euler-Mascheroni constant gam <- - digamma(1) - if (is.null(date)) date <- x$fDate if (is.null(psi)) psi <- x$estimate - n <- length(date) - theta <- psi2theta(model = x, psi = psi, date = date) + newdate <- checkNewdata.TVGEV(object = x, newdata = date) + fDate <- format(newdate[[x$date]]) + n <- nrow(newdate) + + theta <- psi2theta(model = x, psi = psi, date = newdate) M <- array(pi * pi / 6, dim = c(n, 1L), - dimnames = list(date = format(date), + dimnames = list(date = fDate, "moment" = "variance")) ## \xi >= 1/2 ind <- (theta[ , 3L] >= 0.5) @@ -309,7 +335,7 @@ moment.TVGEV <- function(x, (gamma(1 - 2 * theta[ind, 3L]) - gamma(1 - theta[ind, 3L])^2 ) / theta[ind, 3] / theta[ind, 3] - attr(M, "date") <- format(date) + attr(M, "date") <- fDate attr(M, "collabels") <- "variance" attr(M, "label") <- "variance" class(M) <- 
c("bts", "matrix") diff --git a/TODO.md b/TODO.md index adbf3d3..800665e 100644 --- a/TODO.md +++ b/TODO.md @@ -4,8 +4,11 @@ This is a temporary reminder list by YD. Not a planning for new development! -- [ ] Make an alias of the `anova` method under a better name making - clear that this is for a Likelihood Ratio test. +- [ ] Make the `predictUncond` function deprecated? + +- [ ] **`anova` name** Make an alias of the `anova` method under a + better name making clear that this is for a Likelihood Ratio + test. May be `LR.test`. - [ ] **Profile Likelihood**. Speed up the computation of the profile likelihood intervals in `quantMax.TVGEV`. The initial values for @@ -18,6 +21,13 @@ development! *contours* for some or all of the couples of parameters in a fitted `TVGEV` model. +- [] **Return level plots** Allow the return level plot to use an + horizontal axis with a probability of exceedance (and a Gumbel + scale) rather than a return period as done for the quantile of + the maximum. So the plot for a conditional return level would be + nearly identical to that given for the quantile of the maximum + on a one-year period. + - [ ] **Improve Numerical Treatment**. Improve by using the QR method on the design matrices. So the vector of parameters will be internally $\mathbf{R}\boldsymbol{\psi}$ instead of @@ -31,12 +41,12 @@ development! check, and an action for the build to produce a release when a tag is used on for the commit. -- [x] **Vignettes**. In the Introduction vignette enhance the part on +- [x] **Vignettes**. In the *Introduction* vignette enhance the part on the quantile of the maximum by adding profile likelihood intervals. Complete the citations and introduce the expression *Design Life Level*. -- [x] **Vignettes**. In the Introduction vignette, include the fixes +- [x] **Vignettes**. In the *Introduction* vignette, include the fixes that were by Jesper Rydén (mail 2024-03-06). - [ ] **README** file. 
Make the installation part shortest, and include @@ -44,3 +54,9 @@ development! Google Colab notebook. - [ ] **Hexagon sticker**? + +- [ ] Change the name of `bs` or replace this generic function by a + generic bootstrapping function from another package. + +- [ ] Get rid of the dependence on the **reshape2** package by replacing + `reshape2::melt` by `data.table::melt`. diff --git a/man/MLE.TVGEV.Rd b/man/MLE.TVGEV.Rd index 62f9fa9..c7306f5 100644 --- a/man/MLE.TVGEV.Rd +++ b/man/MLE.TVGEV.Rd @@ -8,7 +8,7 @@ MLE.TVGEV(object, y = NULL, psi0 = NULL, estim = c("optim", "nloptr"), coefLower = if (estim != "nloptr") numeric(0) else c("xi_0" = -0.9), coefUpper = if (estim != "nloptr") numeric(0) else c("xi_0" = 2.0), - parTrack = FALSE) + parTrack = FALSE, trace = 0) } \arguments{ \item{object}{A (possibly incomplete) \code{TVGEV} object.} @@ -20,13 +20,23 @@ MLE.TVGEV(object, y = NULL, \item{estim}{Character giving the optimisation to be used.} \item{coefLower, coefUpper}{Numeric vectors or \code{NULL} giving -bounds on the parameters. These are used overwrite the "normal" -values which are \code{-Inf} and \code{Inf}. These arguments are -only used when \code{estim} is \code{"nloptr"}.} +bounds on the parameters. These are used to overwrite the +"normal" values which are \code{-Inf} and \code{Inf}. These +arguments are only used when \code{estim} is \code{"nloptr"}. +These vectors must have \emph{suitably named} elements. It +is not necessary to give values for all the elements: the +"normal" infinite values will continue to hold for the +parameters for which the bounds are not given. When the scale +parameter is constant it may help to give a positive value for +the element \code{"sigma_0"} of \code{coefLower}. When the +shape is constant it may help to give bounds such as +\code{-0.5} and \code{0.5} for the element \code{"xi_0"}.} \item{parTrack}{\code{Logical}. 
If \code{TRUE}, all parameters at which an evaluation of the log-Likelihood will be stored and returned.} + +\item{trace}{Integer level of verbosity.} } \value{ A list with elements that can be copied into those diff --git a/man/autoplot.influence.TVGEV.Rd b/man/autoplot.influence.TVGEV.Rd new file mode 100644 index 0000000..a9b7d21 --- /dev/null +++ b/man/autoplot.influence.TVGEV.Rd @@ -0,0 +1,54 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/influenceTVGEV.R +\name{autoplot.influence.TVGEV} +\alias{autoplot.influence.TVGEV} +\title{Create a \code{ggplot} for an \code{influence.TVGEV} Object} +\usage{ +\method{autoplot}{influence.TVGEV}(object, ...) +} +\arguments{ +\item{object}{An object with class \code{"influence.TVGEV"} as +created by the \code{influence} method for the \code{"TVGEV"} +class.} + +\item{...}{Not used yet.} +} +\value{ +An object inheriting from \code{"ggplot"} showing the + (finite-sample) influence function for the observation and the + statistic that have been chosen. +} +\description{ +The influence curve is obtained by replacing the chosen +observation by a candidate value and then re-fitting the same +model and computing the statistic of interest: coefficient +estimate, quantile, ... The rug shows the \eqn{y_b} and the +vertical line shows the observation chosen for the analysis. 
+} +\examples{ +library(ismev) +data(venice) +df <- data.frame(Date = as.Date(paste0(venice[ , "Year"], "-01-01")), + Sealevel = venice[ , "r1"] / 100) +fit0 <- TVGEV(data = df, date = "Date", response = "Sealevel") +autoplot(fit0) +RL_2050 <- function(model) { + c("RL_2050(100)" = quantile(model, prob = 0.99, date = "2050-01-01")[1]) +} +autoplot(fit0) +influence(fit0, what = RL_2050) |> autoplot() +## fit with a linear time trend +fit1 <- TVGEV(data = df, date = "Date", response = "Sealevel", + design = polynomX(Date, degree = 1), loc = ~t1) +autoplot(fit1) +summary(fit1) +influence(fit1) |> autoplot() +influence(fit1, what = RL_2050) |> autoplot() +influence(fit1, which = "max", what = RL_2050) |> autoplot() +## Influence curve for the estimated slope +slope <- function(model) { + c("slope\n(cm/100 yr)" = 100 * unname(coef(model)["mu_t1"])) +} +influence(fit1, which = "max", what = slope) |> autoplot() +influence(fit1, which = "min", what = slope) |> autoplot() +} diff --git a/man/checkNewdata.TVGEV.Rd b/man/checkNewdata.TVGEV.Rd new file mode 100644 index 0000000..cbb8ec0 --- /dev/null +++ b/man/checkNewdata.TVGEV.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/miscUtils.R +\name{checkNewdata.TVGEV} +\alias{checkNewdata.TVGEV} +\title{Check New Data for Prediction} +\usage{ +checkNewdata.TVGEV(object, newdata = NULL, design = FALSE) +} +\arguments{ +\item{object}{A \code{TSGEV} object.} + +\item{newdata}{An atomic vector or a data frame providing the data +for a prediction.} + +\item{design}{Not used yet.} +} +\value{ +A data frame with the variables required for prediction. + This must always contain a column with is name given by + \code{object$date} and with class \code{"Date"}. If + \code{object} contains \code{TSVars}, the corresponding + variables must be provided as well, but with no check on their + class. 
+} +\description{ +Check New Data for Prediction +} +\keyword{internal} diff --git a/man/density.TVGEV.Rd b/man/density.TVGEV.Rd index 742f80d..b3ced79 100644 --- a/man/density.TVGEV.Rd +++ b/man/density.TVGEV.Rd @@ -55,7 +55,7 @@ plot(F) } \seealso{ -\code{\link{GEV}} for the density and cdf of the GEV +\code{\link[nieve]{GEV}} for the density and cdf of the GEV distribution, \code{\link{plot.predict.TVGEV}} for the Return Level plot. } diff --git a/man/influence.TVGEV.Rd b/man/influence.TVGEV.Rd new file mode 100644 index 0000000..c4404a2 --- /dev/null +++ b/man/influence.TVGEV.Rd @@ -0,0 +1,93 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/influenceTVGEV.R +\name{influence.TVGEV} +\alias{influence.TVGEV} +\title{Diagnostic of Influence for \code{TVGEV} Objects} +\usage{ +\method{influence}{TVGEV}( + model, + what = function(model) coef(model)["xi_0"], + which = "min", + how = "replace", + ... +) +} +\arguments{ +\item{model}{A \code{TVGEV} object.} + +\item{what}{A function with one argument \code{object} defining or +extracting the (numeric) quantity for which the influence is +to be analysed. So, \code{object} must be a \code{TVGEV} +object. See \bold{Examples}. This function can return numeric +vector in which case several influence curves will be +computed. It is then wise to make the function return a vector +with named elements.} + +\item{which}{Identifies the observation which influence will be +analysed. Can be: an integer giving the observation, the +character \code{"min"}, the character \code{"max"}, a +character that can be coerced to a date, or an object with +class \code{"Date"} that identifies the observation. This +argument must define an observation existing in the data frame +used to create the object. So it can not be used e.g. to +define a future date.} + +\item{how}{Character specifying the type of (finite-sample) +influence curve to use. 
For now only the only possibility is +\code{"replace"}, see Section 2.1 of the book by Hampel et al +(1996). The chosen observation is replaced by a value \eqn{x} +and the quantity of interest is computed for each value of +\eqn{x}.} + +\item{...}{Not used.} +} +\value{ +A named list of vectors. The element \code{obsValue} + contains the candidate values for the chosen observation and + \code{statValue} contains the corresponding values of the + specific statistic defined with \code{what}. +} +\description{ +This function provides some indications on the influence of a +specific observation used in the fit of a \code{TVGEV} object. For +several candidate values of the chosen observation, a \code{TVGEV} +is re-fitted with the observed value replaced by the candidate +value, and the quantity of interest as defined by \code{what} is +computed. +} +\details{ +By suitably defining the function \code{what}, one can trace e.g., +the estimate of a quantile with given probability, the +corresponding upper confidence limit, the upper end-point of the +distribution, ... and more. +} +\note{ +For \code{TVGEV} models with a positive GEV shape, the + smallest observation may have a strong influence on the + result, even if the model is stationary. +} +\examples{ +example(TVGEV) +influ <- influence(res2, which = "min") +autoplot(influ) +influ <- influence(res2, which = as.Date("2015-01-01")) +autoplot(influ) +RL_2050 <- function(model) { + c("RL_2050(100)" = quantile(model, prob = 0.99, date = "2050-01-01")[1]) +} +influence(res2, what = RL_2050) |> autoplot() +influence(res2, what = RL_2050, which = "2003-01-01") |> autoplot() +RLs_2050 <- function(model) { + res <- c(quantile(model, prob = 0.99, date = "2050-01-01")[1], + quantile(model, prob = 0.999, date = "2050-01-01")[1]) + names(res) <- paste0("RL", c(100, 1000)) + res +} +influ <- influence(res2, what = RLs_2050, which = "2003-01-01") +autoplot(influ) +} +\references{ +Hampel, F.R., Ronchetti, E.M., Rousseeuw, P.J. 
and Stahel + W.A. (1996) \emph{Robust Statistics: The Approach Based on + Influence Functions}. Wiley. +} diff --git a/man/plot.influence.TVGEV.Rd b/man/plot.influence.TVGEV.Rd new file mode 100644 index 0000000..9fa352b --- /dev/null +++ b/man/plot.influence.TVGEV.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/influenceTVGEV.R +\name{plot.influence.TVGEV} +\alias{plot.influence.TVGEV} +\title{Plot Method for \code{influence.TVGEV} Object} +\usage{ +\method{plot}{influence.TVGEV}(x, y = NULL, ...) +} +\arguments{ +\item{x}{An \code{influence.TVGEV} object.} + +\item{y}{Not used.} + +\item{...}{Not used.} +} +\value{ +Nothing. +} +\description{ +Plot Method for \code{influence.TVGEV} Object +} +\note{ +This method relies on \code{autoplot}. +} +\seealso{ +\code{\link{autoplot.TVGEV}}. +} diff --git a/man/predict.TVGEV.Rd b/man/predict.TVGEV.Rd index 2d9f7d4..8542d40 100644 --- a/man/predict.TVGEV.Rd +++ b/man/predict.TVGEV.Rd @@ -45,7 +45,7 @@ the RL computed with the estimated parameters in \code{object}.} \item{trace}{Integer level of verbosity.} -\item{...}{Arguments to be passed to \code{bs}.} +\item{...}{Arguments to be passed to the \code{bs} method.} } \value{ A data frame or an array with the RL and confidence limits diff --git a/man/psi2theta.TVGEV.Rd b/man/psi2theta.TVGEV.Rd index 79e6db0..ac0ae2b 100644 --- a/man/psi2theta.TVGEV.Rd +++ b/man/psi2theta.TVGEV.Rd @@ -3,7 +3,7 @@ \name{psi2theta.TVGEV} \alias{psi2theta.TVGEV} \title{Compute the Matrix of GEV Parameters from the Vector of -Model Parameters 'psi' and the Data} + Model Parameters 'psi' and the Data} \usage{ \method{psi2theta}{TVGEV}( model, @@ -21,12 +21,17 @@ Model Parameters 'psi' and the Data} vector of estimates in \code{psi} will be used.} \item{date}{A vector that can be coerced to the \code{"Date"} -class. Each element will correspond to a row of the matrix of GEV -parameters.} +class. Each element will correspond to a row of the matrix of +GEV parameters. 
If \code{model} contains TS covariables, this +must instead be a data frame containing the suitable +covariables, along with a column with name \code{model$name} +and class \code{Date}. If \code{date} is \code{NULL}, the date +and possibly the TS covariables are extracted from +\code{model} and correspond to the information used in the +fit.} \item{deriv}{Logical. If \code{TRUE} the returned object has an -attribute names \code{"gradient"} which is the gradient -array.} +attribute named \code{"gradient"} which is the gradient array.} \item{checkNames}{Logical. If \code{TRUE} it will be checked that the names of \code{psi} match the \code{parNames} element of @@ -35,20 +40,21 @@ the names of \code{psi} match the \code{parNames} element of \item{\dots}{Not used yet.} } \value{ -A matrix with \code{length(date)} rows and \code{3} colums. -The columns contain the location, the scale and the shape GEV parameters -in that order. +A matrix with \code{length(date)} rows and \code{3} + columns. The columns contain the location, the scale and the + shape GEV parameters in that order. } \description{ -Compute the GEV parameters for the marginal distributions of a -\code{TVGEV} model. +Compute the GEV parameters for the marginal + distributions of a \code{TVGEV} model. } \note{ The \code{deriv} formal argument is mainly here to maintain -a compatibility with the \code{NSGEV} method. However, the -derivatives w.r.t. the parameter vector \code{psi} are obtained -simply by using the \code{X} elements of the object \code{model}. + a compatibility with the \code{NSGEV} method. However, the + derivatives w.r.t. the parameter vector \code{psi} are + obtained simply by using the \code{X} elements of the object + \code{model}. } \seealso{ -The \code{\link{GEV}} for the GEV probability functions. +The \code{\link[nieve]{GEV}} for the GEV probability functions. 
} diff --git a/man/residuals.TVGEV.Rd b/man/residuals.TVGEV.Rd index 5afd43e..97ce21a 100644 --- a/man/residuals.TVGEV.Rd +++ b/man/residuals.TVGEV.Rd @@ -5,15 +5,16 @@ \alias{resid.TVGEV} \title{Generalised Residuals for a \code{TVGEV} Model} \usage{ -\method{residuals}{TVGEV}(object, type = c("gumbel", "exp", "unif"), ...) +\method{residuals}{TVGEV}(object, type = c("gumbel", "exp", "unif", "gev4"), ...) } \arguments{ \item{object}{A \code{TVGEV} object.} \item{type}{The approximate distribution wanted. The choices -\code{c("unif", "exp", "gumbel")} correspond to the standard -uniform, the standard exponential and the standard Gumbel -distributions. Partial matching is allowed.} +\code{c("gumbel", "exp", "unif", "gev4")} correspond to the +following distributions: standard Gumbel, the standard +exponential, the standard uniform, and the GEV with shape +\eqn{-0.25} distributions. Partial matching is allowed.} \item{...}{Not used yet.} } @@ -21,7 +22,7 @@ distributions. Partial matching is allowed.} A vector of generalised residuals which should \emph{approximately} be independent and \emph{approximately} follow the target distribution: standard Gumbel, standard - exponential or standard uniform, depending on the value of + exponential or standard uniform, ... depending on the value of \code{type}. } \description{ @@ -37,7 +38,8 @@ The upper 95\% quantile of the standard Gumbel and the standard exponential is close to \eqn{3} which can be used to gauge "large residuals". Using \code{type = "gumbel"} seems better to diagnose abnormally small residuals as may result - from abnormally small block maxima. + from abnormally small block maxima. The generalised residuals + have no physical dimension. 
} \examples{ df <- within(TXMax_Dijon, Date <- as.Date(sprintf("\%4d-01-01", Year))) diff --git a/vignettes/NSGEV.bib b/vignettes/NSGEV.bib index 97d3c16..bf423fb 100644 --- a/vignettes/NSGEV.bib +++ b/vignettes/NSGEV.bib @@ -39,13 +39,13 @@ @book{BeirlantEtAl_StatOfExtremes @Article{Rpack_evd, title = {{evd: Extreme Value Distributions}}, author = {Stephenson, Alec G.}, - journal = {{The R Journal}}, + journal = {{R News}}, year = {2002}, volume = {2}, number = {2}, - pages = {0}, + pages = {31-32}, month = {June}, - url = {http://CRAN.R-project.org/doc/Rnews/} + note = {https://journal.r-project.org/articles/RN-2002-015/} } %%============================================================================ @Manual{RPack_ismev, diff --git a/vignettes/NSGEVCompDet.Rnw b/vignettes/NSGEVCompDet.Rnw index 467b25a..95baf9d 100644 --- a/vignettes/NSGEVCompDet.Rnw +++ b/vignettes/NSGEVCompDet.Rnw @@ -1,6 +1,6 @@ \documentclass[11pt,a4paper]{article} -%\VignetteIndexEntry{Mauna Loa Example from Rasmussen and Williams' Book} +%\VignetteIndexEntry{Computing Details for NSGEV} %\VignetteEngine{knitr::knitr} \usepackage{makeidx} \usepackage{amsmath,amssymb,amsthm} diff --git a/vignettes/NSGEVTS.Rmd b/vignettes/NSGEVTS.Rmd new file mode 100644 index 0000000..93a0938 --- /dev/null +++ b/vignettes/NSGEVTS.Rmd @@ -0,0 +1,429 @@ +--- + title: "Timeseries Covariables with **NSGEV**" + author: Yves Deville + date: "`r Sys.Date()`" + output: + rmarkdown::html_vignette: + toc: true + vignette: > + %\VignetteIndexEntry{Timeseries Covariates with NSGEV} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + bibliography: NSGEV.bib + linkcolor: blue +--- + +\newcommand{\m}[1]{\mathbf{#1}} +\newcommand{\bs}[1]{\boldsymbol{#1}} + + + +```{r, echo=FALSE, eval=TRUE, message=FALSE, results="hide"} +## library(remotes) +## install_github("IRSN/NSGEV", ref = "influence") +library(knitr) +if (opts_knit$get("rmarkdown.pandoc.to") == "html") { + library(htmltools) +} 
+opts_knit$get("rmarkdown.pandoc.to") +``` + + + +# Using timeseries covariates in **TVGEV** models + +In Time-Varying GEV models, a series $y(t)$ of block maxima is +considered as formed by independent GEV observations with their +marginal distribution depending on the time $t$. In some cases one +may want to further use one or several *timeseries covariate* +$z(t)$. A typical example of such timeseries covariates is provided by +so-called *climate indices* such as the *Southern Oscillation Index* +(SOI) and the *North Atlantic Oscillation* (NAO) both available from +the USA [National Oceanic and Atmospheric +Administration](https://www.ncei.noaa.gov). The response $y(t)$ +usually being a timeseries of *annual* maxima, each covariate $z(t)$ +must also come with a yearly sampling, which may require some kind of +aggregation using the annual mean or maximum. Most often, timeseries +covariates should be used along with functions of time describing the +trend, in order to avoid spurious regression, see below. Note that a +covariate $z(t)$ does not need to have independent observations nor +even to be stationary. The crucial assumption in the model is that the +observations $y(t)$ are independent conditional on the series $z(t)$. + +# Example: Annual Sea Levels in Fremantle (AUS) + +The `fremantle` data of the **ismev** package contains the annual +maxima of the sea level in Fremantle, Australia, in meters. This +dataset is described in Chap. 1 of @Coles_ExtremeValues +and is used in Chap. 6 of the book to provide examples of models +for non-stationary extremes. + +```{r, label = "Fremantle", message=FALSE, warning=FALSE} +library(NSGEV); library(ismev) +data(fremantle) +head(fremantle, n = 7) +``` + +The `SOI` column contains the annual value of the Southern Oscillation +Index. In order to use the `TVGEV` function with this data it helps +to define a variable of class `"Date"` by using the +`Year` column. 
+ +```{r, label="Fremantle2", message=FALSE, warning=FALSE} +df <- within(fremantle, Date <- as.Date(paste0(Year, "-01-01"))) +``` + +The variable `SeaLevel` contains the response timeseries $y(t)$ while +$\texttt{SOI}(t)$ will be used as covariate $z(t)$. The series are shown +below, along with a scatterplot with points $[z(t),\,y(t)]$. Note that +some years are missing such as 1902. This will not affect the use of +the `TVGEV` function. Moreover, the same results would be obtained if +the missing years were added to the table with `NA` values for the two +variables `SeaLevel` and `SOI`. This will be illustrated later + + +```{r, label="Fremantle2a", message=FALSE, warning=FALSE, echo=FALSE} +## opar <- par(mfrow = c(2, 2), cex = 0.5, mar = c(4, 4, 2, 2)) +## with(df, plot(x = Date, y = SeaLevel, type = "h", xlab = "", +## main = "Annual maxima of Sea levels in Fremantle (AUS)")) +## with(df, plot(x = Date, y = SOI, type = "h", xlab = "", +## main = "SOI", col = "seagreen")) +## with(df, plot(x = SOI, y = SeaLevel, type = "p", pch = 16, +## main = "Sea level against SOI index")) +## par(opar) +``` + +```{r, ggplotly, message=FALSE, warning=FALSE} +library(plotly); +g1 <- ggplot(data = df) + + geom_segment(aes(x = Date, xend = Date, y = min(SeaLevel) - 0.1, yend = SeaLevel), colour = "orangered") + + geom_point(aes(x = Date, y = SeaLevel), colour = "orangered") + xlab("") + ylab("SeaLevel (m)") +g2 <- ggplot(data = df) + + geom_segment(aes(x = Date, xend = Date, y = 0, yend = SOI), colour = "SeaGreen") + + geom_point(aes(x = Date, y = SOI), colour = "SeaGreen") + xlab("") + ylab("SOI") +g3 <- ggplot(data = df) + + geom_point(aes(x = SOI, y = SeaLevel, date = Date)) + xlab("SOI") + ylab("SeaLevel") +``` + +```{r, ggplotlyOut, message=FALSE, warning=FALSE, fig.show='hold', out.width="50%", echo=FALSE} +if (opts_knit$get("rmarkdown.pandoc.to") == "html") { + g <- list() + g[[1]] <- ggplotly(g1, width = 400, height = 340) + g[[2]] <- ggplotly(g2, width = 400, height = 
340) + g[[3]] <- ggplotly(g3, width = 300, height = 340) + htmltools::tagList(g) +} else { + print(g1 + ggtitle("SeaLevel")) + print(g2 + ggtitle("SOI")) + print(g3 + ggtitle("SeaLevel against SOI")) +} +``` + +One may guess from the scatterplot that a larger level of the `SOI` +results on average in a (slightly) larger sea level. + + +# `TVGEV` models with timeseries covariates + +## Fitting the `TVGEV` models + +Using the `fremantle` data, we fit the following three GEV models +$y(t) \sim \texttt{GEV}\{\mu(t),\, \sigma(t),\,\xi(t)\}$ +involving a parameter vector $\bs{\psi}$ + +\begin{equation*} + (M_0) \quad + \left\{ + \begin{aligned} + \mu(t) + &= \psi^{\mu}_0 \\ + \sigma(t) + &= \psi^{\sigma}_0\\ + \xi(t)&= \psi^{\xi}_0, + \end{aligned} + \right. + \qquad + (M_1) \quad + \left\{ + \begin{aligned} + \mu(t) + &= \psi^{\mu}_0 + \psi^{\mu}_1 \, t \\ + \sigma(t) + &= \psi^{\sigma}_0\\ + \xi(t) &= \psi^{\xi}_0, + \end{aligned} + \right. + \qquad + (M_2) \quad + \left\{ + \begin{aligned} + \mu(t) + &= \psi^{\mu}_0 + \psi^{\mu}_1 \, t + \psi^\mu_2 \, \texttt{SOI}(t)\\ + \sigma(t) + &= \psi^{\sigma}_0\\ + \xi(t)&= \psi^{\xi}_0, + \end{aligned} + \right. +\end{equation*} + +So $M_0$ is a stationary model with i.i.d. GEV observations, $M_1$ +specifies a linear time trend for the location parameter $\mu$ and +$M_2$ uses both a linear time trend term and a linear effect of the +covariate $\texttt{SOI}(t)$. In all cases, the GEV scale $\sigma$ and +the shape $\xi$ are both kept constant. The three models are nested. + +The linear time trend will be built by using the `polynomX` design +function which returns a matrix of polynomials in $t$. The column `t1` +can be used to describe the trend. The (default) time origin is +located at `1970-01-01` and the time is expressed in years. 
+ + +```{r, label="fits", results="hide",fig.width=5, fig.height=4} +fit0 <- TVGEV(data = df, response = "SeaLevel", date = "Date", + design = polynomX(date = Date, degree = 1), + loc = ~ 1) +fit1 <- TVGEV(data = df, response = "SeaLevel", date = "Date", + design = polynomX(date = Date, degree = 1), + loc = ~ t1) +fit2 <- TVGEV(data = df, response = "SeaLevel", date = "Date", + design = polynomX(date = Date, degree = 1), + loc = ~ t1 + SOI) +autoplot(fit0) +autoplot(fit1) +autoplot(fit2) +``` + +The `autoplot` method builds a `ggplot` object showing the observed +response against the date along with the fitted quantiles for three +levels of the probability of non-exceedance $p$, namely $p=0.90$, $p = +0.95$ and $p= 0.99$. These quantiles are those of the +$\texttt{GEV}\{\mu(t),\,\sigma(t),\, \xi(t)\}$ distribution. When a +timeseries covariate is used as for the object `fit2`, the quantiles +are conditional on the value of the covariate for the corresponding +year. + +The methods `coef`, `summary` can be used to extract the vector +$\widehat{\bs{\psi}}$ of estimated coefficients or get to a summary. + +```{r, label="fits_summary"} +coef(fit2) +summary(fit2) +``` + +The output of `summary` suggests that the coefficient of +$\texttt{SOI}(t)$ is significantly different from zero because in the +row with label `mu_SOI`, the absolute value of the estimate +$\widehat{\psi}^\mu_2$ is larger than twice the corresponding standard +error. The trend coefficient `mu_t1` is in meter by year so its value is +`r round(1e4 * coef(fit2)["mu_t1"], digits = 2)` when expressed +in *centimeter by century*, maybe a more suitable unit. + +Note that the estimated GEV shape parameter in `fit2` is fairly +negative implying a finite upper end-point for the marginal GEV +distributions. 
+ + +## Generalised residuals + +The `resid` method of the `"TVGEV"` class computes the generalised +residuals: + +```{r, label="resid"} +gr <- list() +gr[["0"]] <- autoplot(resid(fit0)) + ggtitle("fit0") +gr[["1"]] <- autoplot(resid(fit1)) + ggtitle("fit1") +gr[["2"]] <- autoplot(resid(fit2)) + ggtitle("fit2") +``` + +```{r, label="residShow", echo=FALSE} +if (opts_knit$get("rmarkdown.pandoc.to") == "html") { + htmltools::tagList(lapply(gr, ggplotly, width = 300, height = 340)) +} else { + lapply(gr, identity) +} +``` + +When a generalised residual falls far away in one of the two tails of +the Gumbel distribution, it may be a good idea to check the influence +of the corresponding observation on the fit. + +When the variable `SOI` is used, the two years `1903` and `1914` have +different residuals although they correspond to the same value of the +sea level, because they correspond to different values of +`SOI`. Although they correspond to severe coastal floodings, the +largest sea levels in the dataset correspond to somewhat "normal" +values of the residuals. This may mean that coastal flood defenses +will have to be constantly improved. + +## Comparing nested models with likelihood-ratio tests + +The `anova` method can be used to compare two nested models with +Likelihood-Ratio test, for instance $M_0$ and $M_1$. + + +```{r, label="fits_anova1"} +anova(fit0, fit1) +``` + +So the likelihood-ratio test indicates that the linear trend term is strongly +significant. Up to rounding, the maximised log-likelihoods are equal +to those reported p. 113 in @Coles_ExtremeValues book. We can similarly compare the two +fitted models $M_1$ and $M_2$ + + +```{r, label="fits_anova2"} +anova(fit1, fit2) +``` + +So, as written p. 114 in @Coles_ExtremeValues book: *the effect of SOI +is influential on annual maximum sea levels at Fremantle, even after +the allowance for time variation*. 
Again, the maximised log-likelihoods +found here are in good agreement with that reported in the book. + +Note that it is important to assess the effect of a timeseries +covariates $z(t)$ such as $\texttt{SOI}(t)$ only when the effect of time +has been taken into account. Without this precaution any covariate +with a linear time trend would be found to be influential by a +spurious regression effect. It may indeed be the case that the series +$y(t)$ and $z(t)$ have a common trend. + +## How `TVGEV` works + +When a `TVGEV` function is created, the `design` argument can receive +a call to a function that creates covariates $x_i(t)$ which are +functions of the time i.e. of the variable specified in `date`. A +data frame is created with its columns being the timeseries covariates +$z(t)$ and the functions time $x_i(t)$. All these columns can be used +in the `lm`-style formulas for the GEV parameters. + +```{r DesignFit, echo=FALSE, fig.align="center", out.width="70%"} +library(png); library(knitr) +img1_path <- "images/Fit.png" +knitr::include_graphics(img1_path) +``` +For instance the "design" matrix corresponding to the GEV location +parameter $\mu$ can be extracted from the fitted model object. + +```{r} + head(fit2$X[["loc"]]) +``` +This object could be coerced if needed to a timeseries object such as an +object with class `"xts"` from the **xts** package. + + +# Return levels or "prediction" + +## Return levels + +Remind that the return levels are the quantiles of the GEV response +$y(t)$. The corresponding probability $p$ is often converted into a +return period $T$ in years according to the rule $p = 1 - 1/T$. For +instance the so-called *centennial* return level for $T=100$ years +corresponds to $p=0.99$ i.e., to a probability of exceedance of +$0.01$. 
+ + +## Conditional return levels + + +As long as a `TVGEV` fitted model object involves only covariates that +are functions of the time, the `predict` method only requires an +argument `newdate` which defines the date(s) $t$ for which the +quantiles are to be computed. For an object with no timeseries +covariates, the value passed to this argument is usually a vector +defining a future period: a set of years given by their beginning +date. When timeseries covariates are used, one must give as well the +value of these covariates for each predicted year. Then `newdate` must +be a data frame with all the variables required for the prediction. + +```{r, label="pred12", fig.width=8, fig.height=4, out.width="80%"} +pred1 <- predict(fit1, newdate = c("2020-01-01", "2030-01-01")) +autoplot(pred1) + ggtitle("No TS covariate") +nd <- data.frame(Date = as.Date(c("2020-01-01", "2030-01-01")), + SOI = c(1.5, 2.1)) +pred2 <- predict(fit2, newdate = nd) +autoplot(pred2) + ggtitle("Using the TS covariate `SOI`") +``` + +By default the confidence intervals on the return levels are obtained +by using the "delta method". Yet the `confintMethod` argument of the +`predict` method can be used to get *profile likelihood* intervals +instead. Partial matching can be used both for the argument and for +its value: the choice `"proflik"` can be abbreviated as `"prof"` which +is still well understood. Remind that the confidence intervals are +approximate unless a very large sample is used. Still, profile +likelihood intervals are known to have a better coverage those +obtained by the delta method. + + +```{r, fig.width=8, fig.height=4, out.width="80%" } +predProf1 <- predict(fit1, newdate = c("2020-01-01", "2030-01-01"), + conf = "prof", trace = 0) +autoplot(predProf1) + ggtitle("No TS covariate. Conf.: Prof. 
Lik ") +nd <- data.frame(Date = as.Date(c("2020-01-01", "2030-01-01")), + SOI = c(1.5, 2.1)) +predProf2 <- predict(fit2, newdate = nd, conf = "prof", trace = 0) +autoplot(predProf2) + ggtitle("Covariable TS `SOI`. Conf: Prof. Lik.") +``` + +Note that the use of the `SOI` covariate leads to slightly narrower +confidence intervals on the quantiles. + +**Remark** In the previous code chunk `trace = 0` was used to avoid a +verbose output. A grid of return periods is used for the computation +and for each return period, the confidence limits are actually +obtained by using a constrained optimisation. When the `trace` +argument is not zero, convergence diagnostics are printed. It may +happen that the optimisation fails to converge. + + +## Marginal return levels + +Rather than fixing a value for the covariates, one may want to +"marginalise out" the return levels w.r.t the values of the covariate, +or to "integrate out" these values, see @EastoeTawn_NSEV2009. The marginal +return levels are not available yet but should be in a future version +of the package. The return level corresponding to a future date of +interest $t^\star$ average over a number of possible values $z^{[j]}$ +or "scenarios" for the covariate $z(t^\star)$ that would be provided. + +```{r DesignNew, echo=FALSE, fig.align="center", out.width="70%"} +img2_path <- "images/Pred.png" +knitr::include_graphics(img2_path) +``` + +# Limitations and possible extensions + +For now the distribution of the maximum on a given period can not be +computed when timeseries covariates are used. + +The **NSGEV** package does not allow for models including *latent +variables*. Such models can be used with **INLA** or **rstan**. + + +# Appendix : using `NA`s for missing observations + +Starting form the `fremantle` data frame, we can build a new data +frame containing all the years between the first and the last year of +the original data frame. The `merge` method can be used for this aim, +with the `Date` as key. 
All the columns except `Date` will contain `NA`s +for the missing years. + +```{r} +Date = seq(from = df$Date[1], to = df$Date[nrow(df)], by = "year") +dfNA <- merge(df, data.frame(Date), by = "Date", all.y = TRUE) +summary(dfNA) +``` +Then we can fit the same `TVGEV` model as before + +```{r, label="compareNA"} +fit2NA <- TVGEV(data = dfNA, response = "SeaLevel", date = "Date", + design = polynomX(date = Date, degree = 1), + loc = ~ t1 + SOI) +``` + +We see that the results of the estimation with `NA` observations are +identical to those obtained with the missing years. + + +# References diff --git a/vignettes/images/Fit.png b/vignettes/images/Fit.png new file mode 100644 index 0000000..fd649a2 Binary files /dev/null and b/vignettes/images/Fit.png differ diff --git a/vignettes/images/Pred.png b/vignettes/images/Pred.png new file mode 100644 index 0000000..8e0aca2 Binary files /dev/null and b/vignettes/images/Pred.png differ