[R-sig-Geo] Cross-validating across spatial interpolation packages

Tue Dec 14 10:48:45 CET 2010

Dear list,

In a project I'm currently working I compare a number of interpolation 
methods for interpolation of evaporation. In addition to the methods 
available in gstat/automap (krige, idw) I use a number of non-gstat 
interpolation methods. To perform cross-validation I wanted to be able 
to use all methods (gstat and non-gstat) and end with the 
cross-validation result in the same format. This is particularly useful 
in combination with the compare.cv() function in automap, which provides 
summary cross-validation statistics and a spatial plot of the 
cross-validation residuals. The following example illustrates how I 
solved this for the Tps function from the fields pacakge and Nearest 
neighbor interpolation (idw with nmax == 1). First a few function 
definitions (plus Roxygen documentation) and then an example with the 
meuse dataset:

#' Provides a krige (gstat) like interface to Tps (fields)
#'
#' @param formula A formula to define the dependent and the independent 
variables, see the documentation of krige.
#' @param data The input data, should be a spatial object which supports 
coordinates extracting through coordinates()
#' @param newdata A spatial object with the prediction locations.
#' @param ... parameters that are passed on to Tps
#' @return The function returns a spatial object with the same class as 
\code{newdata} with the prediction and the variance (NA in this case). 
The names of the columns match the outcome of the krige function.
#' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
#' @export
#' @examples
#' data(meuse)
#' coordinates(meuse) = ~x+y
#' data(meuse.grid)
#' gridded(meuse.grid) = ~x+y
#'
#' meuse_tps = doTps(zinc~dist, meuse, meuse.grid)
#' meuse_tps = doTps(zinc~1, meuse, meuse.grid)
#'
doTps = function(formula, data, newdata, ..., debug.level = 1) {
   require(fields, quietly = TRUE)
   if(debug.level > 0) cat("[using thin plate splines (from fields)]\n")
   f = as.character(formula)
   dependent = f[2]
   independent = f[3] ## Meerdere covariates zou moeten kunnen
   if(independent == "1") {
     fit = Tps(x = coordinates(data), Y = data[[dependent]], ...)
     newdata$var1.pred = as.numeric(predict(fit, coordinates(newdata)))
     newdata$var1.var = NA
   } else {
     fit = Tps(x = coordinates(data), Y = data[[dependent]], Z = 
data[[independent]], ...)
     newdata$var1.pred = as.numeric(predict(fit, coordinates(newdata), Z 
= newdata[[independent]]))
     newdata$var1.var = NA
   }
   newdata$var1.stdev = sqrt(newdata$var1.var)
   return(newdata[c("var1.pred", "var1.var")])
}

#' Performs nearest neighbor interpolation
#'
#' @param formula A formula to define the dependent and the independent 
variables, see the documentation of krige.
#'                Note that nearest neighbor does not support 
independent variables. Therefor, the formula should always
#'                have the form dependent~1.
#' @param data The input data, should be a spatial object which supports 
coordinates extracting through coordinates()
#' @param newdata A spatial object with the prediction locations.
#' @param ... parameters that are passed on to Tps
#' @note This functions uses idw with 'nmax' set to 1 to perform nearest 
neighbor interpolation.
#' @return The function returns a spatial object with the same class as 
\code{newdata} with the prediction and the
#'          variance (NA in this case). The names of the columns match 
the outcome of the krige function.
#' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
#' @export
#' @examples
#' data(meuse)
#' coordinates(meuse) = ~x+y
#' data(meuse.grid)
#' gridded(meuse.grid) = ~x+y
#'
#' meuse_nn = doNearestNeighbor(zinc~1, meuse, meuse.grid)
#'
doNearestNeighbor = function(formula, data, newdata, debug.level = 1) {
   require(gstat, quietly = TRUE)
   if(debug.level > 0) cat("[using nearest neighbor]\n")
   f = as.character(formula)
   dependent = f[2]
   independent = f[3] ## Meerdere covariates zou moeten kunnen
   if(independent != "1") stop("Nearest neighbor does not support 
independent variables, please formula to 'dependent~1'.")
   newdata = idw(formula, data, newdata, nmax = 1, debug.level = 0)
   newdata$var1.stdev = sqrt(newdata$var1.var)
   return(newdata)
}

# Creates an empty crossvalidation result object from 'obj'
createCvObjFromSpatialObj = function(obj) {
   cv_obj = obj
   cv_obj$observed = NA
   cv_obj$var1.pred = NA
   cv_obj$var1.var = NA
   cv_obj$residual = NA
   cv_obj$zscore = NA
   return(cv_obj[c("var1.pred","var1.var","observed", "residual", 
"zscore")])
}

#' This function performs crossvalidation in the style of krige.cv
#'
#' In addition to supporting kriging and idw, it also supports
#' thin plate splines (through Tps (fields)).
#'
#' @param formula A formula to define the dependent and the independent 
variables, see the documentation of krige.
#' @param data The input data, should be a spatial object which supports 
coordinates extracting through coordinates()
#' @param func The function which should be used for cross-validation, 
possibilities are "krige", "idw", "doNearestNeighbor"
#'              and "doTps".
#' @param ... parameters that are passed on to 'func'
#' @return The function returns a spatial object with the class of 
'data'. The format
#'         in which the cross-validation results are presented is equal 
to that of krige.cv.
#' @note At this stage only leave-one-out cross-validation is supported. 
In addition, the formula supports only a limit
#'       syntax, i.e. log(zinc) or sqrt(dist) are not possible in the 
formula. Instead, create a new column in the object,
#'       e.g. meuse$log_zinc = log(meuse$zinc).
#' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
#' @export
#' @examples
#' data(meuse)
#' coordinates(meuse) = ~x+y
#' data(meuse.grid)
#' gridded(meuse.grid) = ~x+y
#'
#' regres_cv = crossvalidate(zinc~dist, meuse, func = "krige", 
debug.level = 0)
#' nearestneighbor_cv = crossvalidate(zinc~1, meuse, func = 
"doNearestNeighbor")
#' idw2_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0)
#' idw4_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0, 
idp = 4)
#' idw05_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 
0, idp = 0.5)
#' ked_cv = crossvalidate(zinc~dist, meuse, func = "krige",
#'         model = autofitVariogram(zinc~dist, meuse, model = 
"Ste")$var_model, debug.level = 0)
#' tps_cv = crossvalidate(zinc~1, meuse, func = "doTps", debug.level = 0)
#' tpsdist_cv = crossvalidate(zinc~dist, meuse, func = "doTps", 
debug.level = 0)
#'
#' compare.cv(regres_cv, idw2_cv, idw4_cv, idw05_cv, tps_cv, tpsdist_cv, 
ked_cv)
#'
#' # Use log(zinc) as dependent variable
#'
#' meuse$log_zinc = log(meuse$zinc)
#' kedlog_cv = crossvalidate(log_zinc~dist, meuse, func = "krige",
#'         model = autofitVariogram(zinc~dist, meuse, model = 
"Ste")$var_model, debug.level = 0)
#'
#'# Test if crossvalidate matches krige.cv
#' dum = krige.cv(zinc~dist, meuse, model = m)
#' all.equal(bla$residual, ked_cv$residual)
#'
crossvalidate = function(formula, data, func = "doTps", ...) {
   pb = txtProgressBar(1, nrow(data), 1, style = 3)
   dependent = as.character(formula)[2]
   cv_obj = createCvObjFromSpatialObj(data)
   for(i in 1:nrow(data)) {
     setTxtProgressBar(pb, i)
     newdata = data[i,]
     cv_data = data[-i,]
     cv_pred = do.call(func, list(formula, cv_data, newdata, ...))
     cv_obj at data[i,"var1.pred"] <- cv_pred$var1.pred
     cv_obj at data[i,"var1.var"] <- cv_pred$var1.var
     cv_obj at data[i,"observed"] <- data[i,][[dependent]]
   }
   cv_obj$residual = cv_obj$observed - cv_obj$var1.pred
   cat("\n")
   return(cv_obj)
}

# Example code
library(automap)
library(fields)
data(meuse)
coordinates(meuse) = ~x+y
data(meuse.grid)
gridded(meuse.grid) = ~x+y

regres_cv = crossvalidate(zinc~dist, meuse, func = "krige", debug.level = 0)
nearestneighbor_cv = crossvalidate(zinc~1, meuse, func = 
"doNearestNeighbor", debug.level = 0)
idw2_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0)
idw4_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0, 
idp = 4)
idw05_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0, 
idp = 0.5)
ked_cv = crossvalidate(zinc~dist, meuse, func = "krige",
         model = autofitVariogram(zinc~dist, meuse, model = 
"Ste")$var_model, debug.level = 0)
tps_cv = crossvalidate(zinc~1, meuse, func = "doTps", debug.level = 0)

compare.cv(regres_cv, idw2_cv, idw4_cv, idw05_cv, tps_cv, ked_cv)

Would it be worthwile to the R-sig-geo community to expand this code 
into a crossvalidate package? This package could provide an easy means 
of crossvalidating across the different interpolation packages. People 
could write interface functions such as I did for the Tps function.

So, what do you guys think?

cheers,
Paul

-- 
Paul Hiemstra, MSc
Department of Physical Geography
Faculty of Geosciences
University of Utrecht
Heidelberglaan 2
P.O. Box 80.115
3508 TC Utrecht
Phone:  +3130 253 5773
http://intamap.geo.uu.nl/~paul
http://nl.linkedin.com/pub/paul-hiemstra/20/30b/770

currently @ KNMI
paul.hiemstra_AT_knmi.nl