[R-sig-Geo] Cross-validating across spatial interpolation packages

Mark Connolly wmconnol at ncsu.edu
Tue Dec 14 11:32:44 CET 2010


I would find it useful.

On 12/14/2010 04:48 AM, Paul Hiemstra wrote:
> Dear list,
>
> In a project I'm currently working I compare a number of interpolation 
> methods for interpolation of evaporation. In addition to the methods 
> available in gstat/automap (krige, idw) I use a number of non-gstat 
> interpolation methods. To perform cross-validation I wanted to be able 
> to use all methods (gstat and non-gstat) and end with the 
> cross-validation result in the same format. This is particularly 
> useful in combination with the compare.cv() function in automap, which 
> provides summary cross-validation statistics and a spatial plot of the 
> cross-validation residuals. The following example illustrates how I 
> solved this for the Tps function from the fields pacakge and Nearest 
> neighbor interpolation (idw with nmax == 1). First a few function 
> definitions (plus Roxygen documentation) and then an example with the 
> meuse dataset:
>
> #' Provides a krige (gstat) like interface to Tps (fields)
> #'
> #' @param formula A formula to define the dependent and the 
> independent variables, see the documentation of krige.
> #' @param data The input data, should be a spatial object which 
> supports coordinates extracting through coordinates()
> #' @param newdata A spatial object with the prediction locations.
> #' @param ... parameters that are passed on to Tps
> #' @return The function returns a spatial object with the same class 
> as \code{newdata} with the prediction and the variance (NA in this 
> case). The names of the columns match the outcome of the krige function.
> #' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
> #' @export
> #' @examples
> #' data(meuse)
> #' coordinates(meuse) = ~x+y
> #' data(meuse.grid)
> #' gridded(meuse.grid) = ~x+y
> #'
> #' meuse_tps = doTps(zinc~dist, meuse, meuse.grid)
> #' meuse_tps = doTps(zinc~1, meuse, meuse.grid)
> #'
> doTps = function(formula, data, newdata, ..., debug.level = 1) {
>   require(fields, quietly = TRUE)
>   if(debug.level > 0) cat("[using thin plate splines (from fields)]\n")
>   f = as.character(formula)
>   dependent = f[2]
>   independent = f[3] ## Meerdere covariates zou moeten kunnen
>   if(independent == "1") {
>     fit = Tps(x = coordinates(data), Y = data[[dependent]], ...)
>     newdata$var1.pred = as.numeric(predict(fit, coordinates(newdata)))
>     newdata$var1.var = NA
>   } else {
>     fit = Tps(x = coordinates(data), Y = data[[dependent]], Z = 
> data[[independent]], ...)
>     newdata$var1.pred = as.numeric(predict(fit, coordinates(newdata), 
> Z = newdata[[independent]]))
>     newdata$var1.var = NA
>   }
>   newdata$var1.stdev = sqrt(newdata$var1.var)
>   return(newdata[c("var1.pred", "var1.var")])
> }
>
> #' Performs nearest neighbor interpolation
> #'
> #' @param formula A formula to define the dependent and the 
> independent variables, see the documentation of krige.
> #'                Note that nearest neighbor does not support 
> independent variables. Therefor, the formula should always
> #'                have the form dependent~1.
> #' @param data The input data, should be a spatial object which 
> supports coordinates extracting through coordinates()
> #' @param newdata A spatial object with the prediction locations.
> #' @param ... parameters that are passed on to Tps
> #' @note This functions uses idw with 'nmax' set to 1 to perform 
> nearest neighbor interpolation.
> #' @return The function returns a spatial object with the same class 
> as \code{newdata} with the prediction and the
> #'          variance (NA in this case). The names of the columns match 
> the outcome of the krige function.
> #' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
> #' @export
> #' @examples
> #' data(meuse)
> #' coordinates(meuse) = ~x+y
> #' data(meuse.grid)
> #' gridded(meuse.grid) = ~x+y
> #'
> #' meuse_nn = doNearestNeighbor(zinc~1, meuse, meuse.grid)
> #'
> doNearestNeighbor = function(formula, data, newdata, debug.level = 1) {
>   require(gstat, quietly = TRUE)
>   if(debug.level > 0) cat("[using nearest neighbor]\n")
>   f = as.character(formula)
>   dependent = f[2]
>   independent = f[3] ## Meerdere covariates zou moeten kunnen
>   if(independent != "1") stop("Nearest neighbor does not support 
> independent variables, please formula to 'dependent~1'.")
>   newdata = idw(formula, data, newdata, nmax = 1, debug.level = 0)
>   newdata$var1.stdev = sqrt(newdata$var1.var)
>   return(newdata)
> }
>
> # Creates an empty crossvalidation result object from 'obj'
> createCvObjFromSpatialObj = function(obj) {
>   cv_obj = obj
>   cv_obj$observed = NA
>   cv_obj$var1.pred = NA
>   cv_obj$var1.var = NA
>   cv_obj$residual = NA
>   cv_obj$zscore = NA
>   return(cv_obj[c("var1.pred","var1.var","observed", "residual", 
> "zscore")])
> }
>
> #' This function performs crossvalidation in the style of krige.cv
> #'
> #' In addition to supporting kriging and idw, it also supports
> #' thin plate splines (through Tps (fields)).
> #'
> #' @param formula A formula to define the dependent and the 
> independent variables, see the documentation of krige.
> #' @param data The input data, should be a spatial object which 
> supports coordinates extracting through coordinates()
> #' @param func The function which should be used for cross-validation, 
> possibilities are "krige", "idw", "doNearestNeighbor"
> #'              and "doTps".
> #' @param ... parameters that are passed on to 'func'
> #' @return The function returns a spatial object with the class of 
> 'data'. The format
> #'         in which the cross-validation results are presented is 
> equal to that of krige.cv.
> #' @note At this stage only leave-one-out cross-validation is 
> supported. In addition, the formula supports only a limit
> #'       syntax, i.e. log(zinc) or sqrt(dist) are not possible in the 
> formula. Instead, create a new column in the object,
> #'       e.g. meuse$log_zinc = log(meuse$zinc).
> #' @author Paul Hiemstra, \email{p.h.hiemstra at gmail.com}
> #' @export
> #' @examples
> #' data(meuse)
> #' coordinates(meuse) = ~x+y
> #' data(meuse.grid)
> #' gridded(meuse.grid) = ~x+y
> #'
> #' regres_cv = crossvalidate(zinc~dist, meuse, func = "krige", 
> debug.level = 0)
> #' nearestneighbor_cv = crossvalidate(zinc~1, meuse, func = 
> "doNearestNeighbor")
> #' idw2_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0)
> #' idw4_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 
> 0, idp = 4)
> #' idw05_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 
> 0, idp = 0.5)
> #' ked_cv = crossvalidate(zinc~dist, meuse, func = "krige",
> #'         model = autofitVariogram(zinc~dist, meuse, model = 
> "Ste")$var_model, debug.level = 0)
> #' tps_cv = crossvalidate(zinc~1, meuse, func = "doTps", debug.level = 0)
> #' tpsdist_cv = crossvalidate(zinc~dist, meuse, func = "doTps", 
> debug.level = 0)
> #'
> #' compare.cv(regres_cv, idw2_cv, idw4_cv, idw05_cv, tps_cv, 
> tpsdist_cv, ked_cv)
> #'
> #' # Use log(zinc) as dependent variable
> #'
> #' meuse$log_zinc = log(meuse$zinc)
> #' kedlog_cv = crossvalidate(log_zinc~dist, meuse, func = "krige",
> #'         model = autofitVariogram(zinc~dist, meuse, model = 
> "Ste")$var_model, debug.level = 0)
> #'
> #'# Test if crossvalidate matches krige.cv
> #' dum = krige.cv(zinc~dist, meuse, model = m)
> #' all.equal(bla$residual, ked_cv$residual)
> #'
> crossvalidate = function(formula, data, func = "doTps", ...) {
>   pb = txtProgressBar(1, nrow(data), 1, style = 3)
>   dependent = as.character(formula)[2]
>   cv_obj = createCvObjFromSpatialObj(data)
>   for(i in 1:nrow(data)) {
>     setTxtProgressBar(pb, i)
>     newdata = data[i,]
>     cv_data = data[-i,]
>     cv_pred = do.call(func, list(formula, cv_data, newdata, ...))
>     cv_obj at data[i,"var1.pred"] <- cv_pred$var1.pred
>     cv_obj at data[i,"var1.var"] <- cv_pred$var1.var
>     cv_obj at data[i,"observed"] <- data[i,][[dependent]]
>   }
>   cv_obj$residual = cv_obj$observed - cv_obj$var1.pred
>   cat("\n")
>   return(cv_obj)
> }
>
> # Example code
> library(automap)
> library(fields)
> data(meuse)
> coordinates(meuse) = ~x+y
> data(meuse.grid)
> gridded(meuse.grid) = ~x+y
>
> regres_cv = crossvalidate(zinc~dist, meuse, func = "krige", 
> debug.level = 0)
> nearestneighbor_cv = crossvalidate(zinc~1, meuse, func = 
> "doNearestNeighbor", debug.level = 0)
> idw2_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0)
> idw4_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0, 
> idp = 4)
> idw05_cv = crossvalidate(zinc~1, meuse, func = "idw", debug.level = 0, 
> idp = 0.5)
> ked_cv = crossvalidate(zinc~dist, meuse, func = "krige",
>         model = autofitVariogram(zinc~dist, meuse, model = 
> "Ste")$var_model, debug.level = 0)
> tps_cv = crossvalidate(zinc~1, meuse, func = "doTps", debug.level = 0)
>
> compare.cv(regres_cv, idw2_cv, idw4_cv, idw05_cv, tps_cv, ked_cv)
>
> Would it be worthwile to the R-sig-geo community to expand this code 
> into a crossvalidate package? This package could provide an easy means 
> of crossvalidating across the different interpolation packages. People 
> could write interface functions such as I did for the Tps function.
>
> So, what do you guys think?
>
> cheers,
> Paul
>



More information about the R-sig-Geo mailing list