####--------------- Ozone - gam Example -------------------------- makeOzone <- function() { ## Purpose: Construct the version of 'Ozone' data we use for the lecture ## ---------------------------------------------------------------------- ## Author: Martin Maechler, Date: 1 June 2006 data(ozone, package = "gss") ## ============== if(exists("d.ozone")) stopifnot(as.matrix(d.ozone) == as.matrix(ozone)) ## the Variable names differ though; want to replace *some* : oldN <- c("upo3", "wdsp", "hmdt", "sbtp") newN <- c("O3", "wind", "humidity", "temp") colnames(ozone)[colnames(ozone) %in% oldN] <- newN ozone } d.ozone <- makeOzone() if(FALSE) # 2012: no longer needed: cairo with dev.hold() is *fast*! X11.options(type="Xlib")# fast (but unaliased, less "nice") pairs(d.ozone, cex = 0.4, gap = 0) ## after example(pairs, ask = FALSE, echo=FALSE)#-> get panel.hist() utility pairs(d.ozone, cex = 0.4, gap = 0, diag.panel = panel.hist) ## --> O3 should be log-transformed; also "wind" outlier (obs. 92) require(mgcv) # -- for function gam() ##=========== ## fit <- gam(upo3 ~ s(.), data = d.ozone) ## ---- does NOT work yet fit <- gam(O3 ~ s(vdht)+s(wind)+s(humidity)+s(temp)+s(ibht)+s(dgpg)+s(ibtp)+s(vsty)+s(day), data = d.ozone) summary(fit) plot(fit, pages = 1) ## Tukey-Anscombe plot(fitted(fit), residuals(fit), main = "Tukey-Anscombe plot", xlab = "fitted", ylab = "residuals") ## nicer, using 'sfsmisc' TA.plot() : if(require("sfsmisc")) { TA.plot(fit) ## even nicer : TA.plot(fit, labels = "o", show.2sigma= FALSE, cex.main = .8) } ## .. N O W W H A T ? ## ~~~~~~~~~~~~~~~~~ ## ---> use better model : Log-transform the Y - var : ## update() the last model : summary(fit1 <- update(fit, log(O3) ~ . ) ) ## ... ## Parametric coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.21297 0.01717 128.9 <2e-16 *** ## --- ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## ## Approximate significance of smooth terms: ## edf Est.rank F p-value ## s(vdht) 1.000 1.000 11.765 0.000688 *** ## s(wind) 2.526 6.000 2.450 0.025035 * ## s(humidity) 2.353 5.000 2.887 0.014574 * ## s(temp) 3.772 8.000 3.453 0.000798 *** ## s(ibht) 2.797 6.000 3.965 0.000780 *** ## s(dgpg) 3.248 7.000 10.600 6.28e-12 *** ## s(ibtp) 1.000 1.000 0.466 0.495368 ## s(vsty) 5.728 9.000 5.459 5.93e-07 *** ## s(day) 4.609 9.000 17.120 < 2e-16 *** ## --- ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## ## R-sq.(adj) = 0.826 Deviance explained = 84.1% ## GCV score = 0.10629 Scale est. = 0.097256 n = 330 ## MM: much more significant ones! plot(fit1, pages = 1) ## or with extras: ## Improve the 'bad' defaults in plot.gam(): op <- par(mfrow = c(3,3), mgp = c(1.5, 0.6, 0), mar = 0.1 + c(3, 3, 1, 1), oma = c(0,0, 2.5, 0)) plot(fit1, pages = 1) cl <- fit1$call; names(cl)[2] <- "" mtext(deparse(cl, width.cutoff=200), outer = TRUE, cex = 0.8, font = 2) ## Residual Plot: plot(fitted(fit1),resid(fit1), main = "Tukey-Anscombe plot",xlab = "fitted",ylab = "residuals") ## better : use TA.plot() utility function from 'sfsmisc' package: TA.plot(fit1, labels="o", cex.main = 1) ## And Normal Plot: qqnorm(resid(fit1)) # ~ ok. ###----------- But *REALLY* --- the ## leverage outlier of "wind" should be removed. with(d.ozone, which(wind > 15)) ## 92 summary(fit1.o <- update(fit1, subset = -92)) ## Family: gaussian ## Link function: identity ## Formula: ## log(O3) ~ s(vdht) + s(wind) + s(humidity) + s(temp) + s(ibht) + ## s(dgpg) + s(ibtp) + s(vsty) + s(day) ## Parametric coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) 2.2148 0.0172 128.8 <2e-16 *** ## --- ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## Approximate significance of smooth terms: ## edf Est.rank F p-value ## s(vdht) 1.000 1 11.650 0.000730 *** ## s(wind) 1.046 3 3.427 0.017517 * ## s(humidity) 2.372 5 2.810 0.016922 * ## s(temp) 3.855 8 3.520 0.000655 *** ## s(ibht) 2.785 6 4.068 0.000611 *** ## s(dgpg) 3.267 7 10.761 4.09e-12 *** ## s(ibtp) 1.000 1 0.458 0.499088 ## s(vsty) 5.513 9 5.436 6.37e-07 *** ## s(day) 4.616 9 17.537 < 2e-16 *** ## --- ## Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 ## R-sq.(adj) = 0.826 Deviance explained = 84% ## GCV score = 0.10579 Scale est. = 0.097287 n = 329 plot(fit1.o, pages = 1) ## with extras: ## Improve the 'bad' defaults in plot.gam(): op <- par(mfrow = c(3,3), mgp = c(1.5, 0.6, 0), mar = 0.1 + c(3, 3, 1, 1), oma = c(0,0, 2.5, 0)) plot(fit1.o, pages = 1) cl <- fit1.o$call; names(cl)[2] <- "" mtext(deparse(cl, width.cutoff=200), outer = TRUE, cex = 0.8, font = 2) plot(fitted(fit1.o),resid(fit1.o), main = "Tukey-Anscombe plot", xlab = "fitted",ylab = "residuals") ## better : use TA.plot() utility function from 'sfsmisc' package: TA.plot(fit1.o, labels="o", cex.main = 1) ## And Normal Plot: qqnorm(resid(fit1.o)) # "no" difference to previous