## Illustration of a data analysis, by Werner Stahel ## =============================== ## April 2015 ## need package require(regr0) ## may need ## install.packages("regr0", repos="http://r-forge.r-project.org") ## --------------------------------- ## read data dd <- read.table("~/R/datanal/ef1.txt",sep="\t",na.strings="n.a.") ##- dd <- read.table("http://stat.ethz.ch/Teaching/Datasets/WBL/ef1.dat", ##- sep="\t") showd(dd) ## variable names c.vardescr <- scan("~/R/datanal/ef-var.txt",what="",sep="\t") t.v <- c("vehicle", "cycle", "test", "km", "weight", "pressure", "temperature", "rhum", "distance", "duration", "CO", "CO2", "FC", "HC", "NOx") names(dd) <- t.v names(c.vardescr) <- t.v cbind(c.vardescr) summary(dd) ## check first and last rows and columns ## check if the right variables are factors, and check plausibility length(levels(dd$test)) ## -- all different --> useless levels(dd$cycle) t.lv <- levels(dd$cycle) t.lv <- substring(t.lv,4,20) t.lv levels(dd$cycle) <- t.lv sumna(dd) ## number of missing values --> none showd(dd) ## annotate dd (when using regr0) tit(dd) <- "Emission factors" userOptions(project="Data analysis demo") d.ef <- dd ## store under a 'd.' name, continue working with dd ## ------------------------- ## nature of variables c.vinputf <- t.v[1:2] ## factors c.vinputq <- t.v[4:10] ## quantitative c.vinput <- c(c.vinputf, c.vinputq) c.vresponse <- t.v[11:15] c.id <- "test" ## get acquainted, screen pairs(dd) ## not useful ## --- start by response variables only pairs(dd[,c.vresponse]) ## --> fuel consumption (FC) almost equal to CO2 ## use formula paste(c.vresponse,collapse=" + ") ## --> "CO + CO2 + FC + HC + NOx" pairs( ~ CO + CO2 + FC + HC + NOx, data=dd) ## transform paste(paste("log10(",c.vresponse,")"),collapse=" + ") ## ... and drop FC pairs( ~ log10( CO ) + log10( CO2 ) + log10( HC ) + log10( NOx ), data=dd) ## alternative function from regr plmatrix( ~ log10( CO ) + log10( CO2 ) + log10( HC ) + log10( NOx ), data=dd) ## --- now, input variables ## quantitative variables plmatrix(dd[,c.vinputq]) ## --> transform km ## distance and duration have only 3 really distinct values hist(dd$km) plot(distance~cycle, data=dd) ## produces boxplots (!) plot(duration~cycle, data=dd) ## drop duration and distance from input variables c.vinputq <- setdiff(c.vinputq, c("distance","duration")) c.vinput <- c(c.vinputf, c.vinputq) ## factors table(dd$vehicle, dd$cycle) sum(c( table(dd$vehicle, dd$cycle) )!=1) ## --> each combination occurs once, balanced paste(c.vinput,collapse=" + ") ## --> "vehicle + cycle + ... + rhum" plmatrix(~ vehicle + cycle + log10(km) + weight + pressure + temperature + rhum, data=dd) ## km and weight are properties of vehicle pairs(~ vehicle + cycle + log10(km) + weight + pressure + temperature + rhum, data=dd, cex=0.2) ## ------- ## try plmatrix(dd) ! ## advantages of plmatrix: handle many variables, save space, add stamp ## ========================================================================= ## regression modelling ## ========================================================================= ( t.form <- paste("log10(HC)~",paste(c.vinput, collapse="+")) ) mframe(3,3) ## like par(mfrow=c(3,3)) , with some adjustments plot(as.formula(t.form), data=dd) ## --- fit model r.lmfull <- lm(as.formula(t.form), data=dd) summary(r.lmfull) ## weight and almost km are redundant when vehicle is in the model. formula(r.lmfull) r.lmf <- lm(log10(HC) ~ vehicle + cycle + pressure + temperature + rhum, data=dd) summary(r.lmf) ## factors should not get a pvalue for each level ## --> use function regr froma package regr0 (r.rf <- regr(log10(HC) ~ vehicle + cycle + pressure + temperature + rhum, data=dd) ) ## --- diagnostic plotting ## lm mframe(2,2) plot(r.lmf) termplot(r.lmf,partial.resid=T) ## regr plot(r.rf, ask=T) ## --- model development ## drop temperature (r.r1 <- regr(log10(HC) ~ vehicle + cycle + pressure + rhum, data=dd) ) print(r.r1, dummycoef=F) userOptions(show.dummy.coef=F) ## check adding square terms and interactions add1(r.r1) ( r.r2 <- update(r.r1, formula=.~.+vehicle:pressure) ) print(r.r2, dummycoef=F) add1(r.r2) r.r3 <- update(r.r2, formula=.~.+vehicle:rhum) print(r.r3, dummycoef=F) add1(r.r3) r.r3 ## analyze coefficients names(r.r3$allcoef) ( t.t <- table(dd$vehicle,dd$weight) ) dim(t.t) t.i <- match(1:25,as.numeric(dd$vehicle)) d.veh <- data.frame(weight=dd[t.i,"weight"],km=dd[t.i,"km"], coefpressure=r.r3$allcoef$pressure+r.r3$allcoef$"vehicle:pressure", coefrhum=r.r3$allcoef$rhum+r.r3$allcoef$"vehicle:rhum", row.names=levels(dd$vehicle)) showd(d.veh) plmatrix(d.veh[,1:2],d.veh[3:4]) ## multivariate regression ( r.rcf <- regr(cbind(coefpressure,coefrhum) ~ weight+km, data=d.veh) ) r.rcf ## --- other regression models ## ordered factor hist(log10(dd$HC)) dd$HCcat <- cut(log10(dd$HC),c(-4,-2.5,-2,-1.5,-1,0)) str(dd$HCcat) is.ordered(dd$HCcat) dd$HCcat <- ordered(dd$HCcat) ## regression for ordered factor response ( r.ro1 <- regr(HCcat ~ vehicle + cycle + pressure + rhum, data=dd, family="cumlogit") ) print(r.ro1, dummycoef=F) print(r.r1, dummycoef=F) plot(r.ro1, ask=T) ## the end