############################################################ # # # R code Exam Problem Series 2 # # # # "Using R for statistical data analysis and graphics" # # # # # ############################################################ # 2009-11-02 A. Papritz, C. Schwierz # read the data d.dornach <- read.table("http://stat.ethz.ch/~stahel/courses/R/dornach.txt",header=TRUE) # d.dornach <- read.table( "dornach.txt", header=TRUE ) summary( d.dornach ) ## # Problem 1 on graphics [10 points] ## palette( rainbow( 10 ) ) oldpar <- par() par( mfrow = c( 2, 2 ) ) # a) plot( cu~ x, d.dornach, col = as.numeric( cut( d.dornach$cu, 10 ) ), main = "cu vs. x" ) # b) boxplot( d.dornach$cu, log = "y", main = "cu content", ylab = "cu [mg/kg]" ) abline( h = c( 40, 150, 1000 ), col = c( "green", "orange", "red" ) ) # c) plot( y ~ cu, d.dornach, main = "cu vs. y" ) abline( v = c( 40, 150, 1000 ), col = c( "green", "orange", "red" ) ) legend( "topright", lty = "solid", col = c( "green", "orange", "red" ), legend = c( "guide threshold", "trigger threshold", "intervention threshold" ) ) ## Largest Cu concentrations are found at about (x,y)~(150,0) # d) par( mfrow = c( 2, 2 ), bg = "black", fg = "white", col.main = "yellow", cex.main = 2, col.axis = "cyan", col.lab = "cyan" ) # And redo a) - c) from above par(oldpar) ## # Problem 2 - Distributions [5 points] ## # a) Cumulative distribution shows the cumulative probability of a # distribution up to a certain X-value. It corresponds to the area # under the density curve up to the value X. The Y-value of the # cumulative distribution of a certain X-value corresponds to the # quantile of the distribution. The maximum Y-value of the cumulative # function is thus 1. # b) pnorm(c(1,2.5),mean=0,sd=1) # [1] 0.8413447 0.9937903 # x=1 corresponds to the 84.13 percentile; x=2.5 corresponds to the # 99.37 percentile. I.e 84.13 percent of all values of the # distribution lie to the left of x=1. # c) t.x <- c(0:20) t.y <- pnorm(t.x,mean=5,sd=3) plot(t.x,t.y,type="l", main="Cumulative Distribution mean=5, sd=3", xlab="x",ylab="pnorm") ## # Problem 3 - Functions [8 points] ## # a) ?runif # Provides information about the uniform distribution on the interval # from ‘min’ to ‘max’. ‘runif’ generates random deviates. # runif(n, min=0, max=1). # n: number of observations. If ‘length(n) > 1’, the length is taken # to be the number required. # min,max: lower and upper limits of the distribution. Must be # finite. # b) t.runif <- runif(10,min=0,max=1) ## [1] 0.3165579 0.8130658 0.3573399 0.1081055 0.4401509 0.4979574 0.3062651 ## [8] 0.2414085 0.5361646 0.4052669 # c) mean(t.runif) median(t.runif) sd(t.runif) ## > [1] 0.586666 ## > [1] 0.6269727 ## > [1] 0.2972619 # d) f.runif <- function(ni,mini,maxi){ options(digits=3) t.runif <- runif(n=ni,min=mini,max=maxi) d.result <- c(mean=mean(t.runif),median=median(t.runif),sd=sd(t.runif)) return(d.result) } # e) f.runif(ni=10000,mini=-10,maxi=10) ## # Problem 4 - Missing values [5 points] ## # a) d.dornach2 <- read.table("http://stat.ethz.ch/~stahel/courses/R/dornach2.txt", header=TRUE,na.strings=c(-999)) # If information on NA=-999 is not given, summary takes -999 as # minimum value for the concentrations. If na.strings is provided, # summary counts the number of NA's in each variable. # b) summary(d.dornach2) ## cu: 3 NAs ## zn: 2 NAs ## cd: 3 NAs v.na <- which(is.na(d.dornach2)) # Elements in the data frame that are NAs v.dornach2 <- unlist(d.dornach2) # unlist to turn d.dornach2 into a # vector - lines are added behind variable # names; v.dornach2 v.dornach2[v.na] # then use logical vector "which(...)" to get # NA-elements of new vector unlist(d.dornach2)[which(is.na(d.dornach2))] # the same can be done in # one line ## cu5 cu21 cu87 zn16 zn74 cd32 cd96 cd114 ## NA NA NA NA NA NA NA NA # c) mean(d.dornach$cu) mean(d.dornach2$cu,na.rm=T) ## [1] 438 ## [1] 439 mean(d.dornach$zn) mean(d.dornach2$zn,na.rm=T) ## [1] 559 ## [1] 548 mean(d.dornach$cd) mean(d.dornach2$cd,na.rm=T) ## [1] 1.84 ## [1] 1.86 # d) # The groups "gs" and "wirz" are from two different surveys and have # different length, so we cannot use a paired Wilcoxon Test # Select groups d.gs <- d.dornach2[d.dornach2[,"survey"]=="gs",] d.wirz <- d.dornach2[d.dornach2[,"survey"]=="wirz",] wilcox.test(d.gs$cu,d.wirz$cu,na.action=na.omit,paired=FALSE) ## > wilcox.test(d.gs$cu,d.wirz$cu,na.action=na.omit,paired=FALSE) ## Wilcoxon rank sum test with continuity correction ## data: d.gs$cu and d.wirz$cu ## W = 2134, p-value = 0.0004429 ## alternative hypothesis: true location shift is not equal to 0 # The p-value is very small, therefore we can reject the hypothesis, # that the means of the cu-contents in the two groups differ. # Note: Same result for the t-test; ## > t.test(d.gs$cu,d.wirz$cu,na.action=na.omit) ## ## Welch Two Sample t-test ## data: d.gs$cu and d.wirz$cu ## t = 2.3519, df = 109.036, p-value = 0.02047 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## 37.59154 440.42096 ## sample estimates: ## mean of x mean of y ## 541.1875 302.1812