###################################### ## TEST 1 MODEL SOLUTION ###################################### ###################################### ## QUESTION 1 Vectors and Matrices [5 points] ## ## Note that many other ways exist ## to create the desired outcome ###################################### ## 1a) Generate the following vectors [1 point each] c(10:0) ## [1] 10 9 8 7 6 5 4 3 2 1 0 sort(rep((1:10)^2,3)) rep((1:10)^2,each=3)) ## [1] 1 1 1 4 4 4 9 9 9 16 16 16 25 25 25 36 36 36 49 ## [20] 49 49 64 64 64 81 81 81 100 100 100 paste(letters[1:16],(1:16),sep="") ## [1] "a1" "b2" "c3" "d4" "e5" "f6" "g7" "h8" "i9" "j10" "k11" "l12" ## [13] "m13" "n14" "o15" "p16" ## 1b) Generate the matrix... [2 points each] t.matrix <- matrix(seq(5.5,15,by=0.5),nrow=2,byrow=TRUE) ## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] ## [1,] 5.5 6 6.5 7 7.5 8 8.5 9 9.5 10 ## [2,] 10.5 11 11.5 12 12.5 13 13.5 14 14.5 15 t.matrix%*%t(t.matrix) ## [,1] [,2] ## [1,] 621.25 1008.75 ## [2,] 1008.75 1646.25 ###################################### ## QUESTION 2 Selecting Elements [10 points] ###################################### ## 2a) [2 points] d.pcb <- read.table("http://stat.ethz.ch/~stahel/courses/R/pcb.txt", header=TRUE) summary(d.pcb) ## There are 6 variables (year, x, y, coast, depth and pcb). ## Together there are 122 observations/lines, 42 for year 1991, 49 for ## 1996 and 31 for 2000. ## 2b) [2 points] d.pcb[8:20,2:3] d.pcb[c(8:20),c(2:3)] d.pcb[c(8:20),c("x","y")] ## x y ## 8 595950.4 5791157 ## 9 613405.5 5858019 ## 10 612495.8 5858035 ## 11 610341.4 5858969 ## 12 580113.0 5766380 ## 13 584584.0 5779119 ## 14 574357.9 5773522 ## 15 549571.8 5756380 ## 16 534577.6 5769657 ## 17 525862.4 5693230 ## 18 523440.5 5695313 ## 19 673131.4 5946486 ## 20 674718.0 5931342 ## 2c) [2 points] table(d.pcb[,"pcb"]>5) sum(d.pcb[,"pcb"]>5) ## 9 observations have pcb>5 t.i <- d.pcb[,"pcb"]>5 ## create logical vector with TRUE/FALSE for ## each observation/line d.pcb[t.i,c("x","y","pcb")] ## select only those lines where t.i = ## TRUE, line numbers are returned ## automatically ## x y pcb ## 3 507679.8 5729817 7.6 ## 7 588834.8 5795178 5.8 ## 10 612495.8 5858035 5.2 ## 13 584584.0 5779119 11.6 ## 27 604087.1 5814244 13.1 ## 28 607645.6 5831124 7.2 ## 35 569497.6 5745289 7.5 ## 36 570532.5 5747621 6.3 ## 39 674879.5 5928471 7.0 ## 2d) [2 points] t.pcb <- d.pcb[,"pcb"] ## extract pcb values median(t.pcb) ## calculate median median(d.pcb[,6]) ## same result in one line median(d.pcb[,"pcb"]) ## same result in one line ## [1] 1.6 ## 2e) [2 points] ## solution remembering result of 2a t.y1991 <- d.pcb[1:42,"pcb"] ## create vector with pcb values for each ## year t.y1996 <- d.pcb[43:91,"pcb"] t.y2000 <- d.pcb[92:122,"pcb"] median(t.y1991) ## [1] 3.05 median(t.y1996) ## [1] 1.4 median(t.y2000) ## [1] 0.9 ## same using logical vectors median(d.pcb[d.pcb[,"year"]=="y1991","pcb"]) median(d.pcb[d.pcb[,"year"]=="y1996","pcb"]) median(d.pcb[d.pcb[,"year"]=="y2000","pcb"]) ###################################### ## QUESTION 3 Boxplots and Statistical Tests [9 points] ###################################### ## 3a) [3 points] boxplot( d.pcb[, "pcb"], ylab = "normalised PCB content", main = "PCB in sediments of North Sea" ) ## 3b) [3 points] boxplot( pcb~year, d.pcb, log = "y", notch = TRUE, ylab = "normalised PCB content", main = "PCB in sediments of North Sea in 3 years" ) ## Notches indicate 95% confidence interval for median. If they do not ## overlap, medians are significantly different. This seems to be the ## case between y1991 and y1996, and between y1991 and y2000. ## 3c) [3 points] t.1996 <- d.pcb[d.pcb[,"year"]=="y1996","pcb"] t.2000 <- d.pcb[d.pcb[,"year"]=="y2000","pcb"] ## (*) The number of observations in year 1996 and 2000 differs, so ## the observatinos are not paired. wilcox.test(t.1996,t.2000) ## Wilcoxon rank sum test with continuity correction ## data: t.1996 and t.2000 ## W = 872, p-value = 0.2680 ## alternative hypothesis: true location shift is not equal to 0 ## Warning message: ## In wilcox.test.default(t.1996, t.2000) : ## cannot compute exact p-value with ties ## Warning only tells you that the rank sum was equal for several ## observations. t.test(t.1996,t.2000) ## Welch Two Sample t-test ## data: t.1996 and t.2000 ## t = 1.2686, df = 72.74, p-value = 0.2086 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -0.1792691 0.8070505 ## sample estimates: ## mean of x mean of y ## 1.581633 1.267742 ## Interpretation: Both the Wilcoxon test (p-value = 0.268) and the ## t-test (p-value = 0.209) do not reject the hypothesis that the ## means of the PCB observations in year 1996 and 2000 are the ## same. We cannot rule out that their true mean is the same (although ## the PCB means from the observations do slightly differ (1.58 versus ## 1.27) - but this coul be caused by chance! ###################################### ## QUESTION 4 Scatterplots [9 points] ###################################### ## 4a) [3 points] plot(y~x,data=d.pcb,asp=1,pch=as.numeric(d.pcb$year)) ## different symbols plot( y ~ x, data = d.pcb, asp = 1, col = as.numeric( d.pcb[,"year"] ), xlab = "easting [UTM]", ylab = "northing [UTM]", main = "Positions of sampling locations" ) legend( "topright", pch = 1, col = 1:nlevels( d.pcb[,"year"] ), legend = levels( d.pcb[,"year"] ), bty = "n" ) ## 4b) [3 points] plot( d.pcb[,c("x","y")], asp=1, col=as.numeric( d.pcb[,"year"] ), cex=sqrt( d.pcb[,"pcb"] )) legend("topleft",legend=as.character(levels(d.pcb$year)), col=c(1:nlevels(d.pcb$year)),pch=1,bty="n") t.legend <- c( 0.2,0.5,1,2,5,10 ) legend( "bottomright", x.intersp=1.5, y.intersp=1.7, legend=c( "pcb [mg/kg]", as.character(t.legend) ), pch=1, pt.cex=c( NA, sqrt(t.legend)), bty="n") ## simpler, with just one colour plot( d.pcb[, c("x", "y")], asp = 1, cex = sqrt( d.pcb[, "pcb"] ), col = "red" ) t.pcb <- c( 0.2, 0.5, 1, 2, 5, 10 ) legend( "topleft", x.intersp = 1.5, y.intersp = 1.7, col = "red", legend = c( "pcb content", as.character( t.pcb ) ), pch = 1, pt.cex = c( NA, sqrt( t.pcb ) ), bty = "n" ) ## Interpretation: The highest concentrations are found close to the ## mainland. Also, the concentrations are highest for the year 1991, ## in line with the boxplots and tests from Question 3. ## 4c) [3 points] plot( pcb ~ coast, data = d.pcb[ d.pcb[, "year"] == "y1991",], ylim = c( 0.1, 12 ), log = "xy" ) points( d.pcb[ d.pcb[, "year"] == "y2000", "coast"], d.pcb[ d.pcb[, "year"] == "y2000", "pcb"], col = "blue", pch = 3 ) abline( h = c( 2, 5 ), lty = "dotted", col = "red" ) table( d.pcb[, "year"], cut( d.pcb[, "pcb"], c( 0, 2, 1000 ) ) ) table( cut( d.pcb[d.pcb[,"year"] == "y1991", "pcb"], c( 0, 2, 1000 ) ) ) table( cut( d.pcb[d.pcb[,"year"] == "y2000", "pcb"], c( 0, 2, 1000 ) ) ) ############ ## Alternative ways to get to the same results: ## make a subselection of the data t.selection <- d.pcb[d.pcb[,"year"]!="y1996",] t.selection$year <- factor(t.selection[,"year"], exclude="y1996") ## remove level for year ## 1996 plot(pcb~coast,data=t.selection,col=as.numeric(t.selection$year), log="xy", xlab="Distance from the coast [m], logarithmic scale", ylab="PCB [mg/kg], logarithmic scale", main="PCB concentrations") legend("topright",legend=as.character(levels(t.selection$year)), col=c(1:as.numeric(nlevels(t.selection$year))),pch=1,bty="n") abline(h=c(2,5),lty=2) ## A "stepwise" solution to create the plot: t.pcb1991 <- d.pcb[d.pcb[,"year"]=="y1991","pcb"] t.pcb2000 <- d.pcb[d.pcb[,"year"]=="y2000","pcb"] t.dist1991 <- d.pcb[d.pcb[,"year"]=="y1991","coast"] t.dist2000 <- d.pcb[d.pcb[,"year"]=="y2000","coast"] plot(t.pcb1991~t.dist1991,col=1,log="xy", xlab="Distance from the coast [m], logarithmic scale", ylab="PCB [mg/kg], logarithmic scale", main="PCB concentrations") points(t.pcb2000~t.dist2000,col=3) legend("topright",legend=c("1991","2000"), col=c(1,3),pch=1,bty="n") abline(h=c(2,5),lty=2) # max(c(t.pcb1991,t.pcb2000)) ## to find uppermost cut-point table(cut(c(t.pcb1991,t.pcb2000),c(0,2,15))) table(cut(c(t.pcb1991,t.pcb2000),c(0,2,5,15))) ## 34 measurements exceed a concentration of 2 g/kg, out of which 9 ## exceed 5 g/kg. ######################################