##------------------------------------------------------------------------------#
## Statistical Analysis of the PHALANX Project.                                  #
##                                                                               #
## MAS in Medizinphysik                                                          #
## Blockkurs "Computer in der Medizin"                                           #
##------------------------------------------------------------------------------#

##------------------------------------------------------------------------------#
## Directories and filenames                                                     #
##------------------------------------------------------------------------------#
dir.base <- "D:/Martin/Documents/NDS/Computerkurs_2007/Statistik/R"
## MM:
dir.base <- "/u/maechler/R/Meetings-Kurse-etc/2007-Biomechanik/R"
dir.csv  <- file.path(dir.base, "csv")
## Students:
dir.base <- "http://stat.ethz.ch/Teaching/maechler/Phalanx"
dir.csv  <- dir.base
## Final place:
## dir.base <- "http://www.biomechanics.ethz.ch/phalanx/download"

if(FALSE) { ## Students can ``bootstrap'' themselves {get the data} via
    FF <- tempfile()
    download.file("http://stat.ethz.ch/Teaching/maechler/Phalanx/read-etc.R", FF)
    source(FF, echo = TRUE)
}

##------------------------------------------------------------------------------#
## Read and merge data.                                                          #
##------------------------------------------------------------------------------#

## Simple utility function:
readTabPhalanx <- function(filename) {
    head1 <- read.table(filename, nrows = 1, sep = ";", as.is = TRUE)
    dat   <- read.table(filename, skip  = 2, sep = ";", col.names = head1)
    stopifnot(names(dat)[1] == "SPECIMEN")
    ## use "SPECIMEN" as row names and drop it as a variable :
    rownames(dat) <- as.character(dat[,"SPECIMEN"]) # character, not factor
    ## return
    dat[, -1] # '-1' : leave away the first variable, i.e. 'SPECIMEN'
}

### -------------- Now read the 4 files ----------------------------------

##----------------------------------#
## Anthropometric Bone Measurements #
## using digital calipers           #
##----------------------------------#
d.anthro <- readTabPhalanx(file.path(dir.csv, "PHALANX_Anthropometric.csv"))

##--------------------------------------#
## MicroCT Distal 25% Bone Measurements #
## 25% of total length analysed.        #
##--------------------------------------#
d.dist <- readTabPhalanx(file.path(dir.csv, "PHALANX_Distal_25percent.csv"))

##----------------------------------------#
## Mechanical Testing of Intact Phalanges #
## in three-point bending                 #
##----------------------------------------#
d.mech <- readTabPhalanx(file.path(dir.csv, "PHALANX_Mechanical_Testing.csv"))

##----------------------------------------------------#
## MicroCT Trabecular Bone Spherical VOI Measurements #
## Sphere totally located within the trabecular       #
## bone in the distal part of the phalanges.          #
##----------------------------------------------------#
d.spher <- readTabPhalanx(file.path(dir.csv, "PHALANX_Spherical_VOI.csv"))
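## A quick sanity check on the freshly read data (a minimal sketch): all four
## data frames should describe the same 26 specimens, in the same order.
dList <- list(anthro = d.anthro, dist = d.dist, mech = d.mech, spher = d.spher)
sapply(dList, dim)    # expect 26 rows in each
sapply(dList, function(d) identical(rownames(d), rownames(d.anthro)))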
## Now we have the "base" data --- in 4 data sets :
str(d.anthro) # 9 var
##- 'data.frame':   26 obs. of  9 variables:
##-  $ SEX             : Factor w/ 2 levels "f","m": 1 2 2 2 1 1 1 1 1 2 ...
##-  $ AGE             : int  56 82 69 73 88 65 87 88 82 77 ...
##-  $ ML.Base.Diameter: num  17.1 19.4 19.4 18.2 17.5 ...
##-  $ ML.Body.Diameter: num  9.36 11.04 10.80 10.68 9.97 ...
##-  $ ML.Head.Diameter: num  11.1 13.3 13.5 13.3 12.0 ...
##-  $ AP.Base.Diameter: num  12.5 13.6 13.3 13.3 12.3 ...
##-  $ Ap.Body.Diameter: num  6.06 7.2 7.69 7.97 7.55 6.86 6.88 6 7.13 6.44 ...
##-  $ AP.Head.Diameter: num  7.94 9.9 9.39 9.43 8.59 7.49 7.74 7.51 8.43 8.66 ...
##-  $ Max..Length     : num  41.6 47.5 47.5 45.3 42.4 ...

summary(d.anthro)
summary(d.mech) # 3 var
##        Fu              Fy               S
##  Min.   :170.4   Min.   : 34.10   Min.   : 21.73
##  1st Qu.:389.8   1st Qu.: 88.55   1st Qu.: 58.20
##  Median :511.5   Median :106.85   Median : 69.44
##  Mean   :504.6   Mean   :107.49   Mean   : 69.43
##  3rd Qu.:586.9   3rd Qu.:122.08   3rd Qu.: 81.22
##  Max.   :822.2   Max.   :184.90   Max.   :108.45

apply(d.dist, 2, mean)
str(d.dist) # 12 var
##- 'data.frame':   26 obs. of  12 variables:
##-  $ X25..Length  : num  10.4 11.9 11.9 11.3 10.6 ...
##-  $ Nr.of.Slices.: int  306 349 349 333 311 280 283 284 301 333 ...
##-  $ TV           : num  1100 1627 1684 1458 1202 ...
##-  $ BV           : num  245 428 443 346 307 ...
##-  $ BMC          : num  0.439 0.766 0.793 0.619 0.55 0.327 0.461 0.379 0.372 0.582 ...
##-  $ BS           : num  1897 2730 2905 3026 2120 ...
##-  $ BV.TV.       : num  22.3 26.3 26.3 23.7 25.6 ...
##-  $ BS.TV        : num  1.73 1.68 1.73 2.08 1.76 ...
##-  $ BS.BV        : num  7.74 6.38 6.55 8.75 6.90 ...
##-  $ dS.dr        : num  -981 -3065 -3589 -2211 -1905 ...
##-  $ TBPf         : num  -0.517 -1.123 -1.236 -0.731 -0.898 ...
##-  $ SMI          : num  -0.401 -1.056 -1.132 -0.501 -0.782 ...

str(d.spher) # 13 var
##- 'data.frame':   26 obs. of  13 variables:
##-  $ TV    : num  103 192 169 151 110 ...
##-  $ BV    : num  28.7 55.0 52.3 38.3 33.4 ...
##-  $ BS    : num  397 667 718 668 455 ...
##-  $ BV.TV.: num  27.9 28.6 31.0 25.3 30.4 ...
##-  $ BS.TV : num  3.86 3.46 4.25 4.41 4.15 ...
##-  $ BS.BV : num  13.8 12.1 13.7 17.4 13.6 ...
##-  $ Tb.N  : num  1.93 1.73 2.13 2.21 2.08 ...
##-  $ Tb.Th : num  0.145 0.165 0.146 0.115 0.147 0.122 0.157 0.151 0.123 0.186 ...
##-  $ Tb.Sp : num  0.374 0.412 0.325 0.338 0.335 0.347 0.307 0.369 0.535 0.406 ...
##-  $ DA    : num  1.23 1.25 1.30 1.31 1.20 ...
##-  $ X.H1. : num  0.471 0.528 0.424 0.39 0.445 0.425 0.429 0.461 0.57 0.541 ...
##-  $ X.H2. : num  0.579 0.658 0.551 0.512 0.532 0.53 0.513 0.577 0.763 0.674 ...
##-  $ X.H3. : num  0.508 0.552 0.447 0.468 0.469 0.456 0.448 0.523 0.66 0.563 ...

## merge() ---- useful in general.
## ----- Here it is almost not needed, since we always have the same 26 obs.
##       in the *identical* order ---> hence we could just use cbind(.)
identical(row.names(d.spher),  row.names(d.dist))
identical(row.names(d.anthro), row.names(d.dist))
identical(row.names(d.mech),   row.names(d.dist))

rnMerge <- function(d1, d2, ...) {
    ## Purpose: merge by row.names -- and turn the "Row.names" column back into row names
    ## ----------------------------------------------------------------------
    ## Arguments: d1, d2: two data frames with common row names
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler, Date: 13 Mar 2007
    d <- merge(d1, d2, by = "row.names", ...)
    ## set row names and omit them as a variable:
    rownames(d) <- d[, 1]
    d[, -1]
}

str(d.A.M <- rnMerge(d.anthro, d.mech))

## Martin Stauber's :
d.distal    <- rnMerge(d.A.M, d.dist)
d.spherical <- rnMerge(d.A.M, d.spher)
str(d.distal)    # 24 variables
str(d.spherical) # 25 variables

## All of them:
d.All <- rnMerge(d.distal, d.spher, suffixes = c("_d", "_s"))
str(d.All) # 37 variables

## Simplify the variable names:
names(d.All) <- gsub("\\.Diameter$", ".dm",  # shorten ..
                gsub("\\.$", "",             # omit trailing "."
                gsub("\\.([._])", "\\1",     # substitute ".." by "." and "._" by "_"
                     names(d.All))))
names(d.All) <- sub("Nr.of.", "Nr_", names(d.All)) # another shortening
names(d.All) <- sub("^Ap", "AP",     names(d.All)) # <<-- fix typo in file

## Ok, make the above into a simple function :
mySub <- function(nms) {
    sub("Nr.of.", "Nr_",            # shorten
    sub("^Ap", "AP",                # <<-- fix typo in file
    gsub("\\.Diameter$", ".dm",     # shorten ..
    gsub("\\.$", "",                # omit trailing "."
    gsub("\\.([._])", "\\1",        # substitute ".." by "." and "._" by "_"
         nms)))))
}
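## A tiny check of what mySub() actually does -- the input names below are
## hypothetical examples, chosen only to exercise each substitution rule:
mySub(c("Ap.Body.Diameter", "Nr.of.Slices.", "BV.TV._d", "Max..Length"))
## should give:  "AP.Body.dm"  "Nr_Slices"  "BV.TV_d"  "Max.Length"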
## All variables but the "mechanical" ones:
d.X <- rnMerge(d.anthro, rnMerge(d.dist, d.spher, suffixes = c("_d", "_s")))
names(d.X) <- mySub(names(d.X))
str(d.X)

###------------- end of the read-etc.R part ------------------------------

##------------------------------------------------------------------------------#
## Descriptive statistics.                                                       #
##------------------------------------------------------------------------------#

## Numerical summaries ...
sapply(d.All, mean)       # warning: cannot take the mean of 'SEX'
sapply(d.All[, -1], mean) # ok, but this is better
t(sapply(d.All[, -1], quantile))
## or even better {also "table"s 'SEX'}:
summary(d.All)
## This gives a big table (maybe nicer)
sapply(d.All[, -1], summary)

dim(d.All[ d.All[,"SEX"] == "f" , ])
## slightly more readable :
dim(subset(d.All, SEX == "f"))

## women <-> men
summary(subset(d.All, SEX == "f"))
summary(subset(d.All, SEX == "m"))
## or "nicer"
by(d.All[, -1], d.All[,"SEX"], mean)
by(d.All[, -1], d.All[,"SEX"], sd)   ## S[tandard] D[eviation]
if(FALSE) ## This fails, since median() cannot work here :
    by(d.All[, -1], d.All[,"SEX"], median)
by(d.All[, -1], d.All[,"SEX"], mean, trim = 0.10) # 10%-trimmed mean
by(d.All[, -1], d.All[,"SEX"], summary)

cor(d.All[,-1]) ## correlation matrix {dropping "SEX"}
## or
Cm <- cor(data.matrix(d.All))
Cm
## and much more; but that is not really exciting;

### ==========
### Graphics instead
### ==========

## Visualize the matrix of pairwise correlations:
image(Cm)
## or even nicer (!)
heatmap(Cm, symm = TRUE) ##-- after variable re-ordering via clustering
title("heatmap(cor( data.matrix( d.All ) ), symm = TRUE)")
## dev.print(command = "prt.Cfolie")

## do it again, storing the result (including the dendrogram):
hmCor <- heatmap(Cm, symm = TRUE, keep.dendro = TRUE)
## what have we got? use str():
str(hmCor, max = 2)
## The variable ordering as chosen by the hierarchical clustering algorithm:
names(d.All)[ hmCor$rowInd ]
## The clustering dendrogram (tree):
hmCor$Rowv
plot(hmCor$Rowv)
plot(hmCor$Rowv, horiz = TRUE)
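## A small sketch of going one step further with the clustering: cut the tree
## into groups of "similar" variables (the choice k = 4 is arbitrary, purely
## for illustration):
hcCor <- as.hclust(hmCor$Rowv)  # dendrogram -> 'hclust'
grp   <- cutree(hcCor, k = 4)
split(names(grp), grp)          # variable names, grouped by cluster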
plot(Fu ~ SEX, data = d.All, main = "Boxplots", sub = "this is a subtitle")
plot(Fu ~ SEX, data = d.All, main = "Boxplots", notch = TRUE)

### More Plots:

## All pairwise scatter plots: really too many here
pairs(d.All, gap = 0, cex = 0.5)

## Plotting selected variables:
d.sub1 <- d.All[, c("BV.TV_d", "BS.BV_d", "TBPf", "SMI",
                    ## "Mechanical" ones:
                    "Fu", "Fy", "S") ]

### Taken from help(pairs) :
## put (absolute) correlations on the upper panels,
## with size proportional to the correlations.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste(prefix, txt, sep = "")
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(d.sub1, lower.panel = panel.smooth, upper.panel = panel.cor)
##                                        ^^^^^^^^^

## Now use sex for colour coding :
pairs(d.sub1, panel = panel.smooth, col = as.numeric(d.All[,"SEX"]))

## Comparison between Distal_25% and VOI :
(i.ds <- grep("_[ds]$", names(d.All))) # column numbers
names(d.All)[i.ds]
## sort such that the 'pairs' are contiguous:
matrix(names(d.All)[i.ds], ncol = 2)
## transpose and use as a vector:
(n.ds <- as.vector(t(matrix(names(d.All)[i.ds], ncol = 2))))
d.vs.s <- d.All[ , n.ds]
pairs(d.vs.s, panel = panel.smooth, gap = .01, col = as.numeric(d.All[,"SEX"]))

hm.ds <- heatmap(cor(d.vs.s, method = "spearman"), symm = TRUE, keep.dendro = TRUE)
str(hm.ds, max = 2)
plot(hm.ds $ Rowv, main = "hierarchical clustering of VOI vs Distal 25%")

library(sfsmisc) # for mult.fig(), slightly simpler/nicer than par(mfrow = .)

rr <- mult.fig(ncol(d.anthro), main = "d.anthro data")
for(i in seq(along = d.anthro)) {
    v  <- d.anthro[, i]
    nn <- names(d.anthro)[i]
    if(is.numeric(v))
        ## boxplot(v, main = nn)
        boxplot(split(v, d.anthro[,"SEX"]), main = nn)
    else plot(table(v), main = nn, xlab = '', col = 2)
}

## Make the above into a simple function
plotUni <- function(dat, numericplot = boxplot(v, main = nn),
                    namdat = deparse(substitute(dat)))
{
    ## Purpose: univariate plots for all variables in a data frame
    ## ----------------------------------------------------------------------
    ## Arguments: data frame 'dat' which must have a "SEX" variable
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler, Date: Mar 2007
    rr <- sfsmisc::mult.fig(ncol(dat), main = paste(namdat, " data"))
    ## reset the graphic window setup on exiting this function:
    on.exit(par(rr$old.par))
    for(i in seq(along = dat)) {
        v  <- dat[, i]
        nn <- names(dat)[i]
        if(is.numeric(v))
            eval(substitute(numericplot))
        else plot(table(v), main = nn, xlab = '', col = 2)
    }
    invisible()
}
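## How does the 'numericplot' argument work?  plotUni() never evaluates it as a
## value; eval(substitute(numericplot)) recovers the *expression* the caller
## wrote and re-evaluates it for each column, where 'v' and 'nn' exist.  A toy
## sketch of the same idea (hypothetical helper, only for illustration):
evalPerColumn <- function(dat, expr) {
    for(i in seq(along = dat)) {
        v <- dat[, i]                   # 'expr' may refer to 'v'
        print(eval(substitute(expr)))   # evaluated in this function's frame
    }
}
evalPerColumn(d.mech, mean(v))          # column means
evalPerColumn(d.mech, quantile(v, 0.9)) # column-wise 90% quantiles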
plotUni(d.anthro) # default: simple boxplot
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn))
## again, now with "significance notches"
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn, notch = TRUE))

## For the exercise solution
setwd("~/R/Meetings-Kurse-etc/2009-Biomechanik")
pdf.latex("boxplot_anthro.pdf", width = 8, height = 7)
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn))
pdf.end()

## Kernel density
plotUni(d.anthro, plot(density(v), main = nn))

plotUni(d.mech)
plotUni(d.mech, boxplot(split(v, d.anthro[,"SEX"]), main = nn))
plotUni(d.mech, plot(density(v), main = nn))
plotUni(d.anthro, hist(v, main = nn))

gender <- as.numeric(d.All[,"SEX"])
(gCol <- c("blue", "tomato")[gender])
plotUni(d.anthro, scatter.smooth(v, d.All[,"Fu"], ylab = "Fu",
                                 col = gCol, pch = 1 + gender,
                                 main = paste("Fu vs.", nn)))

## only the women :
plotUni(subset(d.mech, subset = d.anthro[,"SEX"] == "f"),
        plot(density(v), main = nn))

## both: be sophisticated:
plotUni(d.mech, {
    L <- split(v, d.anthro[,"SEX"])
    plot(density(v), main = nn, lty = 2, col = "gray")
    for(n in 1:2) lines(density(L[[n]]), col = n+1)
})

## y-scale : a bit more refined still
plotUni(d.mech, {
    L  <- split(v, d.anthro[,"SEX"])
    dL <- lapply(L, density)
    dv <- density(v)
    plot(dv, main = nn, lty = 2, col = "gray",
         ylim = c(0, max(dv$y, dL[[1]]$y, dL[[2]]$y)))
    for(n in 1:2)
        lines(density(L[[n]]), col = n+1)
    legend("topright", c("total", "f emale", "m ale"),
           col = c("gray", palette()[2:3]), lty = c(2,1,1), inset = 0.02)
})

p.uniDens <- function(dat) {
    ## Purpose: plot univariate (and male / female) densities
    ## ----------------------------------------------------------------------
    ## Arguments: data frame
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler
    plotUni(dat, {
        L  <- split(v, d.anthro[,"SEX"])
        dL <- lapply(L, density)
        dv <- density(v)
        plot(dv, main = nn, lty = 2, col = "gray",
             ylim = c(0, max(dv$y, dL[[1]]$y, dL[[2]]$y)))
        for(n in 1:2)
            lines(density(L[[n]]), col = n+1)
        legend("topright", c("total", "f emale", "m ale"),
               col = c("gray", palette()[2:3]), lty = c(2,1,1), inset = 0.02)
    },
    namdat = deparse(substitute(dat))) # trick to get the correct title
}

p.uniDens(d.mech)
p.uniDens(d.anthro)
p.uniDens(d.spher)
p.uniDens(d.dist)

###== Now go for regression: ===================================================

## use all variables for the prediction of Fu, apart from Fy and S :
str(dat <- subset(d.All, select = c(-Fy, -S)))
## also omit the diameters and "lengths":
dim(dat2 <- dat[, -grep("\\.(dm|Length)$", names(dat))]) # still 27 variables

m1 <- lm(Fu ~ . , data = dat2) # perfect fit: we have too many variables!
m1

## Now try to add them stepwise
m0 <- lm(Fu ~ 1, data = dat)             # fit the mean only
summary(m1 <- lm(Fu ~ SEX, data = dat))  # fit "SEX + intercept"
confint(m1)

## the cumbersome way:
(ch.terms <- paste("Fu ~", paste(names(dat)[names(dat) != "Fu"], collapse = "+")))
step(m0, scope = as.formula(ch.terms))
## or more elegantly
(mst <- step(m0, scope = terms(Fu ~ ., data = dat)))
##------------------------------------------------------
library(MASS) # a "recommended" (and hence 'standard') package
stepAIC(m0, scope = terms(Fu ~ ., data = dat)) # gives the same here

summary(mst) # only one strongly significant term
## Diagnostic plots
plot(mst)
plot(residuals(mst) ~ dat[,"dS.dr"], col = as.numeric(dat[,"SEX"]))
abline(h = 0, lty = 3, col = "gray") # 0-line
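## A small sketch of leave-one-out cross-validation for the step()-selected
## formula, as a rough check of its prediction error (note: the variable
## selection itself is not re-done inside the loop, so this is still somewhat
## optimistic; the helper name and the reference model are only illustrative):
loocvRMSE <- function(form, data) {
    n   <- nrow(data)
    err <- numeric(n)
    for(i in 1:n) {
        fit    <- lm(form, data = data[-i, ])
        err[i] <- data[i, "Fu"] - predict(fit, newdata = data[i, , drop = FALSE])
    }
    sqrt(mean(err^2))
}
loocvRMSE(formula(mst), dat) # the stepwise-selected model
loocvRMSE(Fu ~ SEX,     dat) # a simple reference model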
## Alternatively, the 'leaps' extension package tries "all possible subsets":
library(leaps)
ma1 <- regsubsets(Fu ~ ., data = dat, nvmax = 20) ## takes a little while (< 1 min)
(sm1 <- summary(ma1))
myTit <- "regsubsets(Fu ~ ., data = dat)"
plot(ma1, main = myTit)
plot(ma1, main = myTit, scale = "adjr2")
## Plot the "criterion" against the number of variables
(nvar <- as.numeric(row.names(sm1 $ which)))
plot(nvar, sm1 $ adjr2, xlab = "#{var} ( = p - 1 )", main = myTit)
plot(nvar, sm1 $ bic,   xlab = "#{var} ( = p - 1 )", main = myTit)
## --> both these plots suggest that all variables are useful ... hmm, funny
colSums(sm1$which)

## Now record the 3 best variable subsets for each size (up to size 12):
ma3 <- regsubsets(Fu ~ ., data = dat, nvmax = 12, nbest = 3)
summary(ma3)
plot(ma3, main = myTit)
plot(ma3, scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

sm3 <- summary(ma3, matrix.logical = TRUE)
(nvar <- as.numeric(row.names(sm3 $ which)))
plot(nvar, sm3 $ adjr2, xlab = "#{var} ( = p - 1 )", main = myTit)

cbind(sort( colSums(sm3$which), decreasing = TRUE ))
##             [,1]
## (Intercept)   36
## Tb.Th         21
## TV_s          19
## BV_s          19
## BS.BV_s       18
## X25.Length    14
## BS.TV_d       13
## dS.dr         12
## Tb.Sp         11
## X.H1          11
## Max.Length    10
## BS_d          10
## Nr_Slices      9
## SEXm           8
## TV_d           7
## ML.Base.dm     6
## X.H2           6
## AP.Body.dm     5
## X.H3           5
## ML.Body.dm     4
## TBPf           4
## DA             4
## AGE            3
## AP.Base.dm     3
## SMI            3
## BV_d           2
## BMC            2
## BV.TV_s        2
## BS.BV_d        1
## BS_s           1
## Tb.N           1
## ML.Head.dm     0
## AP.Head.dm     0
## BV.TV_d        0
## BS.TV_s        0

## Without the meaningless Nr_Slices
ma <- regsubsets(Fu ~ ., data = dat, force.out = c("Nr_Slices"),
                 nvmax = 12, nbest = 3)
summary(ma)
plot(ma, main = myTit)
plot(ma, scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

## Without the meaningless Nr_Slices -- and without dS.dr -- what else becomes important?
ma. <- regsubsets(Fu ~ ., data = dat, force.out = c("dS.dr", "Nr_Slices"),
                  nvmax = 12, nbest = 3)
summary(ma.)
plot(ma., main = myTit)
plot(ma., scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

###------------ Further modelling approaches instead of linear regression ------------

library(rpart)
mrp <- rpart(Fu ~ . , data = subset(d.All, select = c(-Fy, -S)))
mrp ## --> again uses dS.dr exclusively

##--- Another alternative:
## Use --- the lasso --- to find relevant variables;
## even for all 2-way interactions:
library(lars)
## lars() needs a traditional X matrix :
dat <- subset(d.All, select = c(-Fy, -S))
ff <- Fu ~ .
dX <- model.matrix(ff, model.frame(ff, dat))
str(dX)
dX[, 1:6] # the first 6 variables

## All possible 2-way interactions:
dX2 <- model.matrix(Fu ~ .^2, model.frame(Fu ~ .^2, dat))
str(dX2) ## 26 * 596 (!!)

l1 <- lars(dX2, y = dat[,"Fu"], use.Gram = FALSE)
plot(l1) ## oops -- use Martin's improved plot --> ./lars-improved-MM.R

if(FALSE) ## Or Nicolai's
    library(relaxo)
## but that is definitely *NOT* yet on the students' machines
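## A hedged follow-up sketch: look at which of the ~600 terms the lasso picks up
## early, by extracting coefficients at a small fraction of the full L1-norm
## path (the value s = 0.05 is an arbitrary choice for illustration):
beta <- coef(l1, s = 0.05, mode = "fraction")
beta[beta != 0] # the few terms that enter the path first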