##------------------------------------------------------------------------------#
## Statistical Analysis of the PHALANX Project.                                  #
##                                                                               #
## MAS in Medizinphysik                                                          #
## Blockkurs "Computer in der Medizin"                                           #
##------------------------------------------------------------------------------#

##------------------------------------------------------------------------------#
## Directories and filenames                                                     #
##------------------------------------------------------------------------------#
dir.base <- "D:/Martin/Documents/NDS/Computerkurs_2007/Statistik/R"
## MM:
dir.base <- "/u/maechler/R/Meetings-Kurse-etc/2007-Biomechanik/R"
dir.csv  <- file.path(dir.base, "csv")
## Students:
dir.base <- "http://stat.ethz.ch/Teaching/maechler/Phalanx"
dir.csv  <- dir.base
## Final place:
## dir.base <- "http://www.biomechanics.ethz.ch/phalanx/download"

if(FALSE) { ## Students can ``bootstrap'' themselves {get the data} via
    FF <- tempfile()
    download.file("http://stat.ethz.ch/Teaching/maechler/Phalanx/read-etc.R", FF)
    source(FF, echo = TRUE)
}

##------------------------------------------------------------------------------#
## Read and merge data.                                                          #
##------------------------------------------------------------------------------#

## Simple utility function:
readTabPhalanx <- function(filename) {
    head1 <- read.table(filename, nrows = 1, sep = ";", as.is = TRUE)
    dat   <- read.table(filename, skip  = 2, sep = ";", col.names = head1)
    stopifnot(names(dat)[1] == "SPECIMEN")
    ## use "SPECIMEN" as row names and drop it as a variable :
    rownames(dat) <- as.character(dat[,"SPECIMEN"]) # character, not factor
    ## return
    dat[, -1] # '-1' : leave away the first variable, i.e. 'SPECIMEN'
}

### -------------- Now read the 4 files ----------------------------------

##----------------------------------#
## Anthropometric Bone Measurements #
## using digital calipers           #
##----------------------------------#
d.anthro <- readTabPhalanx(file.path(dir.csv, "PHALANX_Anthropometric.csv"))

##--------------------------------------#
## MicroCT Distal 25% Bone Measurements #
## 25% of total length analysed.        #
##--------------------------------------#
d.dist <- readTabPhalanx(file.path(dir.csv, "PHALANX_Distal_25percent.csv"))

##----------------------------------------#
## Mechanical Testing of Intact Phalanges #
## in three-point bending                 #
##----------------------------------------#
d.mech <- readTabPhalanx(file.path(dir.csv, "PHALANX_Mechanical_Testing.csv"))

##----------------------------------------------------#
## MicroCT Trabecular Bone Spherical VOI Measurements #
## Sphere totally located within the trabecular       #
## bone in the distal part of the phalanges.          #
##----------------------------------------------------#
d.spher <- readTabPhalanx(file.path(dir.csv, "PHALANX_Spherical_VOI.csv"))
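## A quick sanity check on the freshly read data (a minimal sketch): all four
## data frames should describe the same 26 specimens, in the same order.
dList <- list(anthro = d.anthro, dist = d.dist, mech = d.mech, spher = d.spher)
sapply(dList, dim)    # expect 26 rows in each
sapply(dList, function(d) identical(rownames(d), rownames(d.anthro)))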
## Now we have the "base" data --- in 4 data sets :
str(d.anthro) # 9 var
##- 'data.frame':   26 obs. of  9 variables:
##-  $ SEX             : Factor w/ 2 levels "f","m": 1 2 2 2 1 1 1 1 1 2 ...
##-  $ AGE             : int  56 82 69 73 88 65 87 88 82 77 ...
##-  $ ML.Base.Diameter: num  17.1 19.4 19.4 18.2 17.5 ...
##-  $ ML.Body.Diameter: num  9.36 11.04 10.80 10.68 9.97 ...
##-  $ ML.Head.Diameter: num  11.1 13.3 13.5 13.3 12.0 ...
##-  $ AP.Base.Diameter: num  12.5 13.6 13.3 13.3 12.3 ...
##-  $ Ap.Body.Diameter: num  6.06 7.2 7.69 7.97 7.55 6.86 6.88 6 7.13 6.44 ...
##-  $ AP.Head.Diameter: num  7.94 9.9 9.39 9.43 8.59 7.49 7.74 7.51 8.43 8.66 ...
##-  $ Max..Length     : num  41.6 47.5 47.5 45.3 42.4 ...

summary(d.anthro)
summary(d.mech) # 3 var
##        Fu              Fy               S
##  Min.   :170.4   Min.   : 34.10   Min.   : 21.73
##  1st Qu.:389.8   1st Qu.: 88.55   1st Qu.: 58.20
##  Median :511.5   Median :106.85   Median : 69.44
##  Mean   :504.6   Mean   :107.49   Mean   : 69.43
##  3rd Qu.:586.9   3rd Qu.:122.08   3rd Qu.: 81.22
##  Max.   :822.2   Max.   :184.90   Max.   :108.45

apply(d.dist, 2, mean)
str(d.dist) # 12 var
##- 'data.frame':   26 obs. of  12 variables:
##-  $ X25..Length  : num  10.4 11.9 11.9 11.3 10.6 ...
##-  $ Nr.of.Slices.: int  306 349 349 333 311 280 283 284 301 333 ...
##-  $ TV           : num  1100 1627 1684 1458 1202 ...
##-  $ BV           : num  245 428 443 346 307 ...
##-  $ BMC          : num  0.439 0.766 0.793 0.619 0.55 0.327 0.461 0.379 0.372 0.582 ...
##-  $ BS           : num  1897 2730 2905 3026 2120 ...
##-  $ BV.TV.       : num  22.3 26.3 26.3 23.7 25.6 ...
##-  $ BS.TV        : num  1.73 1.68 1.73 2.08 1.76 ...
##-  $ BS.BV        : num  7.74 6.38 6.55 8.75 6.90 ...
##-  $ dS.dr        : num  -981 -3065 -3589 -2211 -1905 ...
##-  $ TBPf         : num  -0.517 -1.123 -1.236 -0.731 -0.898 ...
##-  $ SMI          : num  -0.401 -1.056 -1.132 -0.501 -0.782 ...

str(d.spher) # 13 var
##- 'data.frame':   26 obs. of  13 variables:
##-  $ TV    : num  103 192 169 151 110 ...
##-  $ BV    : num  28.7 55.0 52.3 38.3 33.4 ...
##-  $ BS    : num  397 667 718 668 455 ...
##-  $ BV.TV.: num  27.9 28.6 31.0 25.3 30.4 ...
##-  $ BS.TV : num  3.86 3.46 4.25 4.41 4.15 ...
##-  $ BS.BV : num  13.8 12.1 13.7 17.4 13.6 ...
##-  $ Tb.N  : num  1.93 1.73 2.13 2.21 2.08 ...
##-  $ Tb.Th : num  0.145 0.165 0.146 0.115 0.147 0.122 0.157 0.151 0.123 0.186 ...
##-  $ Tb.Sp : num  0.374 0.412 0.325 0.338 0.335 0.347 0.307 0.369 0.535 0.406 ...
##-  $ DA    : num  1.23 1.25 1.30 1.31 1.20 ...
##-  $ X.H1. : num  0.471 0.528 0.424 0.39 0.445 0.425 0.429 0.461 0.57 0.541 ...
##-  $ X.H2. : num  0.579 0.658 0.551 0.512 0.532 0.53 0.513 0.577 0.763 0.674 ...
##-  $ X.H3. : num  0.508 0.552 0.447 0.468 0.469 0.456 0.448 0.523 0.66 0.563 ...

## merge() ---- useful in general.
## ----- Here it is almost not needed, since we always have the same 26 obs.
##       in the *identical* order ---> hence we could just use cbind(.)
identical(row.names(d.spher),  row.names(d.dist))
identical(row.names(d.anthro), row.names(d.dist))
identical(row.names(d.mech),   row.names(d.dist))

rnMerge <- function(d1, d2, ...) {
    ## Purpose: merge by row.names -- and turn the "Row.names" column back into row names
    ## ----------------------------------------------------------------------
    ## Arguments: d1, d2: two data frames with common row names
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler, Date: 13 Mar 2007
    d <- merge(d1, d2, by = "row.names", ...)
    ## set row names and omit them as a variable:
    rownames(d) <- d[, 1]
    d[, -1]
}

str(d.A.M <- rnMerge(d.anthro, d.mech))

## Martin Stauber's :
d.distal    <- rnMerge(d.A.M, d.dist)
d.spherical <- rnMerge(d.A.M, d.spher)
str(d.distal)    # 24 variables
str(d.spherical) # 25 variables

## All of them:
d.All <- rnMerge(d.distal, d.spher, suffixes = c("_d", "_s"))
str(d.All) # 37 variables

## Simplify the variable names:
names(d.All) <- gsub("\\.Diameter$", ".dm",  # shorten ..
                gsub("\\.$", "",             # omit trailing "."
                gsub("\\.([._])", "\\1",     # substitute ".." by "." and "._" by "_"
                     names(d.All))))
names(d.All) <- sub("Nr.of.", "Nr_", names(d.All)) # another shortening
names(d.All) <- sub("^Ap", "AP",     names(d.All)) # <<-- fix typo in file

## Ok, make the above into a simple function :
mySub <- function(nms) {
    sub("Nr.of.", "Nr_",            # shorten
    sub("^Ap", "AP",                # <<-- fix typo in file
    gsub("\\.Diameter$", ".dm",     # shorten ..
    gsub("\\.$", "",                # omit trailing "."
    gsub("\\.([._])", "\\1",        # substitute ".." by "." and "._" by "_"
         nms)))))
}
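## A tiny check of what mySub() actually does -- the input names below are
## hypothetical examples, chosen only to exercise each substitution rule:
mySub(c("Ap.Body.Diameter", "Nr.of.Slices.", "BV.TV._d", "Max..Length"))
## should give:  "AP.Body.dm"  "Nr_Slices"  "BV.TV_d"  "Max.Length"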
## All variables but the "mechanical" ones:
d.X <- rnMerge(d.anthro, rnMerge(d.dist, d.spher, suffixes = c("_d", "_s")))
names(d.X) <- mySub(names(d.X))
str(d.X)

###------------- end of the read-etc.R part ------------------------------

##------------------------------------------------------------------------------#
## Descriptive statistics.                                                       #
##------------------------------------------------------------------------------#

## Numerical summaries ...
sapply(d.All, mean)       # warning: cannot take the mean of 'SEX'
sapply(d.All[, -1], mean) # ok, but this is better
t(sapply(d.All[, -1], quantile))
## or even better {also "table"s 'SEX'}:
summary(d.All)
## This gives a big table (maybe nicer)
sapply(d.All[, -1], summary)

dim(d.All[ d.All[,"SEX"] == "f" , ])
## slightly more readable :
dim(subset(d.All, SEX == "f"))

## women <-> men
summary(subset(d.All, SEX == "f"))
summary(subset(d.All, SEX == "m"))
## or "nicer"
by(d.All[, -1], d.All[,"SEX"], mean)
by(d.All[, -1], d.All[,"SEX"], sd)   ## S[tandard] D[eviation]
if(FALSE) ## This fails, since median() cannot work here :
    by(d.All[, -1], d.All[,"SEX"], median)
by(d.All[, -1], d.All[,"SEX"], mean, trim = 0.10) # 10%-trimmed mean
by(d.All[, -1], d.All[,"SEX"], summary)

cor(d.All[,-1]) ## correlation matrix {dropping "SEX"}
## or
Cm <- cor(data.matrix(d.All))
Cm
## and much more; but that is not really exciting;

### ==========
### Graphics instead
### ==========

## Visualize the matrix of pairwise correlations:
image(Cm)
## or even nicer (!)
heatmap(Cm, symm = TRUE) ##-- after variable re-ordering via clustering
title("heatmap(cor( data.matrix( d.All ) ), symm = TRUE)")
## dev.print(command = "prt.Cfolie")

## do it again, storing the result (including the dendrogram):
hmCor <- heatmap(Cm, symm = TRUE, keep.dendro = TRUE)
## what have we got? use str():
str(hmCor, max = 2)
## The variable ordering as chosen by the hierarchical clustering algorithm:
names(d.All)[ hmCor$rowInd ]
## The clustering dendrogram (tree):
hmCor$Rowv
plot(hmCor$Rowv)
plot(hmCor$Rowv, horiz = TRUE)
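## A small sketch of going one step further with the clustering: cut the tree
## into groups of "similar" variables (the choice k = 4 is arbitrary, purely
## for illustration):
hcCor <- as.hclust(hmCor$Rowv)  # dendrogram -> 'hclust'
grp   <- cutree(hcCor, k = 4)
split(names(grp), grp)          # variable names, grouped by cluster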
plot(Fu ~ SEX, data = d.All, main = "Boxplots", sub = "this is a subtitle")
plot(Fu ~ SEX, data = d.All, main = "Boxplots", notch = TRUE)

### More Plots:

## All pairwise scatter plots: really too many here
pairs(d.All, gap = 0, cex = 0.5)

## Plotting selected variables:
d.sub1 <- d.All[, c("BV.TV_d", "BS.BV_d", "TBPf", "SMI",
                    ## "Mechanical" ones:
                    "Fu", "Fy", "S") ]

### Taken from help(pairs) :
## put (absolute) correlations on the upper panels,
## with size proportional to the correlations.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- abs(cor(x, y))
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste(prefix, txt, sep = "")
    if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
    text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(d.sub1, lower.panel = panel.smooth, upper.panel = panel.cor)
##                                        ^^^^^^^^^

## Now use sex for colour coding :
pairs(d.sub1, panel = panel.smooth, col = as.numeric(d.All[,"SEX"]))

## Comparison between Distal_25% and VOI :
(i.ds <- grep("_[ds]$", names(d.All))) # column numbers
names(d.All)[i.ds]
## sort such that the 'pairs' are contiguous:
matrix(names(d.All)[i.ds], ncol = 2)
## transpose and use as a vector:
(n.ds <- as.vector(t(matrix(names(d.All)[i.ds], ncol = 2))))
d.vs.s <- d.All[ , n.ds]
pairs(d.vs.s, panel = panel.smooth, gap = .01, col = as.numeric(d.All[,"SEX"]))

hm.ds <- heatmap(cor(d.vs.s, method = "spearman"), symm = TRUE, keep.dendro = TRUE)
str(hm.ds, max = 2)
plot(hm.ds $ Rowv, main = "hierarchical clustering of VOI vs Distal 25%")

library(sfsmisc) # for mult.fig(), slightly simpler/nicer than par(mfrow = .)

rr <- mult.fig(ncol(d.anthro), main = "d.anthro data")
for(i in seq(along = d.anthro)) {
    v  <- d.anthro[, i]
    nn <- names(d.anthro)[i]
    if(is.numeric(v))
        ## boxplot(v, main = nn)
        boxplot(split(v, d.anthro[,"SEX"]), main = nn)
    else plot(table(v), main = nn, xlab = '', col = 2)
}

## Make the above into a simple function
plotUni <- function(dat, numericplot = boxplot(v, main = nn),
                    namdat = deparse(substitute(dat)))
{
    ## Purpose: univariate plots for all variables in a data frame
    ## ----------------------------------------------------------------------
    ## Arguments: data frame 'dat' which must have a "SEX" variable
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler, Date: Mar 2007
    rr <- sfsmisc::mult.fig(ncol(dat), main = paste(namdat, " data"))
    ## reset the graphic window setup on exiting this function:
    on.exit(par(rr$old.par))
    for(i in seq(along = dat)) {
        v  <- dat[, i]
        nn <- names(dat)[i]
        if(is.numeric(v))
            eval(substitute(numericplot))
        else plot(table(v), main = nn, xlab = '', col = 2)
    }
    invisible()
}
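## How does the 'numericplot' argument work?  plotUni() never evaluates it as a
## value; eval(substitute(numericplot)) recovers the *expression* the caller
## wrote and re-evaluates it for each column, where 'v' and 'nn' exist.  A toy
## sketch of the same idea (hypothetical helper, only for illustration):
evalPerColumn <- function(dat, expr) {
    for(i in seq(along = dat)) {
        v <- dat[, i]                   # 'expr' may refer to 'v'
        print(eval(substitute(expr)))   # evaluated in this function's frame
    }
}
evalPerColumn(d.mech, mean(v))          # column means
evalPerColumn(d.mech, quantile(v, 0.9)) # column-wise 90% quantiles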
plotUni(d.anthro) # default: simple boxplot
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn))
## again, now with "significance notches"
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn, notch = TRUE))

## For the exercise solution
setwd("~/R/Meetings-Kurse-etc/2009-Biomechanik")
pdf.latex("boxplot_anthro.pdf", width = 8, height = 7)
plotUni(d.anthro, boxplot(split(v, dat[,"SEX"]), main = nn))
pdf.end()

## Kernel density
plotUni(d.anthro, plot(density(v), main = nn))

plotUni(d.mech)
plotUni(d.mech, boxplot(split(v, d.anthro[,"SEX"]), main = nn))
plotUni(d.mech, plot(density(v), main = nn))
plotUni(d.anthro, hist(v, main = nn))

gender <- as.numeric(d.All[,"SEX"])
(gCol <- c("blue", "tomato")[gender])
plotUni(d.anthro, scatter.smooth(v, d.All[,"Fu"], ylab = "Fu",
                                 col = gCol, pch = 1 + gender,
                                 main = paste("Fu vs.", nn)))

## only the women :
plotUni(subset(d.mech, subset = d.anthro[,"SEX"] == "f"),
        plot(density(v), main = nn))

## both: be sophisticated:
plotUni(d.mech, {
    L <- split(v, d.anthro[,"SEX"])
    plot(density(v), main = nn, lty = 2, col = "gray")
    for(n in 1:2) lines(density(L[[n]]), col = n+1)
})

## y-scale : a bit more refined still
plotUni(d.mech, {
    L  <- split(v, d.anthro[,"SEX"])
    dL <- lapply(L, density)
    dv <- density(v)
    plot(dv, main = nn, lty = 2, col = "gray",
         ylim = c(0, max(dv$y, dL[[1]]$y, dL[[2]]$y)))
    for(n in 1:2)
        lines(density(L[[n]]), col = n+1)
    legend("topright", c("total", "f emale", "m ale"),
           col = c("gray", palette()[2:3]), lty = c(2,1,1), inset = 0.02)
})

p.uniDens <- function(dat) {
    ## Purpose: plot univariate (and male / female) densities
    ## ----------------------------------------------------------------------
    ## Arguments: data frame
    ## ----------------------------------------------------------------------
    ## Author: Martin Maechler
    plotUni(dat, {
        L  <- split(v, d.anthro[,"SEX"])
        dL <- lapply(L, density)
        dv <- density(v)
        plot(dv, main = nn, lty = 2, col = "gray",
             ylim = c(0, max(dv$y, dL[[1]]$y, dL[[2]]$y)))
        for(n in 1:2)
            lines(density(L[[n]]), col = n+1)
        legend("topright", c("total", "f emale", "m ale"),
               col = c("gray", palette()[2:3]), lty = c(2,1,1), inset = 0.02)
    },
    namdat = deparse(substitute(dat))) # trick to get the correct title
}

p.uniDens(d.mech)
p.uniDens(d.anthro)
p.uniDens(d.spher)
p.uniDens(d.dist)

###== Now go for regression: ===================================================

## use all variables for the prediction of Fu, apart from Fy and S :
str(dat <- subset(d.All, select = c(-Fy, -S)))
## also omit the diameters and "lengths":
dim(dat2 <- dat[, -grep("\\.(dm|Length)$", names(dat))]) # still 27 variables

m1 <- lm(Fu ~ . , data = dat2) # perfect fit: we have too many variables!
m1

## Now try to add them stepwise
m0 <- lm(Fu ~ 1, data = dat)             # fit the mean only
summary(m1 <- lm(Fu ~ SEX, data = dat))  # fit "SEX + intercept"
confint(m1)

## the cumbersome way:
(ch.terms <- paste("Fu ~", paste(names(dat)[names(dat) != "Fu"], collapse = "+")))
step(m0, scope = as.formula(ch.terms))
## or more elegantly
(mst <- step(m0, scope = terms(Fu ~ ., data = dat)))
##------------------------------------------------------
library(MASS) # a "recommended" (and hence 'standard') package
stepAIC(m0, scope = terms(Fu ~ ., data = dat)) # gives the same here

summary(mst) # only one strongly significant term
## Diagnostic plots
plot(mst)
plot(residuals(mst) ~ dat[,"dS.dr"], col = as.numeric(dat[,"SEX"]))
abline(h = 0, lty = 3, col = "gray") # 0-line
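## A small sketch of leave-one-out cross-validation for the step()-selected
## formula, as a rough check of its prediction error (note: the variable
## selection itself is not re-done inside the loop, so this is still somewhat
## optimistic; the helper name and the reference model are only illustrative):
loocvRMSE <- function(form, data) {
    n   <- nrow(data)
    err <- numeric(n)
    for(i in 1:n) {
        fit    <- lm(form, data = data[-i, ])
        err[i] <- data[i, "Fu"] - predict(fit, newdata = data[i, , drop = FALSE])
    }
    sqrt(mean(err^2))
}
loocvRMSE(formula(mst), dat) # the stepwise-selected model
loocvRMSE(Fu ~ SEX,     dat) # a simple reference model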
## Alternatively, the 'leaps' extension package tries "all possible subsets":
library(leaps)
ma1 <- regsubsets(Fu ~ ., data = dat, nvmax = 20) ## takes a little while (< 1 min)
(sm1 <- summary(ma1))
myTit <- "regsubsets(Fu ~ ., data = dat)"
plot(ma1, main = myTit)
plot(ma1, main = myTit, scale = "adjr2")
## Plot the "criterion" against the number of variables
(nvar <- as.numeric(row.names(sm1 $ which)))
plot(nvar, sm1 $ adjr2, xlab = "#{var} ( = p - 1 )", main = myTit)
plot(nvar, sm1 $ bic,   xlab = "#{var} ( = p - 1 )", main = myTit)
## --> both these plots suggest that all variables are useful ... hmm, funny
colSums(sm1$which)

## Now record the 3 best variable subsets for each size (up to size 12):
ma3 <- regsubsets(Fu ~ ., data = dat, nvmax = 12, nbest = 3)
summary(ma3)
plot(ma3, main = myTit)
plot(ma3, scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

sm3 <- summary(ma3, matrix.logical = TRUE)
(nvar <- as.numeric(row.names(sm3 $ which)))
plot(nvar, sm3 $ adjr2, xlab = "#{var} ( = p - 1 )", main = myTit)

cbind(sort( colSums(sm3$which), decreasing = TRUE ))
##             [,1]
## (Intercept)   36
## Tb.Th         21
## TV_s          19
## BV_s          19
## BS.BV_s       18
## X25.Length    14
## BS.TV_d       13
## dS.dr         12
## Tb.Sp         11
## X.H1          11
## Max.Length    10
## BS_d          10
## Nr_Slices      9
## SEXm           8
## TV_d           7
## ML.Base.dm     6
## X.H2           6
## AP.Body.dm     5
## X.H3           5
## ML.Body.dm     4
## TBPf           4
## DA             4
## AGE            3
## AP.Base.dm     3
## SMI            3
## BV_d           2
## BMC            2
## BV.TV_s        2
## BS.BV_d        1
## BS_s           1
## Tb.N           1
## ML.Head.dm     0
## AP.Head.dm     0
## BV.TV_d        0
## BS.TV_s        0

## Without the meaningless Nr_Slices
ma <- regsubsets(Fu ~ ., data = dat, force.out = c("Nr_Slices"),
                 nvmax = 12, nbest = 3)
summary(ma)
plot(ma, main = myTit)
plot(ma, scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

## Without the meaningless Nr_Slices -- and without dS.dr -- what else becomes important?
ma. <- regsubsets(Fu ~ ., data = dat, force.out = c("dS.dr", "Nr_Slices"),
                  nvmax = 12, nbest = 3)
summary(ma.)
plot(ma., main = myTit)
plot(ma., scale = "adjr2", main = myTit)
## improve the plot.regsubsets() display (here with nvmax = 12, nbest = 3):
abline(h = .5 + 3*(1:12), col = "gray", lty = 3)
axis  (4, at = .5 + 3*(1:12 - 1/2), lab = 1:12, las = 2)

###------------ Further modelling approaches instead of linear regression ------------

library(rpart)
mrp <- rpart(Fu ~ . , data = subset(d.All, select = c(-Fy, -S)))
mrp ## --> again uses dS.dr exclusively

##--- Another alternative:
## Use --- the lasso --- to find relevant variables;
## even for all 2-way interactions:
library(lars)
## lars() needs a traditional X matrix :
dat <- subset(d.All, select = c(-Fy, -S))
ff <- Fu ~ .
dX <- model.matrix(ff, model.frame(ff, dat))
str(dX)
dX[, 1:6] # the first 6 variables

## All possible 2-way interactions:
dX2 <- model.matrix(Fu ~ .^2, model.frame(Fu ~ .^2, dat))
str(dX2) ## 26 * 596 (!!)

l1 <- lars(dX2, y = dat[,"Fu"], use.Gram = FALSE)
plot(l1) ## oops -- use Martin's improved plot --> ./lars-improved-MM.R

if(FALSE) ## Or Nicolai's
    library(relaxo)
## but that is definitely *NOT* yet on the students' machines
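## A hedged follow-up sketch: look at which of the ~600 terms the lasso picks up
## early, by extracting coefficients at a small fraction of the full L1-norm
## path (the value s = 0.05 is an arbitrary choice for illustration):
beta <- coef(l1, s = 0.05, mode = "fraction")
beta[beta != 0] # the few terms that enter the path first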