###################################################################################
##                                                  Fit  Regression Model To Signal                                                            ##
###################################################################################         
## function:  -
###################################################################################
## input:  phase-unfolded text file signal                                                                                                   ##
##                                                                                                                                                              ##
## output:  regression coefficients                                                                                                                           ##
##                                                                                                                                                                                ##
################################################################################### 

##....................  function FitRegModel  ........................##

FitRegModel <- function (filname,InDir,OutDir,signum) {

##set returned flag
 if(!file.exists(filname)) {
    cat("\n\n ***  File: ",flname," not found! ! Exit current Script ! *** \n\n")
    cat ("\n FitRegModel: success = ",success,"\n")
    return(NULL)
 }

##read pure file 
 xx <- read.table(filname,skip = 1, sep = " ") 
 names(xx) <- c("samptime","sampamp","samphase","cycle","patient")

##predicted variable
 y <- xx[,"sampamp"]

##generate regressors
 t1 <-    xx[,"samptime"]
 t2 <-    I(t1^2)
 t3 <-    I(t1^3)
 t4 <-    I(t1^4)
 t5 <-    I(t1^5) 
 t6 <-    I(t1^6)
 t7 <-    I(t1^7)
 t8 <-    I(t1^8)
 t9 <-    I(t1^9)
 t10 <-  I(t1^10)
 p1 <-   xx[,"samphase"]
 p2 <-   I(p1^2)
 p3 <-   I(p1^3)
 p4 <-   I(p1^4)
 p5 <-   I(p1^5) 
 p6 <-   I(p1^6)
 p7 <-   I(p1^7)
 p8 <-   I(p1^8)
 p9 <-   I(p1^9)
 p10 <- I(p1^10)

##place the data into a data frame
 my.df <- data.frame(y,t1,t2,t3,t4,t5,t6,t7,t8,t9,t10, p1,p2,p3,p4,p5,p6,p7,p8,p9,p10 )

##fit the full model
 big.lm <- lm(y ~ ., data = my.df)
 summary(big.lm)

##step wise procedure
 step.lm <- step(big.lm)  
 step.lm.sum <- summary(step.lm)
 
##get  number of terms still not significant excluding intercept 
 step.lm.sum$coefficients <- step.lm.sum$coefficients[-1,]        #drop the intercept    
 nleft <- which(step.lm.sum$coefficients[,"Pr(>|t|)"] >= 0.05)

##if no unsignificant terms present  return step-output coefficients
 if(length(nleft) == 0)  {
    cat("\n\n All Step Output Coefficients Significant for Signal: ",signum,"\n\n") 
    return (step.lm.sum$coefficients)
 }

##if not significant terms still present create a new data frame with only the variables selected by step
 selected.variables <- names(step.lm$coefficients)[-1]
 selected.variables
 my.df.smaller <- my.df[,c("y",selected.variables)]

##  leaps
## Based on Faraway, Linear Models with R, page 127 et seq.
 library(leaps) 
 b <- regsubsets(y ~ . ,nvmax =  length(selected.variables), data = my.df.smaller)
 summary(b)
 rs <- summary(b) 
 no.parameters <-  seq_along(rs$cp) + 1

##set plot window to accommodate 3 graphs (1 row, 3 columns)
# x11(width = 14, height = 10)
# par(cex.axis=2, cex.lab=2,mfrow = c(1,3))

##plot adjusted R^2
 #plot(no.parameters, rs$adjr2, xlab = "Number of parameters", ylab = "Adjusted R-square",font.lab=2,font.axis=2)

##work with Cp (unlike Faraway's example)
##we desire models with Cp around or less than the number of parameters
# plot(no.parameters  , rs$cp, xlab = "Number of parameters", ylab = "Cp Statistics",font.lab=2,font.axis=2)
# abline(0, 1)
# plot(no.parameters  , rs$cp - no.parameters, xlab = "Number of parameters p", ylab = "Cp Statistics - p",font.lab=2,font.axis=2) 
# abline(h = 0)

##wait for mouse input (consent) to remove drawing
# locator()
# graphics.off()

##find the smallest model with Cp <= number-parameters
 chosen.model <- min(which(rs$cp <= no.parameters))
 chosen.model

##see which variables are included
 reduced.variables <- rs$which[chosen.model, ]
 which.variables <- which(reduced.variables)

##remove intercept
 which.variables <- which.variables[which.variables > 1]
 which.variables

##now reduce the data frame keeping the y variable (which is in column 1)
 reduced.df <- my.df.smaller[,c(1,which.variables)]

## check-point
 names(my.df.smaller)      #previous data frame
 names(reduced.df)           #new reduced data frame

##refit using the reduced data frame
 reduced.lm <- lm(y ~ ., data = reduced.df)
 reduced.lm.sum <- summary(reduced.lm)

##get  number of terms still not significant excluding intercept
 reduced.lm.sum$coefficients <- reduced.lm.sum$coefficients[-1,]        #drop the intercept   
 nleft <- which(reduced.lm.sum$coefficients[,"Pr(>|t|)"] >= 0.05)

##if no unsignificant terms present  return step-output coefficients
 if(length(nleft) == 0)  {
    cat("\n\n All Regsubsets Output Coefficients Significant for Signal: ",signum,"\n\n") 
    return (reduced.lm.sum$coefficients)
 }

##remove unsignificant coefficients 
 while (length(nleft) > 0) { 
     names(reduced.df)  
     nn <- nleft[[1]]  +1       # account for the "y" in position 1
     reduced.df <- reduced.df[,-nn]
     names(reduced.df)  
     reduced.lm <- lm(y ~ ., data = reduced.df)
     reduced.lm.sum <- summary(reduced.lm)
     reduced.lm.sum$coefficients <- reduced.lm.sum$coefficients[-1,]        #drop the intercept
     nleft <- which(reduced.lm.sum$coefficients[,"Pr(>|t|)"] >= 0.05)
 }
 
##return coefficients
 return (reduced.lm.sum$coefficients) 
}


##.............................  Main  .................................##

 OutDir <- "/Users/mauede/Breathing-Curves-Dir/Modeling-Dir/Regression-Models-Dir"
 InDir <- "/Users/mauede/Breathing-Curves-Dir/Modeling-Dir/Pure-Signals-Dir/Mixed"  
 setwd(InDir) 
 cat("\n")
 print(list.files(pattern = "[0-1,a-z,A-Z, ,_]*.txt"))
 cat("\n\n")
 nms <- list.files(pattern="[0-1,a-z,A-Z, ,_]*.txt") 
 if(is.null(nms)) {
    stop("\n\n *** Pure SIgnalss list empty. Exit current process. *** \n\n") 
 }

##set up matrix to save regression model coefficients: 
##columns(1:10) contain the coefficients of the time powers
##columns(11:20) contain the coefficients of the phase powers  
##column(21) contain the patient' s ID
 rg <- matrix(nrow=length(nms),ncol=21)
 colnames(rg) = c("t1","t2","t3","t4","t5","t6","t7","t8","t9","t10","p1","p2","p3","p4","p5","p6","p7","p8","p9","p10","Id")

##initialize regression matrix to zero
 for (m in 1:ncol(rg)) {
     rg[,m] <- rep(0,nrow(rg))
 }

 ns <- 0       #initialize index

 for(i in 1:length(nms)) {
    cat("\n i:",i, " nms[i]:",nms[i], "\n")
    insig <- NULL
    insig = strsplit(nms[i]," ")[[1]][1]
    result <- FitRegModel(nms[i],InDir,OutDir,insig)
    cat("\n result:  ",result,"\n\n") 
    if(length(result) == 0) {
        cat("\n\n Signal: ", insig, " not processed ! \n\n")
       next
    } 
     cat("\n\n Signal: ", insig, " SUCCESS ! \n\n")
     ns <- ns +1
     for (k in 1: nrow(result)) {
         for (j in 1:ncol(rg)) {
             if (rownames(result)[k] == colnames(rg)[j]) {
                  rg[ns,j] <- result[k,"Estimate"]
             }
          }
     }     
     rg [ns,"Id"] <- insig 
 }

##save regression matrix 
 setwd(OutDir)
 write.table(rg,file="RegressionCoeffMatrix",quote=FALSE,sep = " ",row.names=FALSE,col.names=colnames(rg))

 ##create distance matrix between regression coefficients (ignore patient's ID)
 rgdist <- dist(rg[,1:20])
 
##generate clusters from distance matrix
 h <- hclust(rgdist)
 plot(h)