[R] Interperting results of glmnet and coxph plot, Brier score and Harrel's C-Index - am I doing something wrong ???

E Joffe ejoffe at hotmail.com
Sat Sep 28 11:39:44 CEST 2013


Hi all,

I am using COX LASSO (glmnet / coxnet) regression to analyze a dataset of
394 obs. / 268 vars.
I use the following procedure:
1.	Construct a coxnet on the entire dataset (by cv.glmnet) 
2.	Pick the significant features by selecting the non-zero coefficient
under the best lambda selected by the model 
3.	Build a coxph model with bi-directional stepwise feature selection
limited to the coxnet selected features.
	
To validate the model I use both Brier score (library=peperr) and Harrel's
C-Index (library=Hmisc) with a bootstrap of 50 iterations.


MY QUESTION :  I am getting fairly good C-Index (0.76) and Brier (0.07)
values for the models however per the coxnet the %Dev explained by the model
is at best 0.27 and when I plot the survfit of the coxph the plotted
confidence interval is very large. 
What am I missing here ?

%DEV=27%
 


Brier score - 0.07  ($ipec.coxglmnet -> [1] 7.24)
C-Index - 0.76 ($cIndex -> [1] 0.763)



DATA: [Private Health Information - can't publish] 394 obs./268 vars.

CODE (need to define a dataset with 'time' and 'status' variables):

library("survival")
library ("glmnet")
library ("c060")
library ("peperr")
library ("Hmisc")

    #creat Y (survival matrix) for glmnet
    surv_obj <- Surv(dataset$time,dataset$status) 
    
      
    ## tranform categorical variables into binary variables with dummy for
dataset
    predict_matrix <- model.matrix(~ ., data=dataset, 
                                   contrasts.arg = lapply
(dataset[,sapply(dataset, is.factor)], contrasts))
    
    ## remove the statu/time variables from the predictor matrix (x) for
glmnet
    predict_matrix <- subset (predict_matrix, select=c(-time,-status))
    
    ## create a glmnet cox object using lasso regularization and cross
validation
    glmnet.cv <- cv.glmnet (predict_matrix, surv_obj, family="cox")
    
    ## get the glmnet model on the full dataset
    glmnet.obj <- glmnet.cv$glmnet.fit
    
    # find lambda index for the models with least partial likelihood
deviance (by cv.glmnet) 
    optimal.lambda <- glmnet.cv$lambda.min    # For a more parsimoneous
model use lambda.1se     
    lambda.index <- which(glmnet.obj$lambda==optimal.lambda) 
    
    
    # take beta for optimal lambda 
    optimal.beta  <- glmnet.obj$beta[,lambda.index] 
    
    # find non zero beta coef 
    nonzero.coef <- abs(optimal.beta)>0 
    selectedBeta <- optimal.beta[nonzero.coef] 
    
    # take only covariates for which beta is not zero 
    selectedVar   <- predict_matrix[,nonzero.coef] 
    
    # create a dataframe for trainSet with time, status and selected
variables in binary representation for evaluation in pec
    reformat_dataSet <- as.data.frame(cbind(surv_obj,selectedVar))
    
    # glmnet.cox only with meaningful features selected by stepwise
bidirectional AIC feature selection 
    glmnet.cox.meaningful <- step(coxph(Surv(time,status) ~
.,data=reformat_dataSet),direction="both")
    
         
  
 
##--------------------------------------------------------------------------
-----------------------------
    ##                                    MODEL PERFORMANCE
 
##--------------------------------------------------------------------------
-----------------------------
    ## 
    
  
    ## Calculate the Brier score - pec does its own bootstrap so this
function runs on i=51 (i.e., whole trainset)                         

        ## Brier score calculation to cox-glmnet
        peperr.coxglmnet <- peperr(response=surv_obj, x=selectedVarCox,
                                fit.fun=fit.coxph, load.all=TRUE,
                                indices=resample.indices(n=nrow(surv_obj),
method="boot", sample.n=50))
        
        # Get error predictions from peperr
        prederr.coxglmnet <- perr(peperr.coxglmnet)
        
        # Integrated prediction error Brier score calculation
        ipec.coxglmnet<-ipec(prederr.coxglmnet,
eval.times=peperr.coxglmnet$attribute, response=surv_obj)

    
  ## C-Index calculation 50 iter bootstrapping
  
  for (i in 1:50){
        print (paste("Iteration:",i))
        train <- sample(1:nrow(dataset), nrow(dataset), replace = TRUE) ##
random sampling with replacement
        # create a dataframe for trainSet with time, status and selected
variables in binary representation for evaluation in pec
        reformat_trainSet <- reformat_dataSet [train,]
        
        
        # glmnet.cox only with meaningful features selected by stepwise
bidirectional AIC feature selection 
        glmnet.cox.meaningful.test <- step(coxph(Surv(time,status) ~
.,data=reformat_dataSet),direction="both")
        
        selectedVarCox   <-
predict_matrix[,attr(glmnet.cox.meaningful.test$terms,"term.labels")] 
        reformat_testSet <- as.data.frame(cbind(surv_obj,selectedVarCox))
        reformat_testSet <- reformat_dataSet [-train,]
      
        
#     compute c-index (Harrell's) for cox-glmnet models
        if (is.null(glmnet.cox.meaningful)){
          cIndexCoxglmnet <- c(cIndexCoxglmnet,NULL)
        }else{
          cIndexCoxglmnet <- c(cIndexCoxglmnet,
1-rcorr.cens(predict(glmnet.cox.meaningful,
reformat_testSet),Surv(reformat_testSet$time,reformat_testSet$status))[1])
        }
  }
  
  #Get average C-Index
  cIndex<- mean (unlist(cIndexCoxglmnet),rm.na=TRUE)
  
  #create a list of all the objects generated
 
assign(name,c(eval(parse(text=name)),glmnet.cv=list(glmnet.cv),glmnet.obj=li
st(glmnet.obj),
 
selectedVar=list(colnames(selectedVar)),glmnet.cox=list(glmnet.cox),
 
glmnet.cox.meaningful=list(glmnet.cox.meaningful),ipec.coxglmnet=list(ipec.c
oxglmnet),
                cIndex=cIndex))
    
  # save image of the workspace after each iteration
    save.image("final_subgroup_analysissubgroup_analysis.RData")
    



More information about the R-help mailing list