[R] survSplit: further exploration and related topics

Thu Nov 11 12:04:29 CET 2004

While waiting for R.2.0.1 or 2.1, for you  who need function for this 
survival-splitting business, as I do,   this 'survcut' function below 
might be helpful.
It is not an elegant nor efficient function but it works, at least for 
some examples below.

data(aml)
m1<-coxph(Surv(time,status)~x,data=aml)

#unfortunately, start time must be created first
#but not a big deal I believe
aml$t0<-0

#then try :

d1<-survcut(cut=c(5,10,50),c("t0","time","status"),data=aml)
coxph(Surv(t0,time,status)~x,data=d1)
d2<-survcut(cut=c(9,12,40),c("t0","time","status"),data=d1)
coxph(Surv(t0,time,status)~x,data=d2)
d3<-survcut(cut=c(9,12,40),c("t0","time","status"),data=d2)
coxph(Surv(t0,time,status)~x,data=d3)

# splitting at the risk times
# useful for coxph with time-dependent covariate
d4<-survcut(cut=unique(aml$time[aml$status==1]),c("t0","time","status"),data=d2)
coxph(Surv(t0,time,status)~x,data=d4)

# "random" splitting
dr<-survcut(cut=runif(rpois(1,4),0,100),c("t0","time","status"),data=d1)
dim(dr)
coxph(Surv(t0,time,status)~x,data=dr)

# "per unit time" splitting
d5<-survcut(cut=0:161,c("t0","time","status"),data=d4)
coxph(Surv(t0,time,status)~x,data=d5)

#### the code --------------------------------------------------
survcut<-function (cuts, surv = c("t0", "t1", "event"), data,
   id = NULL, addv = TRUE, sq = FALSE)
# cuts: vector of timepoints to cut at
# surv: a Surv like input
# id :  (optional) variable name, if the data has an id variable
#advv :  (optional) include other variables in the output?
#sq: (optional) sequence of splitting , may be similar with episode in 
survSplit
{
   #cutting one counting process survival line
    cutting <- function(x, a, s) {
        x <- as.numeric(x)
        tmp <- sort(c(x[2:3], a[(x[2] < a) & (x[3] > a)]))
        n <- length(tmp)
        idx <- rep(x[1], n - 1)
        t0 <- tmp[1:(n - 1)]
        t1 <- tmp[2:n]
        event <- rep(0, n - 1)
        event[n - 1] <- x[4]
        if (s)
            data.frame(idx, t0, t1, event, s = 1:(n - 1))
        else data.frame(idx, t0, t1, event)
    }

  #Note that this database-joint-like function is similar to 'merge'
  #I made this before I knew 'merge' is available in R
  #merge can substitute this function, I believe
  #x: the main data (all rows will be selected)
  #variable: variable name (character vector) from refdat
  #key: key variable has to be exist in x and refdat
  #refdat: reference data, must have unique 'key' id
  addvars<-function (x, variable, key = "id", refdat)
  {
    nama <- names(refdat)
    if (is.numeric(variable)) {
        if (!prod(variable %in% 1:length(nama)))
            stop("variable does not exist")
        newname <- names(refdat)[variable]
    }
    else {
        if (!prod(variable %in% nama))
            stop("variable does not exist")
        newname <- variable
    }
    if (!(length(unique(refdat[, key])) == length(refdat[, key])))
        stop("key must be unique")
    newvar <- refdat[, variable, drop = FALSE]
    newvar <- newvar[match(x[, key], refdat[, key]), , drop = FALSE]
    x <- cbind(x, newvar)
    return(x)
}

    vars <- names(data)
    if (sum(surv %in% vars) != 3)
        stop("one or more surv variables do not exist")
    idx <- 1:NROW(data)
    data<- cbind(idx, data)
    t0 <- data[, surv[1]]
    t1 <- data[, surv[2]]
    event <- data[, surv[3]]
    x <- data.frame(idx, t0, t1, event)
    out <- by(x, list(1:NROW(x)), cutting, a = cuts, s = sq)
    out <- do.call("rbind", out)
    rownames(out) <- 1:NROW(out)
    names(out)[2:4] <- surv
    if (addv)
        out <- addvars(out, setdiff(names(data), c(surv, "idx")),
            key = "idx", refdat = data)
    out <- out[, -1]
    if (!is.null(id)) {
        slct <- c(id, setdiff(names(out), id))
        out <- subset(out, select = slct)
    }
    out
}

-----------------------------
/Danar