[R] merge by time, certain value if 5 min before and after an "event"

Fri Oct 3 15:05:39 CEST 2014

I've attached two functions used locally.  (The attachments will be stripped off of the 
r-help response, but the questioner should get them).  The functions "neardate" and 
"tmerge" were written to deal with a query that comes up very often in our medical 
statistics work, some variety of "get the closest creatinine value to the subject's date 
of rehospitalization, at least one week before but no more than 1 year prior", or tasks 
that merge two data sets to create a single (start, stop] style one.

The neardate function is a variant on match().  Given two (id, date) pairs it will find 
the first pair in list 2 that has date2 <= date1 (or >=) and the same id.  The second 
variable can be any orderable class, but dates are the most common use and hence the name.

These are being added to the survival package release that should be out real-soon-now, 
once I add some extended examples of their use to the time dependent covariates vignette.

Terry Therneau

On 10/03/2014 05:00 AM, r-help-request at r-project.org wrote:
> Hello! I hope someone can help me. It would save me days of work. Thanks in
> advance!
> I have two dataframes which look like these:
>
>
> myframe <- data.frame (Timestamp=c("24.09.2012 09:00:00", "24.09.2012
> 10:00:00",
> "24.09.2012 11:00:00"), Event=c("low","high","low") )
> myframe
>
>
> mydata <- data.frame ( Timestamp=c("24.09.2012 09:05:01", "24.09.2012
> 09:49:50", "24.09.2012 09:51:01", "24.09.2012 10:04:50", "24.09.2012
> 10:05:10")
> , location=c("1","2","3","1","5") )
> mydata
>
>
> # I want to merge them by time so I have a dataframe which looks like this
> in the end (i.e. "Low"  during 5 min before and after "high" )
>
> result <- data.frame ( Timestamp=c("24.09.2012 09:05:01", "24.09.2012
> 09:49:50", "24.09.2012 09:51:01", "24.09.2012 10:04:50", "24.09.2012
> 10:05:10")
> , location=c("1","2","3","1","5") ,
> Event=c("low", "low","high","high","low"))
> result
>
> Anyone knows how do merge them?
> Best regards,
> Dagmar
>
-------------- next part --------------
#
# Create a "nearest date" index
#  date1: the trial date
#  date2: target to match to
#
# result: an index vector for data set 1, which shows the row in data set
#  2 that has the same id, and the best date.
#
# best = "after"  The closest date in #2 that is on or after the date in #1
#        "prior"  The closest date in #2 that is on or before the date in #1
# 
neardate <- function(date1, date2, id1, id2, best=c("after", "prior")) {
    if (!missing(id1)) {
        if (length(id1) != length(date1))
            stop("id1 and date1 have different lengths")
        if (missing(id2))
            stop("either both or neither of id1 and id2 must be supplied")
        if (class(id1) != class(id2) && !(is.numeric(id1) && is.numeric(id2)))
            stop("id1 and id2 are different data types")
        if (length(id2) != length(date2))
            stop("id2 and date2 have different lengths")
    }
    else if (!missing(id2))
        stop("either both or neither of id1 and id2 must be supplied")

    if (class(date1) != class(date2) & !(is.numeric(date1) & is.numeric(date2)))
        stop("date1 and date2 are diffrent data types")

    if (is.factor(date1) & !is.ordered(date1)) 
        stop("date1 cannot be a factor")

    date1 <- as.numeric(date1)  # this will fail for silly inputs like a list
    date2 <- as.numeric(date2)  
    best <- match.arg(best)

    # Throw out missing dates in the second arg, but remember which ones
    rowid <- 1:length(date2)
    if (any(is.na(date2))) {
        toss <- is.na(date2)
        date2 <- date2[!toss]
        if (!missing(id2)) id2 <- id2[!toss]
        rowid <- rowid[!toss]
    }
    n2 <- length(date2)
    if (n2 ==0) stop("No valid entries in data set 2")

    # Simpler case: no id variables
    rowid <- 1:length(date2)
    if (missing(id1)) {
        if (best=="prior")
            indx2 <- approx(date2, 1:n2, date1, method="constant", yleft=NA,
                            yright=n2, rule=2, f=0)$y
        else 
            indx2 <- approx(date2, 1:n2, date1, method="constant", yleft=1,
                            yright=NA, rule=2, f=1)$y
        return(rowid[indx2])
    }

    # match id values as well
    #   First toss out any rows in id2 that are not possible targets for id1
    #   (id2 is usually the larger data set, thinning speeds it up)
    indx1 <- match(id2, id1)
    toss <- is.na(indx1) 
    if (any(toss)) {
        id2 <- id2[!toss]
        date2 <- date2[!toss]
        indx1 <- indx1[!toss]
        rowid <- rowid[!toss]
    }

    n2 <- length(date2)
    if (n2 ==0) stop("No valid entries in data set 2")

    # We need to create a merging id.  A minimal amount of
    #  spread for the dates keeps numeric overflow at bay
    alldate <- sort(unique(c(date1, date2)))
    date1 <- match(date1, alldate)
    date2 <- match(date2, alldate)
    delta <- 1.0 + length(alldate)  #numeric, not integer, on purpose
    hash1 <- match(id1, id1)*delta + date1
    hash2 <- indx1*delta + date2 

    if (best=="prior")
        indx2 <- approx(hash2, 1:n2, hash1, method="constant", yleft=NA,
                        yright=n2, rule=2, f=0)$y
    else 
        indx2 <- approx(hash2, 1:n2, hash1, method="constant", yleft=1,
                        yright=NA, rule=2, f=1)$y
    rowid[ifelse(id1== id2[indx2], indx2, NA)]
}
-------------- next part --------------
tmerge <- function(data, id, start="start", end, status,  ...) {
    Call <- match.call()
    # The function wants to recognize some special keywords in the
    #  arguments, so define a set of functions which will be used to
    #  mark objects
    new <- new.env(parent=parent.frame())
    assign("count", function(x) {class(x) <- c("count", class(x)); x}, 
           envir=new)
    assign("cumcount", function(x) {class(x) <- c("cumcount", class(x)); x}
           , envir=new)
    assign("event",  function(x) {class(x) <- c("event", class(x)); x}, 
           envir=new)
    assign("cumevent", function(x) {class(x) <- c("cumevent", class(x)); x}
           , envir=new)
    assign("variable", 
           function(...) {x <- list(...); class(x) <- "variable"; x},
           envir= new)

    if (missing(data)) stop("a data argument is required")
    dname <- names(data)
    if (missing(id)) stop("the id argument is required")

    if (!inherits(data, "tmerge")) {
        # The first call has to set up the identities of the 
        #  variables
        # Id variable first
        tname <- rep("", 4)
        names(tname) <- c("id", "start", "end", "status")

        if (length(id)==1 && is.character(id)) {
            j <- match(id, names(data))
            if (missing(j)) stop("id variable name not found in data")
            tname[1] <- id
            id <- data[[id]]
        }
        else stop("on the initial call 'id' should be a variable name")

        if (missing(end)) stop("end argument not supplied")
        if (length(end)==1 && is.character(end)) {
            j <- match(end, names(data))
            if (missing(j)) stop("end variable name not found in data")
            tname[3] <- end
        }
        else stop("on the initial call 'end' should be a variable name")

        if (missing(status)) stop("status argument not supplied")
        if (length(status)==1 && is.character(status)) {
            j <- match(status, names(data))
            if (missing(j)) stop("status variable name not found in data")
            tname[4] <- status
        }
        else stop("on the initial call 'status' should be a variable name")
        temp <- data[[status]]
        if (is.logical(temp)) {
            temp <- ifelse(temp, 0,1)
            data[[status]] <- temp
        }
        if (!is.numeric(temp)) 
            stop("status variable must be numeric or logical")
        if (any(temp!=0 & temp!=1)) 
            stop("numeric status values must be 0 or 1")

        # The start variable is allowed to be missing, in which case we 
        #   add one
        if (length(start)==1 && is.character(start)) {
            j <- match(start, names(data))
            if (is.na(j)) data <- cbind(data, start=0)
            tname[2] <- start
        }
        else stop("on the initial call 'start' should be a variable name")

        unused <- Call[is.na(match(names(Call), c("data", "id", "start",
                                                  "end", "status")))]
    }
    else {
        tname <- attr(data, 'tname') # data is a prior tmerge object
        unused <- Call[is.na(match(names(Call), c("data", "id")))]
    }

    if (length(unused) <=1) {
        # An initial call, usually, with nothing to add.  Rather than indent
        #  the entire remainder of the code put a return here.
        attr(data, "tname") <- tname
        attr(data, "tcount") <- NULL  #remove tcount if it exists
        class(data) <- c("tmerge", class(tdata))
        return(data)
    }

    # Now for the actual work, adding a variable into the mix
    # Each of the newvars should be a time variable which fits into the
    # time scale of the starter data set
    unused[[1]] <- as.name("list")  # The as-yet unused arguments
    args <- eval(unused, envir=new)
    argclass <- sapply(args, function(x) (class(x))[1])
    argname <- names(args)
    if (any(argname== "")) stop("all argments must have a name")

    check <- match(argclass, c("count", "cumcount", "event", 
                               "cumevent", "variable"))
    if (any(is.na(check)))
        stop(paste("argument(s)", argname[is.na(check)], 
                   "not a recognized type"))

    dname <- match(tname, names(data))
    names(dname) <- names(tname)
    if (any(is.na(dname))) 
        stop("data set does not match its own tname attribute")

    indx <- match(id, data[[dname["id"]]])
    if (any(is.na(indx))) stop("new data has subjects not in the base data set")

    # The tcount matrix is useful for debugging
    tcount <- matrix(0L, length(argname), 7)
    dimnames(tcount) <- list(argname, c("early","late", "gap", "within", 
                                        "tied edge", "front edge", "back edge"))

    newdata <- data
    row.names(newdata) <- NULL

    for (i in 1:length(args)) {
        baseid <- newdata[[dname["id"]]]
        dstart <- newdata[[dname["start"]]]
        dstop  <- newdata[[dname["end"]]]

        # if an event time is missing then skip that obs
        if (argclass[i] == "variable")  etime <- args[[i]][[1]]
        else etime <- args[[i]]
        keep <- !is.na(etime)
        etime <- etime[keep]
        class(etime) <- class(etime)[-1] #throw away my fake class
        id <- id[keep]

        indx1 <- neardate(etime, dstart, id, baseid, best="prior")
        indx2 <- neardate(etime, dstop,  id, baseid, best="after")
        # The event times fall into one of 5 categories
        #   1. Before the first interval
        #   2. After the last interval
        #   3. Outside any interval but with time span, i.e, it falls into
        #       a gap in follow-up
        #   4. Strictly inside an interval (don't touch either end)
        #   5. Inside an interval, but touching.
        itype <- ifelse(is.na(indx1), 1,
                        ifelse(is.na(indx2), 2, 
                               ifelse(indx2 > indx1, 3,
                                      ifelse(etime== dstart[indx1] | 
                                             etime== dstop[indx2], 5, 4))))

        # Subdivide the events that touch on a boundary
        #   Common: e.g. the subject has time intervals of
        #      (a,b] and (b,c] with a new count at b.
        #  Start: an interval (a,b], new count at a, subject not at risk at a-0
        #  End: similar to start
        #  
        subtype <- ifelse(itype!=5, 0, 
                          ifelse(indx1 == indx2+1, 1,
                                 ifelse(etime==dstart[indx1], 2, 3)))
        tcount[i,] <- table(factor(itype+subtype, levels=c(1:4, 6:8)))

        if (argclass[i] == "variable") {
            stop("'variable' code not yet finished")
        }
        else {
            increment <- rep(0, nrow(newdata))
            eflag <- (argclass[i] %in% c("event", "cumevent")) # 'event' type
            if (eflag) {
                if (any(subtype==1)) { #subtype 1 events to the earlier interval
                    count1 <- table(indx2[subtype==1])
                    itemp <- as.numeric(names(count1))
                    increment[itemp] <- increment[itemp] + c(count1)
                }
                if (any(subtype==3)) { # go to the matched interval
                    count3 <- table(indx2[subtype==3])
                    itemp <- as.numeric(names(count3))
                    increment[itemp] <- increment[itemp] + c(count3)
                }
                #subtype 2 and type 3 events are ignored
            }
            else {
                if (any(itype==1)) {
                    count <- table(id[itype==1])  #there might be multiples
                    itemp <- match(names(count), baseid)
                    increment[itemp] <- increment[itemp] + c(count)
                }

                if (any(subtype==1)) { #subtype 1 events to the later interval
                    count1 <- table(indx1[subtype==1])
                    itemp <- as.numeric(names(count1))
                    increment[itemp] <- increment[itemp] + c(count1)
                }

                if (any(subtype==2)) { # subtype 2 go to the matched interval
                    count2 <- table(indx1[subtype==2])
                    itemp <- as.numeric(names(count2))
                    increment[itemp] <- increment[itemp] + c(count2)
                }
                # subtype 3 are ignored (as are type 3 events)
            } 

            # Type 4 forces us to split rows of the data set
            #  A single subject might have muliple jumps on a single day,
            #   or a single day a jump for multiple subjects, so we need
            #   to count up by day and subject
            if (any(itype==4)) {
                indx4 <- which(itype==4)
                n4 <- length(indx4)
                # first will be true for first of each unique (id, etime) set
                #  etime is the set of new times, indx1=the obs it went into
                #  so each of these represents a new value to insert
                firstid <- c(T, diff(indx1[indx4]) !=0)
                first <- (firstid | c(TRUE, diff(etime[indx4])!=0))
                irows<- (indx1[indx4])[first] #which rows in base to expand
                newrows <- c(table(irows))  #single interval may have >1 insert
                irows <- unique(irows)
                newcount <- diff(c(which(first), 1+n4)) #multiple events, 1 day
                etemp <- rep(1L, nrow(newdata))
                etemp[irows] <- 1+ newrows
                newindx <- rep(1:nrow(newdata), etemp)  
                newdata <- newdata[newindx,]
                increment <- increment[newindx]  #expand increment

                #
                # Now fix up the new data set
                #  For each subject the new set of start times = c(old,newtimes)
                #                                stop = c(newtimes, old)
                #                                status=c(rep(0,newtimes), old)
                #                                increment=c(old, count)
                #  rindx is the index of rows for each changed interval
                rindx <- which(diff(newindx)==0)  #the added rows
                newtimes <- (etime[indx4])[first]
                newdata[rindx,   dname["end"]] <- newtimes
                newdata[rindx+1, dname["start"]] <- newtimes
                newdata[rindx,   dname["status"]] <- 0
                if (eflag) increment[rindx] <- newcount
                else       increment[rindx+1] <- newcount
            }

            # Now update the count variable
            if (argclass[i] %in% c("cumcount", "cumevent")) {
                # Cumulative within person
                temp <- cumsum(increment)
                idname <- dname["id"]
                indx <- match(newdata[[idname]], newdata[[idname]])
                newdata[[argname[i]]] <- temp + increment[indx] - temp[indx]
            }
            else newdata[[argname[i]]] <- increment
        }
    }
    attr(newdata, "tcount") <- tcount
    row.names(newdata) <- NULL
    newdata
}