[R] stop on rows where !is.na(mydata$ti_all)

Mon Sep 24 12:38:35 CEST 2012

Dear R experts,

I got help to build a loop but there is a bug inside it that causes
one part of the mechanism to fail.

It should grow once, but if keep growing on rows where $ti_all is not NA.

Here is a wall of code that very crudely demonstrates the problem,
there is a couple of dim() outputs at the end where you can see how it
the second time around keeps adds (2) rows, but this does not happen
to row 2, I'm aware this is part of another function, but I can't
figure it out for that. The first thing that happen is correct, that 7
rows are added.

Any help or guidance would be appreciated.

Thanks,
Eric

lookup <- structure(list(c_name = c(1L, 2L, 4L, 5L, 6L, 7L), t_name =
structure(1:6, .Label = c("Bob", "Julian", "Mitt", "Ricky", "Tom",
"Victor"), class = "factor")), .Names = c("c_name", "t_name"), class =
"data.frame", row.names = c("1", "2", "3", "4", "5", "6"))

mydata <- structure(list(id = c(1L, 1L, 2L, 3L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 4L, 5L, 5L, 6L, 7L, 7L, 7L, 7L, 8L, 9L), time =
c("intake_arm_1", "v_001_arm_1", "intake_arm_1", "intake_arm_1",
"intake_arm_1", "v_001_arm_1", "v_002_arm_1", "v_003_arm_1",
"v_004_arm_1", "v_005_arm_1", "v_006_arm_1", "v_007_arm_1",
"intake_arm_1", "v_001_arm_1", "intake_arm_1", "intake_arm_1",
"v_011_arm_1", "v_012_arm_1", "v_013_arm_1", "intake_arm_1",
"intake_arm_1"), dat_all = c(NA, NA, NA, NA, NA, NA, NA, "2012-09-23",
"2012-09-23", "2012-09-02", "2012-09-10", "2012-09-23", NA, NA, NA,
NA, "2012-09-23", "2012-09-23", "2012-09-23", NA, NA), ti_all = c(NA,
NA, NA, NA, NA, NA, NA, 6L, 44L, 33L, NA, 22L, NA, NA, NA, NA, 65L,
NA, 10L, NA, NA),     ty_all = c(NA, NA, NA, NA, NA, NA, NA, "out_",
"out_", "cma_",     NA, "cma_", NA, NA, NA, NA, "out_", "out_",
"out_", NA, NA), out_c = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
    NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), cma_c = c(NA, NA,     NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,     NA, NA,
NA, NA), c_n = c(NA, 1L, NA, NA, NA, NA, NA, 7L,     4L, 7L, NA, 1L,
NA, 2L, NA, NA, 7L, 7L, 7L, NA, NA), t_name = c("Tom",     NA,
"Ricky", "Ricky", "Victor", NA, NA, NA, NA, NA, NA, NA,     "Julian",
NA, "Julian", "Bob", NA, NA, NA, "Mitt", "Mitt")), .Names = c("id",
"time", "dat_all", "ti_all", "ty_all", "out_c", "cma_c", "c_n",
"t_name"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15", "16",
"17", "18", "19", "20", "21"))

 if(require(plyr)){
  print("plyr is loaded correctly")
 } else {
  print("trying to install plyr")
  install.packages('plyr')
  if(require(plyr)){
    print("plyr installed and loaded")
  } else {
   stop("could not install plyr")
  }
}

newrows <- ddply(mydata, .(id), function(subdata) {
    subdata_ty = subdata[!is.na(subdata$ty_all), ]
    if (NROW(subdata) == 1) {
        r = subdata[1, ]
        c("v_001_arm_1", NA, NA, NA, NA, NA,
                    lookup$c_name[lookup$t_name == r$t_name], NA)
    }
    else if (NROW(subdata_ty) > 0) {
        numbers = sapply(strsplit(subdata$time, "_"), function(l)
            ifelse(l[1] != "intake", as.numeric(l[2]), 0))
        newname = paste(c("v", sprintf("%03d", max(numbers) + 1), "arm", "1"),
                            collapse="_")
        r1 = subdata[1, ]
        new_c_n = lookup$c_name[lookup$t_name == r1$t_name]

        new_out_c = sum(subdata$ty_all == "out_" & !is.na(subdata$ti_all))
        new_cma_c = sum(subdata$ty_all == "cma_" & !is.na(subdata$ti_all))

        new_out_c = ifelse(new_out_c == 0, NA, new_out_c)
        new_cma_c = ifelse(new_cma_c == 0, NA, new_cma_c)

        return(c(newname, NA, NA, NA, new_out_c, new_cma_c, new_c_n, NA))
    }
})

# recombine and sort
colnames(newrows) = colnames(mydata)
newdata = rbind(mydata, newrows)
newdata = newdata[order(newdata$id), ]

mydata2 <- newdata

newrows2 <- ddply(mydata2, .(id), function(subdata) {
    subdata_ty = subdata[!is.na(subdata$ty_all), ]
    if (NROW(subdata) == 1) {
        r = subdata[1, ]
        c("v_001_arm_1", NA, NA, NA, NA, NA,
                    lookup$c_name[lookup$t_name == r$t_name], NA)
    }
    else if (NROW(subdata_ty) > 0) {
        numbers = sapply(strsplit(subdata$time, "_"), function(l)
            ifelse(l[1] != "intake", as.numeric(l[2]), 0))
        newname = paste(c("v", sprintf("%03d", max(numbers) + 1), "arm", "1"),
                            collapse="_")
        r1 = subdata[1, ]
        new_c_n = lookup$c_name[lookup$t_name == r1$t_name]

        new_out_c = sum(subdata$ty_all == "out_" & !is.na(subdata$ti_all))
        new_cma_c = sum(subdata$ty_all == "cma_" & !is.na(subdata$ti_all))

        new_out_c = ifelse(new_out_c == 0, NA, new_out_c)
        new_cma_c = ifelse(new_cma_c == 0, NA, new_cma_c)

        return(c(newname, NA, NA, NA, new_out_c, new_cma_c, new_c_n, NA))
    }
})

# recombine and sort
colnames(newrows2) = colnames(mydata2)
newdata2 = rbind(mydata2, newrows2)
newdata2 = newdata2[order(newdata2$id), ]

mydata3 <- newdata2

newrows2 <- ddply(mydata3, .(id), function(subdata) {
    subdata_ty = subdata[!is.na(subdata$ty_all), ]
    if (NROW(subdata) == 1) {
        r = subdata[1, ]
        c("v_001_arm_1", NA, NA, NA, NA, NA,
                    lookup$c_name[lookup$t_name == r$t_name], NA)
    }
    else if (NROW(subdata_ty) > 0) {
        numbers = sapply(strsplit(subdata$time, "_"), function(l)
            ifelse(l[1] != "intake", as.numeric(l[2]), 0))
        newname = paste(c("v", sprintf("%03d", max(numbers) + 1), "arm", "1"),
                            collapse="_")
        r1 = subdata[1, ]
        new_c_n = lookup$c_name[lookup$t_name == r1$t_name]

        new_out_c = sum(subdata$ty_all == "out_" & !is.na(subdata$ti_all))
        new_cma_c = sum(subdata$ty_all == "cma_" & !is.na(subdata$ti_all))

        new_out_c = ifelse(new_out_c == 0, NA, new_out_c)
        new_cma_c = ifelse(new_cma_c == 0, NA, new_cma_c)

        return(c(newname, NA, NA, NA, new_out_c, new_cma_c, new_c_n, NA))
    }
})

# recombine and sort
colnames(newrows2) = colnames(mydata3)
newdata3 = rbind(mydata3, newrows2)
newdata3 = newdata3[order(newdata3$id), ]

identical(newdata3, newdata2)
identical(newdata2, newdata)
dim(mydata)
dim(newdata)
dim(newdata2)
dim(newdata3)