[R] Data frame sampling

Thu Feb 17 21:57:28 CET 2011

R users,

I have been trying to write a program in R that will extract rows from a data frame and combine the rows 
into a new smaller data frame while meeting several criteria. I would greatly appreciate any advice 
that could help me get started down the right path. What I want to do is to extract two rows per WEEK 
(26 weeks total) from the data frame by sampling without replacement from the variable STRATA_NUM (1:40) 
and using this output to index the data frame and extract rows, moving from one week to the next. I 
created STRATA_NUM to use as an index to identify unique sampling frame strata (e.g. unique DOW, SITE, 
and TOD permutations---40 total). This would result in the entire STRATA_NUM vector being sampled by 
the end of WEEK 20, at which point I would need the code to resample again without replacement from 
the full vector for the remaining 6 weeks (i.e. 12 more rows extracted). One final condition is that 
DOWs must not be duplicated within any given WEEK (e.g. I cannot sample two Tuesdays in any one week). 
The reasoning for my approach is that I want even coverage of all 40 unique spatiotemporal strata in 
my final data frame. 

Thank you,

Mike

### The following code extracts two rows per week randomly without replacement and ensures that no 
### DOW is duplicated within a week, but it fails to incorporate even coverage of strata. 

### Two-stage weekly draw: oversample rows within each WEEK, keep only the
### first row drawn for each DOW in that week, then draw the final two rows per
### week from what is left, so a week can never contain the same DOW twice.
### WEEKDAY_STRATA must already exist before this runs. Even coverage of
### STRATA_NUM is NOT enforced by this code.

set.seed(3000)  ### fixed seed so the random draws are reproducible

### Draw up to `size` rows, without replacement, from every WEEK of `df` and
### stack the per-week draws into one data frame, in split() order.
### `size` is capped at the rows available, so a short week returns all of its
### rows instead of making the sampler fail.
sample_rows_per_week <- function(df, size) {
  per_week <- lapply(split(df, df$WEEK), function(wk) {
    wk[sample.int(nrow(wk), min(size, nrow(wk))), , drop = FALSE]
  })
  do.call("rbind", per_week)
}

### Stage 1: oversample 10 candidate rows per week.
WEEKDAY_STRATA1 <- sample_rows_per_week(WEEKDAY_STRATA, 10)

### Stage 2: keep the first candidate of each (WEEK, DOW) pair. Testing the
### pair directly avoids building a per-week logical matrix and flattening it,
### and does not depend on DOW being the first column or on every week
### holding the same number of rows.
WEEKDAY_STRATA2 <-
  WEEKDAY_STRATA1[!duplicated(WEEKDAY_STRATA1[, c("WEEK", "DOW")]), ]

### Stage 3: draw the final two rows per week from the DOW-unique candidates.
WEEKDAY_STRATA3 <- sample_rows_per_week(WEEKDAY_STRATA2, 2)
WEEKDAY_STRATA3

### I only included the first three weeks for obvious reasons

### WEEKDAY_STRATA: example sampling frame in dput() form, 120 rows
### (3 weeks x 40 rows per week). Columns:
###   DOW        factor, levels Mon..Fri
###   SITE       numeric site code, 101..104
###   TOD        factor, levels Morn / Aftn
###   STRATA_NUM integer 1..40, one value per DOW x SITE x TOD combination
###   DATE       class "Date", stored as day counts (14732 is 2010-05-03)
###   DOW_NUM    numeric 3..7, running Mon = 3 through Fri = 7
###   WEEK       numeric week index, 1..3
### The row names are non-consecutive integers -- presumably carried over from
### a larger parent data frame (not shown here; confirm against the source).
WEEKDAY_STRATA <-
structure(list(DOW = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L), .Label = c("Mon", "Tue", "Wed", "Thu", "Fri"), class = "factor"), 
    SITE = c(101, 101, 102, 102, 103, 103, 104, 104, 101, 101, 
    102, 102, 103, 103, 104, 104, 101, 101, 102, 102, 103, 103, 
    104, 104, 101, 101, 102, 102, 103, 103, 104, 104, 101, 101, 
    102, 102, 103, 103, 104, 104, 101, 101, 102, 102, 103, 103, 
    104, 104, 101, 101, 102, 102, 103, 103, 104, 104, 101, 101, 
    102, 102, 103, 103, 104, 104, 101, 101, 102, 102, 103, 103, 
    104, 104, 101, 101, 102, 102, 103, 103, 104, 104, 101, 101, 
    102, 102, 103, 103, 104, 104, 101, 101, 102, 102, 103, 103, 
    104, 104, 101, 101, 102, 102, 103, 103, 104, 104, 101, 101, 
    102, 102, 103, 103, 104, 104, 101, 101, 102, 102, 103, 103, 
    104, 104), TOD = structure(c(2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
    1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
    2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("Morn", "Aftn"
    ), class = "factor"), STRATA_NUM = c(2L, 1L, 12L, 11L, 22L, 
    21L, 32L, 31L, 4L, 3L, 14L, 13L, 24L, 23L, 34L, 33L, 6L, 
    5L, 16L, 15L, 26L, 25L, 36L, 35L, 8L, 7L, 18L, 17L, 28L, 
    27L, 38L, 37L, 10L, 9L, 20L, 19L, 30L, 29L, 40L, 39L, 2L, 
    1L, 12L, 11L, 22L, 21L, 32L, 31L, 4L, 3L, 14L, 13L, 24L, 
    23L, 34L, 33L, 6L, 5L, 16L, 15L, 26L, 25L, 36L, 35L, 8L, 
    7L, 18L, 17L, 28L, 27L, 38L, 37L, 10L, 9L, 20L, 19L, 30L, 
    29L, 40L, 39L, 2L, 1L, 12L, 11L, 22L, 21L, 32L, 31L, 4L, 
    3L, 14L, 13L, 24L, 23L, 34L, 33L, 6L, 5L, 16L, 15L, 26L, 
    25L, 36L, 35L, 8L, 7L, 18L, 17L, 28L, 27L, 38L, 37L, 10L, 
    9L, 20L, 19L, 30L, 29L, 40L, 39L), DATE = structure(c(14732, 
    14732, 14732, 14732, 14732, 14732, 14732, 14732, 14733, 14733, 
    14733, 14733, 14733, 14733, 14733, 14733, 14734, 14734, 14734, 
    14734, 14734, 14734, 14734, 14734, 14735, 14735, 14735, 14735, 
    14735, 14735, 14735, 14735, 14736, 14736, 14736, 14736, 14736, 
    14736, 14736, 14736, 14739, 14739, 14739, 14739, 14739, 14739, 
    14739, 14739, 14740, 14740, 14740, 14740, 14740, 14740, 14740, 
    14740, 14741, 14741, 14741, 14741, 14741, 14741, 14741, 14741, 
    14742, 14742, 14742, 14742, 14742, 14742, 14742, 14742, 14743, 
    14743, 14743, 14743, 14743, 14743, 14743, 14743, 14746, 14746, 
    14746, 14746, 14746, 14746, 14746, 14746, 14747, 14747, 14747, 
    14747, 14747, 14747, 14747, 14747, 14748, 14748, 14748, 14748, 
    14748, 14748, 14748, 14748, 14749, 14749, 14749, 14749, 14749, 
    14749, 14749, 14749, 14750, 14750, 14750, 14750, 14750, 14750, 
    14750, 14750), class = "Date"), DOW_NUM = c(3, 3, 3, 3, 3, 
    3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 
    6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 3, 3, 3, 
    3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 
    5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 3, 
    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 
    5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 
    7), WEEK = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
    1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
    2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3)), .Names = c("DOW", "SITE", 
"TOD", "STRATA_NUM", "DATE", "DOW_NUM", "WEEK"), row.names = c(223L, 
235L, 272L, 287L, 330L, 346L, 379L, 395L, 634L, 655L, 683L, 707L, 
750L, 761L, 790L, 808L, 837L, 865L, 887L, 923L, 948L, 968L, 1001L, 
1017L, 418L, 448L, 476L, 502L, 527L, 554L, 580L, 600L, 16L, 37L, 
74L, 95L, 123L, 135L, 176L, 192L, 215L, 236L, 277L, 288L, 331L, 
339L, 384L, 391L, 625L, 653L, 684L, 705L, 737L, 755L, 783L, 807L, 
834L, 859L, 885L, 915L, 944L, 967L, 993L, 1020L, 417L, 443L, 
469L, 501L, 532L, 550L, 577L, 603L, 17L, 36L, 57L, 79L, 107L, 
132L, 158L, 183L, 210L, 237L, 269L, 299L, 315L, 348L, 372L, 397L, 
626L, 657L, 689L, 714L, 731L, 758L, 787L, 811L, 833L, 860L, 888L, 
911L, 940L, 964L, 991L, 1016L, 421L, 451L, 480L, 495L, 529L, 
553L, 582L, 602L, 18L, 31L, 58L, 80L, 120L, 141L, 169L, 189L), class = "data.frame")