[R] Loop for taking sum of rows based on proximity to other non-NA rows

Mon Oct 21 01:49:32 CEST 2013

Looks like a right parenthesis was dropped. Corrected:

removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
   N1 <- N+1
   rx <- rle( !is.na( dat$Count ) )
   # indexes of the ends of each run of NAs or non-NAs
   cs <- cumsum( rx$lengths )
   # indexes of the ends of runs of NAs or non-NAs
   cs2 <- cs[ !rx$values ]
   # If the first Count is NA, then drop first run of NAs
   if ( !rx$values[1] ) {
     cs2 <- cs2[ -1 ]
   }
   # If the last Count is NA, then drop last run of NAs
   if ( !rx$values[ length( rx$values ) ] ) {
     cs2 <- cs2[ -length( cs2 ) ]
   }
   # cs2 is indexes of rows to potentially receive deleted Counts
   # after collapse
   cs2 <- cs2 + 1
   # cs1 is indexes of non-NA Counts to be deleted
   cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
   # identify the indexes of the Count values before the strings
   # of NAs that meet the criteria
   if ( lessOrEqual ) {
     idx0 <- N1 >= ( cs2 - cs1 )
   } else {
     idx0 <- N1 == ( cs2 - cs1 )
   }
   idx1 <- cs1[ idx0 ]
   # identify the indexes of the Count values after the strings of
   # NAs that meet the criteria
   idx2 <- cs2[ idx0 ]
   # Identify which indexes are both sources and destinations
   idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
   # identify groups of indexes that need to be merged
   idx1g <- rev( cumsum( rev( !idx1c ) ) )
   # find which elements of idx1 represent the beginning of a
   # sequence of indexes to be replaced (meta-indexes)
   srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
   # find which elements of idx2 represent the end of a sequence
   # to be  replaced (meta-indexes)
   destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
   # add counts from before NAs to destination rows
   result <- dat
   srcidxList <- vector( mode="list", length=length( destmidxs ) )
   for ( i in seq.int( length( destmidxs ) ) ) {
     # row to which data will be copied
     destidx <- idx2[ destmidxs[ i ] ]
     # sequence of indexes of source rows
     srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
     result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
                     + sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) )
     # keep a list of indexes to be removed
     srcidxList[ i ] <- list( srcidxss )
   }
   # remove source rows
   result <- result[ -unlist( srcidxList ), ]
   result
}

On Sun, 20 Oct 2013, Jeff Newmiller wrote:

> I thought this question looked interesting enough to make my own stab at it, 
> but in hindsight I think this business of combining the counts seems quite 
> unlikely to be necessary... it would be simpler and less damaging to the 
> original data pattern to just remove groups of rows having fewer than "N" 
> NAs.
>
> removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
>  N1 <- N+1
>  rx <- rle( !is.na( dat$Count ) )
>  # indexes of the ends of each run of NAs or non-NAs
>  cs <- cumsum( rx$lengths )
>  # indexes of the ends of runs of NAs or non-NAs
>  cs2 <- cs[ !rx$values ]
>  # If the first Count is NA, then drop first run of NAs
>  if ( !rx$values[1] ) {
>    cs2 <- cs2[ -1 ]
>  }
>  # If the last Count is NA, then drop last run of NAs
>  if ( !rx$values[ length( rx$values ) ] ) {
>    cs2 <- cs2[ -length( cs2 ) ]
>  }
>  # cs2 is indexes of rows to potentially receive deleted Counts
>  # after collapse
>  cs2 <- cs2 + 1
>  # cs1 is indexes of non-NA Counts to be deleted
>  cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
>  # identify the indexes of the Count values before the strings
>  # of NAs that meet the criteria
>  if ( lessOrEqual ) {
>    idx0 <- N1 >= ( cs2 - cs1 )
>  } else {
>    idx0 <- N1 == ( cs2 - cs1 )
>  }
>  idx1 <- cs1[ idx0 ]
>  # identify the indexes of the Count values after the strings of
>  # NAs that meet the criteria
>  idx2 <- cs2[ idx0 ]
>  # Identify which indexes are both sources and destinations
>  idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
>  # identify groups of indexes that need to be merged
>  idx1g <- rev( cumsum( rev( !idx1c ) ) )
>  # find which elements of idx1 represent the beginning of a
>  # sequence of indexes to be replaced (meta-indexes)
>  srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
>  # find which elements of idx2 represent the end of a sequence
>  # to be  replaced (meta-indexes)
>  destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
>  # add counts from before NAs to destination rows
>  result <- dat
>  srcidxList <- vector( mode="list", length=length( destmidxs ) )
>  for ( i in seq.int( length( destmidxs ) ) ) {
>    # row to which data will be copied
>    destidx <- idx2[ destmidxs[ i ] ]
>    # sequence of indexes of source rows
>    srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
>    result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
>                       + sum( dat[ srcidxss, "Count" ], na.rm=TRUE )
>    # keep a list of indexes to be removed
>    srcidxList[ i ] <- list( srcidxss )
>  }
>  # remove source rows
>  result <- result[ -unlist( srcidxList ), ]
>  result
> }
>
>
> On Fri, 18 Oct 2013, arun wrote:
>
>> 
>> 
>> Hi,
>> 
>> Found a bug in the function when tested.  So, try this (added one more 
>> line):
>> 
>> #Modified function
>> fun1 <- function(dat,n) {
>>  rl <- rle(is.na(dat[,"Count"]))
>> indx <- 
>> which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>>                      x2 <- x1[!is.na(x1$Count),]
>>                      datN <- 
>> data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>>                      rowN <- row.names(x2)[x2$Position %in% 
>> max(x2$Position)]  
>>                      row.names(datN) <- if(length(rowN)>1) rowN[1] else 
>> rowN
>>                      datN
>>                     })
>> names(lst1) <- NULL
>> lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-  
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) 
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>> 
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>> 
>> 
>> 
>> #Another function
>> fun2 <- function(dat,n){
>>  indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
>>  indx1 <- indx[is.na(dat[,"Count"])]
>>  names(indx1) <- which(is.na(dat[,"Count"]))
>> indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
>> lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
>>                             x1 <- indx2[i]
>>                              x2 <- as.numeric(names(x1))
>>                              x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
>>                              x4 <- subset(x3, !is.na(Count))
>>                              x5 <- 
>> data.frame(Position=max(x4$Position),Count=sum(x4$Count))
>>                             ind <- x4$Position %in% max(x4$Position)
>>                              row.names(x5) <- if(sum(ind)>1) 
>> row.names(x4)[ind][1] else row.names(x4)[ind]
>>                             x5
>>                         })
>> attr(lst1,"dimnames") <- NULL
>>  dat2 <- do.call(rbind,lst1)
>> indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) 
>> {x1 <- indx2[i]
>>                                      x2 <- as.numeric(names(x1))
>>                                      c(min(x2)-1L, x2, 
>> max(x2)+1L)}),use.names=FALSE))
>> 
>> dat$id <- 1:nrow(dat)
>> dat2$id <- as.numeric(row.names(dat2))
>> library(plyr)
>> res <- join(dat,dat2[,-1],by="id",type="left")
>> res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
>> res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
>> res2 <- res1[,1:2]
>> row.names(res2) <- 1:nrow(res2)
>> res2
>> }
>> 
>> 
>> identical(fun1(dat1,1),fun2(dat1,1))
>> #[1] TRUE
>> identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
>> #[1] TRUE
>> 
>> identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
>> #[1] TRUE
>> 
>> 
>> #Speed
>> set.seed(185)
>> datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= 
>> sample(c(NA, 10:100),1e5, replace=TRUE))
>>  system.time(res <- fun1(datT,1))
>>  #  user  system elapsed
>>  # 0.676   0.000   0.676
>>  system.time(res2 <- fun2(datT,1))
>> #   user  system elapsed
>> #  1.240   0.000   1.237
>>  identical(res,res2)
>> #[1] TRUE
>> 
>> A.K.
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com> wrote:
>> Hi,
>> 
>> May be this helps:
>> 
>> dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
>> 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
>> 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
>> NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
>> )), .Names = c("Position", "Count"), class = "data.frame", row.names = 
>> c(NA,
>> -23L))
>> 
>> 
>> #There might be simple solutions.
>> 
>> fun1 <- function(dat,n) {
>>  rl <- rle(is.na(dat[,"Count"]))
>> indx <- 
>> which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>>                      x2 <- x1[!is.na(x1$Count),]
>>                      datN <- 
>> data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>>                      rowN <- row.names(x2)[x2$Position %in% 
>> max(x2$Position)]   
>>                      row.names(datN) <- if(length(rowN)>1) rowN[1] else 
>> rowN
>>                      datN
>>                     })
>> names(lst1) <- NULL
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-  
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) 
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>> 
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>> 
>> dat1N <- fun1(dat1,1)
>> dat1N
>>    Position Count
>> 1        15    15
>> 2        22    NA
>> 3        38    NA
>> 4        61    22
>> 5        62    18
>> 6        14    NA
>> 7        29    NA
>> 8        63    NA
>> 9        46    28
>> 10       24    NA
>> 11       22    NA
>> 12       49    16
>> 13       42    19
>> 14       38    NA
>> 15       29    NA
>> 16       22    NA
>> 17       42    46
>> 
>> dat2N <- fun1(dat1N,2)
>> dat2N
>>    Position Count
>> 1        61    37
>> 2        62    18
>> 3        14    NA
>> 4        29    NA
>> 5        63    NA
>> 6        49    44
>> 7        42    19
>> 8        38    NA
>> 9        29    NA
>> 10       22    NA
>> 11       42    46
>> dat3N <- fun1(dat2N,3)
>> dat3N
>>   Position Count
>> 1       61    37
>> 2       62    62
>> 3       42    65
>> 
>> A.K.
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> 
>> Hi all, I have a dataset with 2 important columns, "Position" and
>> "Count". There are a total of 34,532 rows, but only 457 non-NA values in 
>> the "Count" column (every cell in "Position" column has a value). I
>> need to write a loop to march down the rows, and if there are 2 rows in
>> "Count" where there is only 1 NA row between them, sum the two values up 
>> and print only one row with the summed Count value and the Position
>> value that corresponds to the larger Count value, thus making the three
>> rows into one. For example:
>> 
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 49 5
>> 55 NA
>> 61 17
>> 
>> would become
>> 
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 61 22
>> 
>> After this step, I also need to write another script to march
>> down the rows and look for rows with only two NA's between non-NA rows
>> in Count. This would make the previous data become
>> 
>> Position Count
>> 61 37
>> 
>> Ideally I would like a loop that can be flexibly adjusted to the
>> number of NA's in between adjacent non-NA values that can be freely
>> changed. I would greatly appreciate any insight for this.
>> 
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide 
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>> 
>
> ---------------------------------------------------------------------------
> Jeff Newmiller                        The     .....       .....  Go Live...
> DCN:<jdnewmil at dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
>                                      Live:   OO#.. Dead: OO#..  Playing
> Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
> /Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
> ---------------------------------------------------------------------------

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil at dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                       Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
---------------------------------------------------------------------------