[R] Loop for taking sum of rows based on proximity to other non-NA rows

Sun Oct 20 23:11:25 CEST 2013

I thought this question looked interesting enough to make my own stab at 
it, but in hindsight I think this business of combining the counts seems 
quite unlikely to be necessary... it would be simpler and less damaging to 
the original data pattern to just remove groups of rows having fewer than 
"N" NAs.

removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
   N1 <- N+1
   rx <- rle( !is.na( dat$Count ) )
   # indexes of the ends of each run of NAs or non-NAs
   cs <- cumsum( rx$lengths )
   # indexes of the ends of runs of NAs or non-NAs
   cs2 <- cs[ !rx$values ]
   # If the first Count is NA, then drop first run of NAs
   if ( !rx$values[1] ) {
     cs2 <- cs2[ -1 ]
   }
   # If the last Count is NA, then drop last run of NAs
   if ( !rx$values[ length( rx$values ) ] ) {
     cs2 <- cs2[ -length( cs2 ) ]
   }
   # cs2 is indexes of rows to potentially receive deleted Counts
   # after collapse
   cs2 <- cs2 + 1
   # cs1 is indexes of non-NA Counts to be deleted
   cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
   # identify the indexes of the Count values before the strings
   # of NAs that meet the criteria
   if ( lessOrEqual ) {
     idx0 <- N1 >= ( cs2 - cs1 )
   } else {
     idx0 <- N1 == ( cs2 - cs1 )
   }
   idx1 <- cs1[ idx0 ]
   # identify the indexes of the Count values after the strings of
   # NAs that meet the criteria
   idx2 <- cs2[ idx0 ]
   # Identify which indexes are both sources and destinations
   idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
   # identify groups of indexes that need to be merged
   idx1g <- rev( cumsum( rev( !idx1c ) ) )
   # find which elements of idx1 represent the beginning of a
   # sequence of indexes to be replaced (meta-indexes)
   srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
   # find which elements of idx2 represent the end of a sequence
   # to be  replaced (meta-indexes)
   destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
   # add counts from before NAs to destination rows
   result <- dat
   srcidxList <- vector( mode="list", length=length( destmidxs ) )
   for ( i in seq.int( length( destmidxs ) ) ) {
     # row to which data will be copied
     destidx <- idx2[ destmidxs[ i ] ]
     # sequence of indexes of source rows
     srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
     result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
                        + sum( dat[ srcidxss, "Count" ], na.rm=TRUE )
     # keep a list of indexes to be removed
     srcidxList[ i ] <- list( srcidxss )
   }
   # remove source rows
   result <- result[ -unlist( srcidxList ), ]
   result
}

On Fri, 18 Oct 2013, arun wrote:

>
>
> Hi,
>
> Found a bug in the function when tested.  So, try this (added one more line):
>
> #Modified function
> fun1 <- function(dat,n) {
>  rl <- rle(is.na(dat[,"Count"]))
> indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>                      x2 <- x1[!is.na(x1$Count),]
>                      datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>                      rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]  
>                      row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
>                      datN
>                     })
> names(lst1) <- NULL
> lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
> dat2 <- do.call(rbind,lst1)
> indx2 <-  sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>
> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
> row.names(dat1New) <- 1:nrow(dat1New)
> dat1New
> }
>
>
>
> #Another function
> fun2 <- function(dat,n){
>  indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
>  indx1 <- indx[is.na(dat[,"Count"])]
>  names(indx1) <- which(is.na(dat[,"Count"]))
> indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
> lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
>                             x1 <- indx2[i]
>                              x2 <- as.numeric(names(x1))
>                              x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
>                              x4 <- subset(x3, !is.na(Count))
>                              x5 <- data.frame(Position=max(x4$Position),Count=sum(x4$Count))
>                             ind <- x4$Position %in% max(x4$Position)
>                              row.names(x5) <- if(sum(ind)>1) row.names(x4)[ind][1] else row.names(x4)[ind]
>                             x5
>                         })
> attr(lst1,"dimnames") <- NULL
>  dat2 <- do.call(rbind,lst1)
> indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]
>                                      x2 <- as.numeric(names(x1))
>                                      c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))
>
> dat$id <- 1:nrow(dat)
> dat2$id <- as.numeric(row.names(dat2))
> library(plyr)
> res <- join(dat,dat2[,-1],by="id",type="left")
> res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
> res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
> res2 <- res1[,1:2]
> row.names(res2) <- 1:nrow(res2)
> res2
> }
>
>
> identical(fun1(dat1,1),fun2(dat1,1))
> #[1] TRUE
> identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
> #[1] TRUE
>
> identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
> #[1] TRUE
>
>
> #Speed
> set.seed(185)
> datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE))
>  system.time(res <- fun1(datT,1))
>  #  user  system elapsed
>  # 0.676   0.000   0.676
>  system.time(res2 <- fun2(datT,1))
> #   user  system elapsed
> #  1.240   0.000   1.237
>  identical(res,res2)
> #[1] TRUE
>
> A.K.
>
>
>
>
>
>
>
>
>
> On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com> wrote:
> Hi,
>
> May be this helps:
>
> dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
> 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
> 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
> NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
> )), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA,
> -23L))
>
>
> #There might be simple solutions.
>
> fun1 <- function(dat,n) {
>  rl <- rle(is.na(dat[,"Count"]))
> indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>  lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>                          x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>                      x2 <- x1[!is.na(x1$Count),]
>                      datN <- data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>                      rowN <- row.names(x2)[x2$Position %in% max(x2$Position)]   
>                      row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
>                      datN
>                     })
> names(lst1) <- NULL
> dat2 <- do.call(rbind,lst1)
> indx2 <-  sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>
> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
> row.names(dat1New) <- 1:nrow(dat1New)
> dat1New
> }
>
> dat1N <- fun1(dat1,1)
> dat1N
>    Position Count
> 1        15    15
> 2        22    NA
> 3        38    NA
> 4        61    22
> 5        62    18
> 6        14    NA
> 7        29    NA
> 8        63    NA
> 9        46    28
> 10       24    NA
> 11       22    NA
> 12       49    16
> 13       42    19
> 14       38    NA
> 15       29    NA
> 16       22    NA
> 17       42    46
>
> dat2N <- fun1(dat1N,2)
> dat2N
>    Position Count
> 1        61    37
> 2        62    18
> 3        14    NA
> 4        29    NA
> 5        63    NA
> 6        49    44
> 7        42    19
> 8        38    NA
> 9        29    NA
> 10       22    NA
> 11       42    46
> dat3N <- fun1(dat2N,3)
> dat3N
>   Position Count
> 1       61    37
> 2       62    62
> 3       42    65
>
> A.K.
>
>
>
>
>
>
>
>
>
> Hi all, I have a dataset with 2 important columns, "Position" and
> "Count". There are a total of 34,532 rows, but only 457 non-NA values in the "Count" column (every cell in "Position" column has a value). I
> need to write a loop to march down the rows, and if there are 2 rows in
> "Count" where there is only 1 NA row between them, sum the two values up and print only one row with the summed Count value and the Position
> value that corresponds to the larger Count value, thus making the three
> rows into one. For example:
>
> Position Count
> 15 15
> 22 NA
> 38 NA
> 49 5
> 55 NA
> 61 17
>
> would become
>
> Position Count
> 15 15
> 22 NA
> 38 NA
> 61 22
>
> After this step, I also need to write another script to march
> down the rows and look for rows with only two NA's between non-NA rows
> in Count. This would make the previous data become
>
> Position Count
> 61 37
>
> Ideally I would like a loop that can be flexibly adjusted to the
> number of NA's in between adjacent non-NA values that can be freely
> changed. I would greatly appreciate any insight for this.
>
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>

---------------------------------------------------------------------------
Jeff Newmiller                        The     .....       .....  Go Live...
DCN:<jdnewmil at dcn.davis.ca.us>        Basics: ##.#.       ##.#.  Live Go...
                                       Live:   OO#.. Dead: OO#..  Playing
Research Engineer (Solar/Batteries            O.O#.       #.O#.  with
/Software/Embedded Controllers)               .OO#.       .OO#.  rocks...1k
---------------------------------------------------------------------------