[R] Loop for taking sum of rows based on proximity to other non-NA rows

Mon Oct 21 05:21:45 CEST 2013

Sorry, I noticed that when two "Count" values are the same and NA in between, my function fails.

#Modified
fun1 <- function(dat,n) {
 rl <- rle(is.na(dat[,"Count"]))
indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
 lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
                         x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
                     x2 <- x1[!is.na(x1$Count),]
                     datN <- data.frame(Position=x2$Position[x2$Count %in% max(x2$Count)],Count=sum(x2$Count))
                     rowN <- row.names(x2)[x2$Count %in% max(x2$Count)]   
                     datN<- if(length(rowN)>1) datN[1,] else datN
             row.names(datN) <- if(length(rowN) >1) rowN[1] else rowN    
                     datN
                    })
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))] 
dat2 <- do.call(rbind,lst1)
indx2 <-  sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}

#########################
fun2 <- function(dat,n){
 indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
 indx1 <- indx[is.na(dat[,"Count"])]
 names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
                            x1 <- indx2[i]
                             x2 <- as.numeric(names(x1))
                             x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
                             x4 <- subset(x3, !is.na(Count))
                             x5 <- data.frame(Position=x4$Position[x4$Count %in% max(x4$Count)],Count=sum(x4$Count))
                            ind <- x4$Count %in% max(x4$Count)
                             row.names(x5) <- row.names(x4)[ind] 
                            x5 <- if(sum(ind)>1) x5[1,] else x5
                            x5
                        })
attr(lst1,"dimnames") <- NULL
 dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]
                                     x2 <- as.numeric(names(x1))
                                     c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))

dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}

identical(fun1(dat1,1),fun2(dat1,1))
#[1] TRUE
identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
#[1] TRUE

identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
#[1] TRUE
fun1(fun1(fun1(dat1,1),2),3)
 # Position Count
#1       61    37
#2       18    62
#3       42    65

##When I tried the function on a bigger dataset:
set.seed(185)
datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE))
 dim(datT)
#[1] 100000      2

 system.time(res <- fun1(datT,1))
#   user  system elapsed 
 # 0.708   0.000   0.709 
system.time(res2 <- fun2(datT,1))
#   user  system elapsed 
 # 1.400   0.016   1.421 

system.time(res3 <- removeNNAs(datT,1))
#   user  system elapsed 
 # 1.068   0.000   1.071 

all.equal(res,res2)
#[1] TRUE
 all.equal(res,res3)
#[1] "Attributes: < Component 2: Numeric: lengths (97786, 97778) differ >"
#[2] "Component 1: Numeric: lengths (97786, 97778) differ"                
#[3] "Component 2: Numeric: lengths (97786, 97778) differ"  
dim(res)
#[1] 97786     2
dim(res3)
#[1] 97778     2

##Here your function seems to give the correct number of rows as:
rl <- rle(is.na(datT[,"Count"]))
indx <- which(is.na(datT[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==1]
 dim(datT)[1]- 2*length(indx)
#[1] 97778

#Here is where I think the difference occur (in addition to the one with the values)
datS <- datT[16000:20000,]
row.names(datS) <- 1:nrow(datS)

resT <- fun1(datS,1)
 resT3 <- removeNNAs(datS,1)

 datS[3402:3408,]
     Position Count
3402       72    70
3403       38    51
3404       80    NA
3405       26    44
3406       42    NA
3407       78    77
3408       70    89

resT3[3311:3318,]
     Position Count
3401       54    65
3402       72    70
3407       78   172######
3408       70    89
3409       27    40
3410       44    44
3411       73    75
3412       73    76

 resT[3311:3318,]
     Position Count
3311       29    98
3312       54    65
3313       72    70
3314       38    95####
3315       78   121 ###
3316       70    89
3317       27    40
3318       44    44

In these conditions, the post is not very clear about dealing it.

A.K.

On Sunday, October 20, 2013 9:36 PM, arun <smartpink111 at yahoo.com> wrote:
Hi Jeff,

I found some difference in results between your function and mine.  It also point out a mistake in my code. In the original post, it says:
""""""""""" 

I need to write a loop to march down the rows, and if there are 2 rows in 
"Count"
where there is only 1 NA row between them, sum the two values up and 
print only one row with the summed Count value and the Position value that corresponds to the larger Count value, thus making the three 
rows into one.
"""""""""

Sorry, I read it incorrectly the last time and selected the maximum  "Position" value instead of that corresponds to the larger Count value.  

After correcting the function, there is still some difference between the results. 

##fun1() and fun2() corrected
fun1 <- function(dat,n) {
 rl <- rle(is.na(dat[,"Count"]))
indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
 lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
                         x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
                     x2 <- x1[!is.na(x1$Count),]
                     datN <- data.frame(Position=x2$Position[x2$Count %in% max(x2$Count)],Count=sum(x2$Count))
                     rowN <- row.names(x2)[x2$Count %in% max(x2$Count)]   
                     row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
                     datN
                    })
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
dat2 <- do.call(rbind,lst1)
indx2 <-  sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))

dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}

##################################

fun2 <- function(dat,n){
 indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
 indx1 <- indx[is.na(dat[,"Count"])]
 names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
                            x1 <- indx2[i]
                             x2 <- as.numeric(names(x1))
                             x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
                             x4 <- subset(x3, !is.na(Count))
                             x5 <- data.frame(Position=x4$Position[x4$Count %in% max(x4$Count)],Count=sum(x4$Count))
                            ind <- x4$Count %in% max(x4$Count)
                             row.names(x5) <- if(sum(ind)>1) row.names(x4)[ind][1] else row.names(x4)[ind]
                            x5
                        })
attr(lst1,"dimnames") <- NULL
 dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]
                                     x2 <- as.numeric(names(x1))
                                     c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))

dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}

dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
)), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA,
-23L))

fun1(dat1,1)
   Position Count
1        15    15
2        22    NA
3        38    NA
4        61    22
5        62    18
6        14    NA
7        29    NA
8        63    NA
9        18    28  ###
10       24    NA
11       22    NA
12       49    16 ####
13       42    19
14       38    NA
15       29    NA
16       22    NA
17       42    46
removeNNAs(dat1,1) #gets similar results

#but,

 fun1(fun1(dat1,1),2)
   Position Count
1        61    37
2        62    18
3        14    NA
4        29    NA
5        63    NA
6        18    44 #######different
7        42    19
8        38    NA
9        29    NA
10       22    NA
11       42    46

 removeNNAs(dat1,2,lessOrEqual=TRUE)
   Position Count
6        61    37
7        62    18
8        14    NA
9        29    NA
10       63    NA
16       49    44 ###### different
17       42    19
18       38    NA
19       29    NA
20       22    NA
23       42    46
> 

removeNNAs(dat1,3,lessOrEqual=TRUE)
   Position Count
6        61    37
16       49    62
23       42    65
 fun1(fun1(fun1(dat1,1),2),3)
  Position Count
1       61    37
2       18    62
3       42    65

A.K.

On Sunday, October 20, 2013 7:49 PM, Jeff Newmiller <jdnewmil at dcn.davis.ca.us> wrote:
Looks like a right parenthesis was dropped. Corrected:

removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
   N1 <- N+1
   rx <- rle( !is.na( dat$Count ) )
   # indexes of the ends of each run of NAs or non-NAs
   cs <- cumsum( rx$lengths )
   # indexes of the ends of runs of NAs or non-NAs
   cs2 <- cs[ !rx$values ]
   # If the first Count is NA, then drop first run of NAs
   if ( !rx$values[1] ) {
     cs2 <- cs2[ -1 ]
   }
   # If the last Count is NA, then drop last run of NAs
   if ( !rx$values[ length( rx$values ) ] ) {
     cs2 <- cs2[ -length( cs2 ) ]
   }
   # cs2 is indexes of rows to potentially receive deleted Counts
   # after collapse
   cs2 <- cs2 + 1
   # cs1 is indexes of non-NA Counts to be deleted
   cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
   # identify the indexes of the Count values before the strings
   # of NAs that meet the criteria
   if ( lessOrEqual ) {
     idx0 <- N1 >= ( cs2 - cs1 )
   } else {
     idx0 <- N1 == ( cs2 - cs1 )
   }
   idx1 <- cs1[ idx0 ]
   # identify the indexes of the Count values after the strings of
   # NAs that meet the criteria
   idx2 <- cs2[ idx0 ]
   # Identify which indexes are both sources and destinations
   idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
   # identify groups of indexes that need to be merged
   idx1g <- rev( cumsum( rev( !idx1c ) ) )
   # find which elements of idx1 represent the beginning of a
   # sequence of indexes to be replaced (meta-indexes)
   srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
   # find which elements of idx2 represent the end of a sequence
   # to be  replaced (meta-indexes)
   destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
   # add counts from before NAs to destination rows
   result <- dat
   srcidxList <- vector( mode="list", length=length( destmidxs ) )
   for ( i in seq.int( length( destmidxs ) ) ) {
     # row to which data will be copied
     destidx <- idx2[ destmidxs[ i ] ]
     # sequence of indexes of source rows
     srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
     result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
                     + sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) )
     # keep a list of indexes to be removed
     srcidxList[ i ] <- list( srcidxss )
   }
   # remove source rows
   result <- result[ -unlist( srcidxList ), ]
   result
}