[R] Loop for taking sum of rows based on proximity to other non-NA rows
Jeff Newmiller
jdnewmil at dcn.davis.ca.us
Mon Oct 21 01:49:32 CEST 2013
Looks like a right parenthesis was dropped. Corrected:
removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
N1 <- N+1
rx <- rle( !is.na( dat$Count ) )
# indexes of the ends of each run of NAs or non-NAs
cs <- cumsum( rx$lengths )
# indexes of the ends of runs of NAs or non-NAs
cs2 <- cs[ !rx$values ]
# If the first Count is NA, then drop first run of NAs
if ( !rx$values[1] ) {
cs2 <- cs2[ -1 ]
}
# If the last Count is NA, then drop last run of NAs
if ( !rx$values[ length( rx$values ) ] ) {
cs2 <- cs2[ -length( cs2 ) ]
}
# cs2 is indexes of rows to potentially receive deleted Counts
# after collapse
cs2 <- cs2 + 1
# cs1 is indexes of non-NA Counts to be deleted
cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
# identify the indexes of the Count values before the strings
# of NAs that meet the criteria
if ( lessOrEqual ) {
idx0 <- N1 >= ( cs2 - cs1 )
} else {
idx0 <- N1 == ( cs2 - cs1 )
}
idx1 <- cs1[ idx0 ]
# identify the indexes of the Count values after the strings of
# NAs that meet the criteria
idx2 <- cs2[ idx0 ]
# Identify which indexes are both sources and destinations
idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
# identify groups of indexes that need to be merged
idx1g <- rev( cumsum( rev( !idx1c ) ) )
# find which elements of idx1 represent the beginning of a
# sequence of indexes to be replaced (meta-indexes)
srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
# find which elements of idx2 represent the end of a sequence
# to be replaced (meta-indexes)
destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
# add counts from before NAs to destination rows
result <- dat
srcidxList <- vector( mode="list", length=length( destmidxs ) )
for ( i in seq.int( length( destmidxs ) ) ) {
# row to which data will be copied
destidx <- idx2[ destmidxs[ i ] ]
# sequence of indexes of source rows
srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
+ sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) )
# keep a list of indexes to be removed
srcidxList[ i ] <- list( srcidxss )
}
# remove source rows
result <- result[ -unlist( srcidxList ), ]
result
}
On Sun, 20 Oct 2013, Jeff Newmiller wrote:
> I thought this question looked interesting enough to make my own stab at it,
> but in hindsight I think this business of combining the counts seems quite
> unlikely to be necessary... it would be simpler and less damaging to the
> original data pattern to just remove groups of rows having fewer than "N"
> NAs.
>
> removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
> N1 <- N+1
> rx <- rle( !is.na( dat$Count ) )
> # indexes of the ends of each run of NAs or non-NAs
> cs <- cumsum( rx$lengths )
> # indexes of the ends of runs of NAs or non-NAs
> cs2 <- cs[ !rx$values ]
> # If the first Count is NA, then drop first run of NAs
> if ( !rx$values[1] ) {
> cs2 <- cs2[ -1 ]
> }
> # If the last Count is NA, then drop last run of NAs
> if ( !rx$values[ length( rx$values ) ] ) {
> cs2 <- cs2[ -length( cs2 ) ]
> }
> # cs2 is indexes of rows to potentially receive deleted Counts
> # after collapse
> cs2 <- cs2 + 1
> # cs1 is indexes of non-NA Counts to be deleted
> cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
> # identify the indexes of the Count values before the strings
> # of NAs that meet the criteria
> if ( lessOrEqual ) {
> idx0 <- N1 >= ( cs2 - cs1 )
> } else {
> idx0 <- N1 == ( cs2 - cs1 )
> }
> idx1 <- cs1[ idx0 ]
> # identify the indexes of the Count values after the strings of
> # NAs that meet the criteria
> idx2 <- cs2[ idx0 ]
> # Identify which indexes are both sources and destinations
> idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
> # identify groups of indexes that need to be merged
> idx1g <- rev( cumsum( rev( !idx1c ) ) )
> # find which elements of idx1 represent the beginning of a
> # sequence of indexes to be replaced (meta-indexes)
> srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
> # find which elements of idx2 represent the end of a sequence
> # to be replaced (meta-indexes)
> destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
> # add counts from before NAs to destination rows
> result <- dat
> srcidxList <- vector( mode="list", length=length( destmidxs ) )
> for ( i in seq.int( length( destmidxs ) ) ) {
> # row to which data will be copied
> destidx <- idx2[ destmidxs[ i ] ]
> # sequence of indexes of source rows
> srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
> result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
> + sum( dat[ srcidxss, "Count" ], na.rm=TRUE )
> # keep a list of indexes to be removed
> srcidxList[ i ] <- list( srcidxss )
> }
> # remove source rows
> result <- result[ -unlist( srcidxList ), ]
> result
> }
>
>
> On Fri, 18 Oct 2013, arun wrote:
>
>>
>>
>> Hi,
>>
>> Found a bug in the function when tested. So, try this (added one more
>> line):
>>
>> #Modified function
>> fun1 <- function(dat,n) {
>> rl <- rle(is.na(dat[,"Count"]))
>> indx <-
>> which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>> lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>> x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>> x2 <- x1[!is.na(x1$Count),]
>> datN <-
>> data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>> rowN <- row.names(x2)[x2$Position %in%
>> max(x2$Position)]
>> row.names(datN) <- if(length(rowN)>1) rowN[1] else
>> rowN
>> datN
>> })
>> names(lst1) <- NULL
>> lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>>
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>>
>>
>>
>> #Another function
>> fun2 <- function(dat,n){
>> indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
>> indx1 <- indx[is.na(dat[,"Count"])]
>> names(indx1) <- which(is.na(dat[,"Count"]))
>> indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
>> lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
>> x1 <- indx2[i]
>> x2 <- as.numeric(names(x1))
>> x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
>> x4 <- subset(x3, !is.na(Count))
>> x5 <-
>> data.frame(Position=max(x4$Position),Count=sum(x4$Count))
>> ind <- x4$Position %in% max(x4$Position)
>> row.names(x5) <- if(sum(ind)>1)
>> row.names(x4)[ind][1] else row.names(x4)[ind]
>> x5
>> })
>> attr(lst1,"dimnames") <- NULL
>> dat2 <- do.call(rbind,lst1)
>> indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i)
>> {x1 <- indx2[i]
>> x2 <- as.numeric(names(x1))
>> c(min(x2)-1L, x2,
>> max(x2)+1L)}),use.names=FALSE))
>>
>> dat$id <- 1:nrow(dat)
>> dat2$id <- as.numeric(row.names(dat2))
>> library(plyr)
>> res <- join(dat,dat2[,-1],by="id",type="left")
>> res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
>> res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
>> res2 <- res1[,1:2]
>> row.names(res2) <- 1:nrow(res2)
>> res2
>> }
>>
>>
>> identical(fun1(dat1,1),fun2(dat1,1))
>> #[1] TRUE
>> identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
>> #[1] TRUE
>>
>> identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
>> #[1] TRUE
>>
>>
>> #Speed
>> set.seed(185)
>> datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count=
>> sample(c(NA, 10:100),1e5, replace=TRUE))
>> system.time(res <- fun1(datT,1))
>> # user system elapsed
>> # 0.676 0.000 0.676
>> system.time(res2 <- fun2(datT,1))
>> # user system elapsed
>> # 1.240 0.000 1.237
>> identical(res,res2)
>> #[1] TRUE
>>
>> A.K.
>>
>>
>>
>>
>>
>>
>>
>>
>>
>> On Friday, October 18, 2013 4:19 PM, arun <smartpink111 at yahoo.com> wrote:
>> Hi,
>>
>> May be this helps:
>>
>> dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
>> 14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
>> 29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
>> NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
>> )), .Names = c("Position", "Count"), class = "data.frame", row.names =
>> c(NA,
>> -23L))
>>
>>
>> #There might be simple solutions.
>>
>> fun1 <- function(dat,n) {
>> rl <- rle(is.na(dat[,"Count"]))
>> indx <-
>> which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
>> lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
>> x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
>> x2 <- x1[!is.na(x1$Count),]
>> datN <-
>> data.frame(Position=max(x2$Position),Count=sum(x2$Count))
>> rowN <- row.names(x2)[x2$Position %in%
>> max(x2$Position)]
>> row.names(datN) <- if(length(rowN)>1) rowN[1] else
>> rowN
>> datN
>> })
>> names(lst1) <- NULL
>> dat2 <- do.call(rbind,lst1)
>> indx2 <-
>> sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x)
>> c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
>>
>> dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
>> dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
>> row.names(dat1New) <- 1:nrow(dat1New)
>> dat1New
>> }
>>
>> dat1N <- fun1(dat1,1)
>> dat1N
>> Position Count
>> 1 15 15
>> 2 22 NA
>> 3 38 NA
>> 4 61 22
>> 5 62 18
>> 6 14 NA
>> 7 29 NA
>> 8 63 NA
>> 9 46 28
>> 10 24 NA
>> 11 22 NA
>> 12 49 16
>> 13 42 19
>> 14 38 NA
>> 15 29 NA
>> 16 22 NA
>> 17 42 46
>>
>> dat2N <- fun1(dat1N,2)
>> dat2N
>> Position Count
>> 1 61 37
>> 2 62 18
>> 3 14 NA
>> 4 29 NA
>> 5 63 NA
>> 6 49 44
>> 7 42 19
>> 8 38 NA
>> 9 29 NA
>> 10 22 NA
>> 11 42 46
>> dat3N <- fun1(dat2N,3)
>> dat3N
>> Position Count
>> 1 61 37
>> 2 62 62
>> 3 42 65
>>
>> A.K.
>>
>>
>>
>>
>>
>>
>>
>>
>>
>> Hi all, I have a dataset with 2 important columns, "Position" and
>> "Count". There are a total of 34,532 rows, but only 457 non-NA values in
>> the "Count" column (every cell in "Position" column has a value). I
>> need to write a loop to march down the rows, and if there are 2 rows in
>> "Count" where there is only 1 NA row between them, sum the two values up
>> and print only one row with the summed Count value and the Position
>> value that corresponds to the larger Count value, thus making the three
>> rows into one. For example:
>>
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 49 5
>> 55 NA
>> 61 17
>>
>> would become
>>
>> Position Count
>> 15 15
>> 22 NA
>> 38 NA
>> 61 22
>>
>> After this step, I also need to write another script to march
>> down the rows and look for rows with only two NA's between non-NA rows
>> in Count. This would make the previous data become
>>
>> Position Count
>> 61 37
>>
>> Ideally I would like a loop that can be flexibly adjusted to the
>> number of NA's in between adjacent non-NA values that can be freely
>> changed. I would greatly appreciate any insight for this.
>>
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide
>> http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>>
>
> ---------------------------------------------------------------------------
> Jeff Newmiller The ..... ..... Go Live...
> DCN:<jdnewmil at dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
> Live: OO#.. Dead: OO#.. Playing
> Research Engineer (Solar/Batteries O.O#. #.O#. with
> /Software/Embedded Controllers) .OO#. .OO#. rocks...1k
> ---------------------------------------------------------------------------
---------------------------------------------------------------------------
Jeff Newmiller The ..... ..... Go Live...
DCN:<jdnewmil at dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
Live: OO#.. Dead: OO#.. Playing
Research Engineer (Solar/Batteries O.O#. #.O#. with
/Software/Embedded Controllers) .OO#. .OO#. rocks...1k
---------------------------------------------------------------------------
More information about the R-help
mailing list