[R] Loop for taking sum of rows based on proximity to other non-NA rows
arun
smartpink111 at yahoo.com
Mon Oct 21 05:21:45 CEST 2013
Sorry, I noticed that when two "Count" values are the same and NA in between, my function fails.
#Modified
fun1 <- function(dat,n) {
rl <- rle(is.na(dat[,"Count"]))
indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
x2 <- x1[!is.na(x1$Count),]
datN <- data.frame(Position=x2$Position[x2$Count %in% max(x2$Count)],Count=sum(x2$Count))
rowN <- row.names(x2)[x2$Count %in% max(x2$Count)]
datN<- if(length(rowN)>1) datN[1,] else datN
row.names(datN) <- if(length(rowN) >1) rowN[1] else rowN
datN
})
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))]
dat2 <- do.call(rbind,lst1)
indx2 <- sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}
#########################
fun2 <- function(dat,n){
indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
indx1 <- indx[is.na(dat[,"Count"])]
names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
x1 <- indx2[i]
x2 <- as.numeric(names(x1))
x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
x4 <- subset(x3, !is.na(Count))
x5 <- data.frame(Position=x4$Position[x4$Count %in% max(x4$Count)],Count=sum(x4$Count))
ind <- x4$Count %in% max(x4$Count)
row.names(x5) <- row.names(x4)[ind]
x5 <- if(sum(ind)>1) x5[1,] else x5
x5
})
attr(lst1,"dimnames") <- NULL
dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]
x2 <- as.numeric(names(x1))
c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))
dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}
identical(fun1(dat1,1),fun2(dat1,1))
#[1] TRUE
identical(fun1(fun1(dat1,1),2),fun2(fun2(dat1,1),2))
#[1] TRUE
identical(fun1(fun1(fun1(dat1,1),2),3),fun2(fun2(fun2(dat1,1),2),3))
#[1] TRUE
fun1(fun1(fun1(dat1,1),2),3)
# Position Count
#1 61 37
#2 18 62
#3 42 65
##When I tried the function on a bigger dataset:
set.seed(185)
datT <- data.frame(Position = sample(10:80,1e5,replace=TRUE),Count= sample(c(NA, 10:100),1e5, replace=TRUE))
dim(datT)
#[1] 100000 2
system.time(res <- fun1(datT,1))
# user system elapsed
# 0.708 0.000 0.709
system.time(res2 <- fun2(datT,1))
# user system elapsed
# 1.400 0.016 1.421
system.time(res3 <- removeNNAs(datT,1))
# user system elapsed
# 1.068 0.000 1.071
all.equal(res,res2)
#[1] TRUE
all.equal(res,res3)
#[1] "Attributes: < Component 2: Numeric: lengths (97786, 97778) differ >"
#[2] "Component 1: Numeric: lengths (97786, 97778) differ"
#[3] "Component 2: Numeric: lengths (97786, 97778) differ"
dim(res)
#[1] 97786 2
dim(res3)
#[1] 97778 2
##Here your function seems to give the correct number of rows as:
rl <- rle(is.na(datT[,"Count"]))
indx <- which(is.na(datT[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==1]
dim(datT)[1]- 2*length(indx)
#[1] 97778
#Here is where I think the difference occur (in addition to the one with the values)
datS <- datT[16000:20000,]
row.names(datS) <- 1:nrow(datS)
resT <- fun1(datS,1)
resT3 <- removeNNAs(datS,1)
datS[3402:3408,]
Position Count
3402 72 70
3403 38 51
3404 80 NA
3405 26 44
3406 42 NA
3407 78 77
3408 70 89
resT3[3311:3318,]
Position Count
3401 54 65
3402 72 70
3407 78 172######
3408 70 89
3409 27 40
3410 44 44
3411 73 75
3412 73 76
resT[3311:3318,]
Position Count
3311 29 98
3312 54 65
3313 72 70
3314 38 95####
3315 78 121 ###
3316 70 89
3317 27 40
3318 44 44
In these conditions, the post is not very clear about dealing it.
A.K.
On Sunday, October 20, 2013 9:36 PM, arun <smartpink111 at yahoo.com> wrote:
Hi Jeff,
I found some difference in results between your function and mine. It also point out a mistake in my code. In the original post, it says:
"""""""""""
I need to write a loop to march down the rows, and if there are 2 rows in
"Count"
where there is only 1 NA row between them, sum the two values up and
print only one row with the summed Count value and the Position value that corresponds to the larger Count value, thus making the three
rows into one.
"""""""""
Sorry, I read it incorrectly the last time and selected the maximum "Position" value instead of that corresponds to the larger Count value.
After correcting the function, there is still some difference between the results.
##fun1() and fun2() corrected
fun1 <- function(dat,n) {
rl <- rle(is.na(dat[,"Count"]))
indx <- which(is.na(dat[,"Count"]))[rep(rl$lengths[rl$values],rl$lengths[rl$values])==n]
lst1 <- lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) {
x1 <- dat[c(min(x)-1L,x,max(x)+1L),]
x2 <- x1[!is.na(x1$Count),]
datN <- data.frame(Position=x2$Position[x2$Count %in% max(x2$Count)],Count=sum(x2$Count))
rowN <- row.names(x2)[x2$Count %in% max(x2$Count)]
row.names(datN) <- if(length(rowN)>1) rowN[1] else rowN
datN
})
names(lst1) <- NULL
lst1 <- lst1[!duplicated(sapply(lst1,row.names))] ######added
dat2 <- do.call(rbind,lst1)
indx2 <- sort(unlist(lapply(split(indx,((seq_along(indx)-1)%/%n)+1),function(x) c(min(x)-1L,x,c(max(x)+1L))),use.names=FALSE))
dat1New <- dat[-indx2[!indx2 %in% row.names(dat2)],]
dat1New[match(row.names(dat2),row.names(dat1New)),] <- dat2
row.names(dat1New) <- 1:nrow(dat1New)
dat1New
}
##################################
fun2 <- function(dat,n){
indx <- cumsum(c(1,abs(diff(is.na(dat[,"Count"])))))
indx1 <- indx[is.na(dat[,"Count"])]
names(indx1) <- which(is.na(dat[,"Count"]))
indx2 <- indx1[indx1 %in% names(table(indx1))[table(indx1)==n]]
lst1 <- tapply(seq_along(indx2),list(indx2),FUN=function(i) {
x1 <- indx2[i]
x2 <- as.numeric(names(x1))
x3 <- dat[c(min(x2)-1L,x2,max(x2)+1L),]
x4 <- subset(x3, !is.na(Count))
x5 <- data.frame(Position=x4$Position[x4$Count %in% max(x4$Count)],Count=sum(x4$Count))
ind <- x4$Count %in% max(x4$Count)
row.names(x5) <- if(sum(ind)>1) row.names(x4)[ind][1] else row.names(x4)[ind]
x5
})
attr(lst1,"dimnames") <- NULL
dat2 <- do.call(rbind,lst1)
indx3 <- sort(unlist(tapply(seq_along(indx2),list(indx2),FUN=function(i) {x1 <- indx2[i]
x2 <- as.numeric(names(x1))
c(min(x2)-1L, x2, max(x2)+1L)}),use.names=FALSE))
dat$id <- 1:nrow(dat)
dat2$id <- as.numeric(row.names(dat2))
library(plyr)
res <- join(dat,dat2[,-1],by="id",type="left")
res1 <- res[!((row.names(res) %in% indx3) & is.na(res[,4])),]
res1[,2][!is.na(res1[,4])] <- res1[,4][!is.na(res1[,4])]
res2 <- res1[,1:2]
row.names(res2) <- 1:nrow(res2)
res2
}
dat1 <- structure(list(Position = c(15L, 22L, 38L, 49L, 55L, 61L, 62L,
14L, 29L, 63L, 46L, 22L, 18L, 24L, 22L, 49L, 42L, 38L, 29L, 22L,
29L, 23L, 42L), Count = c(15L, NA, NA, 5L, NA, 17L, 18L, NA,
NA, NA, 8L, NA, 20L, NA, NA, 16L, 19L, NA, NA, NA, 13L, NA, 33L
)), .Names = c("Position", "Count"), class = "data.frame", row.names = c(NA,
-23L))
fun1(dat1,1)
Position Count
1 15 15
2 22 NA
3 38 NA
4 61 22
5 62 18
6 14 NA
7 29 NA
8 63 NA
9 18 28 ###
10 24 NA
11 22 NA
12 49 16 ####
13 42 19
14 38 NA
15 29 NA
16 22 NA
17 42 46
removeNNAs(dat1,1) #gets similar results
#but,
fun1(fun1(dat1,1),2)
Position Count
1 61 37
2 62 18
3 14 NA
4 29 NA
5 63 NA
6 18 44 #######different
7 42 19
8 38 NA
9 29 NA
10 22 NA
11 42 46
removeNNAs(dat1,2,lessOrEqual=TRUE)
Position Count
6 61 37
7 62 18
8 14 NA
9 29 NA
10 63 NA
16 49 44 ###### different
17 42 19
18 38 NA
19 29 NA
20 22 NA
23 42 46
>
removeNNAs(dat1,3,lessOrEqual=TRUE)
Position Count
6 61 37
16 49 62
23 42 65
fun1(fun1(fun1(dat1,1),2),3)
Position Count
1 61 37
2 18 62
3 42 65
A.K.
On Sunday, October 20, 2013 7:49 PM, Jeff Newmiller <jdnewmil at dcn.davis.ca.us> wrote:
Looks like a right parenthesis was dropped. Corrected:
removeNNAs <- function( dat, N, lessOrEqual=FALSE ) {
N1 <- N+1
rx <- rle( !is.na( dat$Count ) )
# indexes of the ends of each run of NAs or non-NAs
cs <- cumsum( rx$lengths )
# indexes of the ends of runs of NAs or non-NAs
cs2 <- cs[ !rx$values ]
# If the first Count is NA, then drop first run of NAs
if ( !rx$values[1] ) {
cs2 <- cs2[ -1 ]
}
# If the last Count is NA, then drop last run of NAs
if ( !rx$values[ length( rx$values ) ] ) {
cs2 <- cs2[ -length( cs2 ) ]
}
# cs2 is indexes of rows to potentially receive deleted Counts
# after collapse
cs2 <- cs2 + 1
# cs1 is indexes of non-NA Counts to be deleted
cs1 <- cs[ rx$values ][ seq.int( length( cs2 ) ) ]
# identify the indexes of the Count values before the strings
# of NAs that meet the criteria
if ( lessOrEqual ) {
idx0 <- N1 >= ( cs2 - cs1 )
} else {
idx0 <- N1 == ( cs2 - cs1 )
}
idx1 <- cs1[ idx0 ]
# identify the indexes of the Count values after the strings of
# NAs that meet the criteria
idx2 <- cs2[ idx0 ]
# Identify which indexes are both sources and destinations
idx1c <-c( idx2[ -length( idx2 ) ] == idx1[ -1 ], FALSE )
# identify groups of indexes that need to be merged
idx1g <- rev( cumsum( rev( !idx1c ) ) )
# find which elements of idx1 represent the beginning of a
# sequence of indexes to be replaced (meta-indexes)
srcmidxs <- which( -1 == diff( c( idx1g[ 1 ] + 1, idx1g ) ) )
# find which elements of idx2 represent the end of a sequence
# to be replaced (meta-indexes)
destmidxs <- which( 1 == rev( diff( rev( c( idx1g, 0 ) ) ) ) )
# add counts from before NAs to destination rows
result <- dat
srcidxList <- vector( mode="list", length=length( destmidxs ) )
for ( i in seq.int( length( destmidxs ) ) ) {
# row to which data will be copied
destidx <- idx2[ destmidxs[ i ] ]
# sequence of indexes of source rows
srcidxss <- seq.int( from=idx1[ srcmidxs[ i ] ], to=destidx - 1 )
result[ destidx, "Count" ] <- ( dat[ destidx, "Count" ]
+ sum( dat[ srcidxss, "Count" ], na.rm=TRUE ) )
# keep a list of indexes to be removed
srcidxList[ i ] <- list( srcidxss )
}
# remove source rows
result <- result[ -unlist( srcidxList ), ]
result
}
More information about the R-help
mailing list