[R] How to extract last value in each group
arun
smartpink111 at yahoo.com
Thu Aug 15 21:40:30 CEST 2013
Speed comparison:
dat1<-structure(list(Date = c("06/01/2010", "06/01/2010", "06/01/2010",
"06/01/2010", "06/02/2010", "06/02/2010", "06/02/2010", "06/02/2010",
"06/02/2010", "06/02/2010", "06/02/2010"), Time = c(1358L, 1359L,
1400L, 1700L, 331L, 332L, 334L, 335L, 336L, 337L, 338L), O = c(136.4,
136.4, 136.45, 136.55, 136.55, 136.7, 136.75, 136.8, 136.8, 136.75,
136.8), H = c(136.4, 136.5, 136.55, 136.55, 136.7, 136.7, 136.75,
136.8, 136.8, 136.8, 136.8), L = c(136.35, 136.35, 136.35, 136.55,
136.5, 136.65, 136.75, 136.8, 136.8, 136.75, 136.8), C = c(136.35,
136.5, 136.4, 136.55, 136.7, 136.65, 136.75, 136.8, 136.8, 136.8,
136.8), U = c(2L, 9L, 8L, 1L, 36L, 3L, 1L, 4L, 8L, 1L, 3L), D = c(12L,
6L, 7L, 0L, 6L, 1L, 0L, 0L, 0L, 2L, 0L)), .Names = c("Date",
"Time", "O", "H", "L", "C", "U", "D"), class = "data.frame", row.names = c(NA,
-11L))
indx<- rep(1:nrow(dat1),1e5)
dat2<- dat1[indx,]
dat2[-c(1:11),1]<-format(rep(seq(as.Date("1080-01-01"),by=1,length.out=99999),each=11),"%m/%d/%Y")
dat2<- dat2[order(dat2[,1],dat2[,2]),]
row.names(dat2)<-1:nrow(dat2)
library(data.table)
library(plyr)
## Functions
isLastInRun <- function(x) c(x[-1] != x[-length(x)], TRUE)
f3 <- function(dataFrame) {
dataFrame[ isLastInRun(dataFrame$Date), ]
}
f1 <- function (dataFrame) {
dataFrame[unlist(with(dataFrame, tapply(Time, list(Date), FUN = function(x) x == max(x)))), ]
}
f2 <- function (dataFrame) {
dataFrame[cumsum(with(dataFrame, tapply(Time, list(Date), FUN = which.max))), ]
}
f4<- function(dataFrame){
dataFrame[as.logical(with(dataFrame,ave(Time,Date,FUN=function(x) x==max(x)))),]
}
#Comparison
system.time(res1<-dat2[c(diff(as.numeric(as.factor(dat2$Date))),1)>0,])
# user system elapsed
# 0.500 0.000 0.501
system.time(res2<-f3(dat2))
# user system elapsed
# 0.316 0.000 0.318
identical(res1,res2)
#[1] TRUE
system.time(res3<-f1(dat2))
#user system elapsed
# 2.272 0.000 2.278
system.time(res4<-f2(dat2))
# user system elapsed
# 0.932 0.000 0.935
identical(res1,res3)
#[1] TRUE
identical(res1,res4)
#[1] TRUE
system.time(res5<-aggregate(dat2[-1], dat2[1], tail, 1))
# user system elapsed
# 26.784 0.008 26.840
row.names(res5)<- row.names(res1)
attr(res5,"row.names")<- attr(res1,"row.names")
identical(res5,res1)
#[1] TRUE
system.time(res6<- dat2[ tapply(rownames(dat2), dat2$Date, tail, 1) , ] )
# user system elapsed
#392.124 0.008 392.880
identical(res1,res6)
#[1] TRUE
system.time(res7<- dat2[cumsum(rle(dat2[,1])$lengths),]) #shortest time
# user system elapsed
# 0.152 0.000 0.153
identical(res1,res7)
#[1] TRUE
system.time(res8<-ddply(dat2, .(Date), function(df) df[which.max(df$Time),]))
# user system elapsed
#195.580 1.988 197.995
row.names(res8)<- row.names(res1)
attr(res8,"row.names")<- attr(res1,"row.names")
identical(res1,res8)
#[1] TRUE
system.time(res9<- f4(dat2))
# user system elapsed
# 0.764 0.000 0.767
identical(res1,res9)
#[1] TRUE
system.time({
dt1 <- data.table(dat2, key=c('Date', 'Time'))
ans <- dt1[, .SD[.N], by='Date']})
# user system elapsed
# 37.384 0.000 37.454
#separate the data.table creation step:
dt1 <- data.table(dat2, key=c('Date', 'Time'))
system.time(ans <- dt1[, .SD[.N], by='Date'])
# user system elapsed
# 38.500 0.000 38.566
ans1<- as.data.frame(ans)
row.names(ans1)<- row.names(res1)
attr(ans1,"row.names")<- attr(res1,"row.names")
identical(ans1,res1)
#[1] TRUE
A.K.
----- Original Message -----
From: Steve Lianoglou <lianoglou.steve at gene.com>
To: William Dunlap <wdunlap at tibco.com>
Cc: arun <smartpink111 at yahoo.com>; Noah Silverman <noahsilverman at ucla.edu>; R help <r-help at r-project.org>
Sent: Wednesday, August 14, 2013 5:22 PM
Subject: Re: [R] How to extract last value in each group
Or with plyr:
R> library(plyr)
R> ans <- ddply(x, .(Date), function(df) df[which.max(df$Time),])
-steve
On Wed, Aug 14, 2013 at 2:18 PM, Steve Lianoglou
<lianoglou.steve at gene.com> wrote:
> While we're playing code golf, likely faster still could be to use
> data.table. Assume your data is in a data.frame named "x":
>
> R> library(data.table)
> R> x <- data.table(x, key=c('Date', 'Time'))
> R> ans <- x[, .SD[.N], by='Date']
>
> -steve
>
> On Wed, Aug 14, 2013 at 2:01 PM, William Dunlap <wdunlap at tibco.com> wrote:
>> A somewhat faster version (for datasets with lots of dates, assuming it is sorted by date and time) is
>> isLastInRun <- function(x) c(x[-1] != x[-length(x)], TRUE)
>> f3 <- function(dataFrame) {
>> dataFrame[ isLastInRun(dataFrame$Date), ]
>> }
>> where your two suggestions, as functions, are
>> f1 <- function (dataFrame) {
>> dataFrame[unlist(with(dataFrame, tapply(Time, list(Date), FUN = function(x) x == max(x)))), ]
>> }
>> f2 <- function (dataFrame) {
>> dataFrame[cumsum(with(dataFrame, tapply(Time, list(Date), FUN = which.max))), ]
>> }
>>
>> Bill Dunlap
>> Spotfire, TIBCO Software
>> wdunlap tibco.com
>>
>>
>>> -----Original Message-----
>>> From: r-help-bounces at r-project.org [mailto:r-help-bounces at r-project.org] On Behalf
>>> Of arun
>>> Sent: Wednesday, August 14, 2013 1:08 PM
>>> To: Noah Silverman
>>> Cc: R help
>>> Subject: Re: [R] How to extract last value in each group
>>>
>>> Hi,
>>> Try:
>>> dat1<- read.table(text="
>>> Date Time O H L C U D
>>> 06/01/2010 1358 136.40 136.40 136.35 136.35 2 12
>>> 06/01/2010 1359 136.40 136.50 136.35 136.50 9 6
>>> 06/01/2010 1400 136.45 136.55 136.35 136.40 8 7
>>> 06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> 06/02/2010 331 136.55 136.70 136.50 136.70 36 6
>>> 06/02/2010 332 136.70 136.70 136.65 136.65 3 1
>>> 06/02/2010 334 136.75 136.75 136.75 136.75 1 0
>>> 06/02/2010 335 136.80 136.80 136.80 136.80 4 0
>>> 06/02/2010 336 136.80 136.80 136.80 136.80 8 0
>>> 06/02/2010 337 136.75 136.80 136.75 136.80 1 2
>>> 06/02/2010 338 136.80 136.80 136.80 136.80 3 0
>>> ",sep="",header=TRUE,stringsAsFactors=FALSE)
>>>
>>> dat1[unlist(with(dat1,tapply(Time,list(Date),FUN=function(x) x==max(x)))),]
>>> # Date Time O H L C U D
>>> #4 06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> #11 06/02/2010 338 136.80 136.80 136.80 136.80 3 0
>>> #or
>>> dat1[cumsum(with(dat1,tapply(Time,list(Date),FUN=which.max))),]
>>> Date Time O H L C U D
>>> 4 06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> 11 06/02/2010 338 136.80 136.80 136.80 136.80 3 0
>>>
>>> #or
>>> dat1[as.logical(with(dat1,ave(Time,Date,FUN=function(x) x==max(x)))),]
>>> # Date Time O H L C U D
>>> #4 06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> #11 06/02/2010 338 136.80 136.80 136.80 136.80 3 0
>>> A.K.
>>>
>>>
>>>
>>>
>>> ----- Original Message -----
>>> From: Noah Silverman <noahsilverman at ucla.edu>
>>> To: "R-help at r-project.org" <r-help at r-project.org>
>>> Cc:
>>> Sent: Wednesday, August 14, 2013 3:56 PM
>>> Subject: [R] How to extract last value in each group
>>>
>>> Hello,
>>>
>>> I have some stock pricing data for one minute intervals.
>>>
>>> The delivery format is a bit odd. The date column is easily parsed and used as an index
>>> for an its object. However, the time column is just an integer (1:1807)
>>>
>>> I just need to extract the *last* entry for each day. Don't actually care what time it was,
>>> as long as it was the last one.
>>>
>>> Sure, writing a big nasty loop would work, but I was hoping that someone would be able
>>> to suggest a faster way.
>>>
>>> Small snippet of data below my sig.
>>>
>>> Thanks!
>>>
>>>
>>> --
>>> Noah Silverman, M.S., C.Phil
>>> UCLA Department of Statistics
>>> 8117 Math Sciences Building
>>> Los Angeles, CA 90095
>>>
>>> --------------------------------------------------------------------------
>>>
>>> Date Time O H L C U D
>>> 06/01/2010 1358 136.40 136.40 136.35 136.35 2 12
>>> 06/01/2010 1359 136.40 136.50 136.35 136.50 9 6
>>> 06/01/2010 1400 136.45 136.55 136.35 136.40 8 7
>>> 06/01/2010 1700 136.55 136.55 136.55 136.55 1 0
>>> 06/02/2010 331 136.55 136.70 136.50 136.70 36 6
>>> 06/02/2010 332 136.70 136.70 136.65 136.65 3 1
>>> 06/02/2010 334 136.75 136.75 136.75 136.75 1 0
>>> 06/02/2010 335 136.80 136.80 136.80 136.80 4 0
>>> 06/02/2010 336 136.80 136.80 136.80 136.80 8 0
>>> 06/02/2010 337 136.75 136.80 136.75 136.80 1 2
>>> 06/02/2010 338 136.80 136.80 136.80 136.80 3 0
>>> ______________________________________________
>>> R-help at r-project.org mailing list
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>>
>>>
>>> ______________________________________________
>>> R-help at r-project.org mailing list
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>
>> ______________________________________________
>> R-help at r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>
>
>
> --
> Steve Lianoglou
> Computational Biologist
> Bioinformatics and Computational Biology
> Genentech
--
Steve Lianoglou
Computational Biologist
Bioinformatics and Computational Biology
Genentech
More information about the R-help
mailing list