[R-SIG-Finance] efficient extraction of local extrema and zero-crossings in large multivariate zoo?

Gabor Grothendieck ggrothendieck at gmail.com
Fri Jun 26 14:46:32 CEST 2009


Try this. It runs about 10x faster.  (Note that the na.rm = FALSE
argument has been added to the na.approx line in find.extrema
to correspond to your description.)


find.extrema = function(myzoo) {
 days = as.POSIXct(levels(as.factor(as.Date(index(myzoo)))))
 ret = myzoo
 ret[] <- NA
 for (tcol in 1:dim(myzoo)[2]) {
   for (day in days) {
     this <- window(myzoo[,tcol], start=day, end=day+24*60*60-1)
     thismax <- this[which.max(this)]
     this[this<0] <- 0 ## remove negative values first
     thiszero <- this[which.min(this)]
     ret[index(ret) == index(thismax), tcol] <- coredata(thismax)
     ret[index(ret) == index(thiszero), tcol] <- coredata(thiszero)
     ### ret[index(thismax), tcol] <- coredata(thismax)  ### gives an error
   } # end days
 } # end tcol
 ret = na.approx(ret, na.rm = FALSE) #### added na.rm=
 return(ret)
}

hours=1e4 # about 2 year's worth of hours
tmp=(as.POSIXct('2001-01-01')+1:hours*60*60)
tmpz=zoo(cbind(a=sin(as.integer(tmp)/1e4), b=sin(as.integer(tmp)/1.1e4)), tmp)
system.time(tmpextrema <- find.extrema(tmpz))
#   25.06    0.05   25.37

f <- function(x) {
	ret <- NA * x
	ix <- ave(x, as.Date(time(tmpz)), FUN = function(x) {
		logic <- rep(FALSE, length(x))
		logic[which.max(x)] <- TRUE
		logic
	})
	ix <- as.logical(ix)
	ret[ix] <- x[ix]
	a <- x
	a[a < 0] <- 0
	ix <- ave(a, as.Date(time(tmpz)), FUN = function(x) {
		logic <- rep(FALSE, length(x))
		logic[which.min(x)] <- TRUE
		logic
	})
	ix <- as.logical(ix)
	ret[ix] <- a[ix]
	na.approx(ret, na.rm = FALSE)
}
system.time(out <- do.call(cbind, lapply(tmpz, f)))
#  2.39    0.02    2.48

all.equal(tmpextrema, out, check.attributes = FALSE)
# TRUE


> find.extrema = function(myzoo) {
+  days = as.POSIXct(levels(as.factor(as.Date(index(myzoo)))))
+  ret = myzoo
+  ret[] <- NA
+  for (tcol in 1:dim(myzoo)[2]) {
+    for (day in days) {
+      this <- window(myzoo[,tcol], start=day, end=day+24*60*60-1)
+      thismax <- this[which.max(this)]
+      this[this<0] <- 0 ## remove negative values first
+      thiszero <- this[which.min(this)]
+      ret[index(ret) == index(thismax), tcol] <- coredata(thismax)
+      ret[index(ret) == index(thiszero), tcol] <- coredata(thiszero)
+      ### ret[index(thismax), tcol] <- coredata(thismax)  ### gives an error
+    } # end days
+  } # end tcol
+  ret = na.approx(ret, na.rm = FALSE)
+  return(ret)
+ }
>
> hours=1e4 # about 2 year's worth of hours
> # hours = 27
> tmp=(as.POSIXct('2001-01-01')+1:hours*60*60)
> tmpz=zoo(cbind(a=sin(as.integer(tmp)/1e4), b=sin(as.integer(tmp)/1.1e4)), tmp)
> system.time(tmpextrema <- find.extrema(tmpz))
   user  system elapsed
  25.06    0.05   25.37
>
> f <- function(x) {
+ ret <- NA * x
+ ix <- ave(x, as.Date(time(tmpz)), FUN = function(x) {
+ logic <- rep(FALSE, length(x))
+ logic[which.max(x)] <- TRUE
+ logic
+ })
+ ix <- as.logical(ix)
+ ret[ix] <- x[ix]
+ a <- x
+ a[a < 0] <- 0
+ ix <- ave(a, as.Date(time(tmpz)), FUN = function(x) {
+ logic <- rep(FALSE, length(x))
+ logic[which.min(x)] <- TRUE
+ logic
+ })
+ ix <- as.logical(ix)
+ ret[ix] <- a[ix]
+ na.approx(ret, na.rm = FALSE)
+ }
> system.time(out <- do.call(cbind, lapply(tmpz, f)))
   user  system elapsed
   2.39    0.02    2.48
>
> all.equal(tmpextrema, out, check.attributes = FALSE)
[1] TRUE


On Thu, Jun 25, 2009 at 6:37 AM, Christian Gunning<icos.atropa at gmail.com> wrote:
> I have an multi-year hour-sampled multivariate zoo from which i'm
> trying to extract index() and coredata() of daily max and
> zero-crossing for each variable and use na.approx() to fill in
> intervening values, so that the resulting zoo has the same dimensions
> of the input zoo. Currently i'm looping over each column and day,
> extracting a day's worth of data with window(), and using which.max()
> to extract the "record".  For short zoos it works fine.  It doesn't
> scale well, though - for example, 2e4 points takes ~4-6 times longer
> than 1e4 points with 2 variables. Am i missing major bottlenecks or
> vectorization potentials here?
>
> find.extrema = function(myzoo) {
>  days = as.POSIXct(levels(as.factor(as.Date(index(myzoo)))))
>  ret = myzoo
>  ret[] <- NA
>  for (tcol in 1:dim(myzoo)[2]) {
>    for (day in days) {
>      this <- window(myzoo[,tcol], start=day, end=day+24*60*60-1)
>      thismax <- this[which.max(this)]
>      this[this<0] <- 0 ## remove negative values first
>      thiszero <- this[which.min(this)]
>      ret[index(ret) == index(thismax), tcol] <- coredata(thismax)
>      ret[index(ret) == index(thiszero), tcol] <- coredata(thiszero)
>      ### ret[index(thismax), tcol] <- coredata(thismax)  ### gives an error
>    } # end days
>  } # end tcol
>  ret = na.approx(ret)
>  return(ret)
> }
>
> hours=1e4 # about 2 year's worth of hours
> tmp=(as.POSIXct('2001-01-01')+1:hours*60*60)
> tmpz=zoo(cbind(a=sin(as.integer(tmp)/1e4), b=sin(as.integer(tmp)/1.1e4)), tmp)
> system.time(tmpextrema <- find.extrema(tmpz))
>
> thanks,
> christian gunning
> university of new mexico
>
> _______________________________________________
> R-SIG-Finance at stat.math.ethz.ch mailing list
> https://stat.ethz.ch/mailman/listinfo/r-sig-finance
> -- Subscriber-posting only.
> -- If you want to post, subscribe first.
>



More information about the R-SIG-Finance mailing list