[R] dataframe to a timeseries object

Mon Mar 14 12:48:01 CET 2011

Well, I'd start by removing all explicit use of environments, which
makes your code very hard to follow.

Hadley

On Monday, March 14, 2011, Daniele Amberti <daniele.amberti at ors.it> wrote:
> I found that plyr::daply is more efficient than base::by (am I doing something wrong?); below is updated code for comparison (I also fixed a couple of things).
> The daply function from the plyr package also has a .parallel argument, and I wonder whether creating timeseries objects in parallel and then combining them would be faster (Windows XP platform); does anyone have experience with this topic? I found only very simple examples of plyr with parallel computation, and I do not have a working example for this kind of implementation (a daply call that returns a list of timeseries objects).
>
> Thanks in advance,
> Daniele Amberti
>
>
> set.seed(123)
>
> N <- 10000
> X <- data.frame(
>   ID = c(rep(1,N), rep(2,N,), rep(3,N), rep(4,N)),
>   DATE = as.character(rep(as.POSIXct("2000-01-01", tz = "GMT")+ 0:(N-1), 4)),
>   VALUE = runif(N*4), stringsAsFactors = FALSE)
> X <- X[sample(1:(N*4), N*4),]
> str(X)
>
> library(timeSeries)
> buildTimeSeriesFromDataFrame <- function(x, env)
> {
>   {
>     if(exists("xx", envir = env))
>       assign("xx",
>         cbind(get("xx", env), timeSeries(x$VALUE, x$DATE,
>           format = '%Y-%m-%d %H:%M:%S',
>           zone = 'GMT', units = as.character(x$ID[1]))),
>         envir = env)
>     else
>       assign("xx",
>         timeSeries(x$VALUE, x$DATE, format = '%Y-%m-%d %H:%M:%S',
>           zone = 'GMT', units = as.character(x$ID[1])),
>         envir = env)
>
>     return(TRUE)
>   }
> }
>
> tsBy <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- by(X, X$ID, buildTimeSeriesFromDataFrame,
>       env = e1, simplify = TRUE)
>   return(get("xx", e1))
> }
>
> Time01 <- replicate(100,
>   system.time(tsBy(X, X$ID, simplify = TRUE))[[1]])
> median(Time01)
> hist(Time01)
> ATS <- tsBy(X, X$ID, simplify = TRUE)
>
>
> library(xts)
> buildXtsFromDataFrame <- function(x, env)
> {
>   {
>     if(exists("xx", envir = env))
>       assign("xx",
>         cbind(get("xx", env), xts(x$VALUE,
>           as.POSIXct(x$DATE, tz = "GMT",
>             format = '%Y-%m-%d %H:%M:%S'),
>           tzone = 'GMT')),
>         envir = env)
>     else
>       assign("xx",
>         xts(x$VALUE, as.POSIXct(x$DATE, tz = "GMT",
>             format = '%Y-%m-%d %H:%M:%S'),
>           tzone = 'GMT'),
>         envir = env)
>
>     return(TRUE)
>   }
> }
>
> xtsBy <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- by(X, X$ID, buildXtsFromDataFrame,
>       env = e1, simplify = TRUE)
>   return(get("xx", e1))
> }
>
> Time02 <- replicate(100,
>   system.time(xtsBy(X, X$ID,simplify = TRUE))[[1]])
> median(Time02)
> hist(Time02)
> AXTS <- xtsBy(X, X$ID, simplify = TRUE)
>
> plot(density(Time02), col = "red",
>   xlim = c(min(c(Time02, Time01)), max(c(Time02, Time01))))
> lines(density(Time01), col = "blue")
> # check equality; there is still a problem with names
> AXTS2 <- as.timeSeries(AXTS)
> names(AXTS2) <- names(ATS)
> identical(getDataPart(ATS), getDataPart(AXTS2))
> identical(time(ATS), time(AXTS2))
>
> # with plyr library and daply instead of by:
> library(plyr)
>
> tsDaply <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- daply(X, "ID", buildTimeSeriesFromDataFrame,
>       env = e1)
>   return(get("xx", e1))
> }
>
> Time03 <- replicate(100,
>   system.time(tsDaply(X, X$ID))[[1]])
> median(Time03)
> hist(Time03)
>
> xtsDaply <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- daply(X, "ID", buildXtsFromDataFrame,
>       env = e1)
>   return(get("xx", e1))
> }
>
> Time04 <- replicate(100,
>   system.time(xtsDaply(X, X$ID))[[1]])
>
> median(Time04)
> hist(Time04)
>
> plot(density(Time04), col = "red",
>   xlim = c(
>     min(c(Time02, Time01, Time03, Time04)),
>     max(c(Time02, Time01, Time03, Time04))),
>   ylim = c(0,100))
> lines(density(Time03), col = "blue")
> lines(density(Time02))
> lines(density(Time01))
>
>
>
>
>
> -----Original Message-----
> From: Daniele Amberti
> Sent: 11 March 2011 14:44
> To: r-help at r-project.org
> Subject: dataframe to a timeseries object
>
> I’m wondering which is the most efficient way (in time first, then memory usage) to obtain a multivariate time series object from a data frame (the easiest data structure for getting data from a database through RODBC).
> I have a starting point using the timeSeries or xts libraries (these libraries can handle time zones); below you can find code to test.
> Parallelizing the merge (cbind) is something I’m thinking about (suggestions from users with experience on this topic are highly appreciated); any suggestion is welcome.
> My platform is Windows XP, R 2.12.1, latest available packages on CRAN for timeSeries and xts.
>
>
> set.seed(123)
>
> N <- 9000
> X <- data.frame(
>   ID = c(rep(1,N), rep(2,N,), rep(3,N), rep(4,N)),
>   DATE = rep(as.POSIXct("2000-01-01", tz = "GMT")+ 0:(N-1), 4),
>   VALUE = runif(N*4))
>
> library(timeSeries)
> buildTimeSeriesFromDataFrame <- function(x, env)
> {
>   {
>     if(exists("xx", envir = env))
>       assign("xx",
>         cbind(get("xx", env), timeSeries(x$VALUE, x$DATE, format = '%Y-%m-%d %H:%M:%S',
>           zone = 'GMT', units = as.character(x$ID[1]))),
>         envir = env)
>     else
>       assign("xx",
>         timeSeries(x$VALUE, x$DATE, format = '%Y-%m-%d %H:%M:%S',
>           zone = 'GMT', units = as.character(x$ID[1])),
>         envir = env)
>
>     return(TRUE)
>   }
> }
>
>
> fooBy <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- by(X, X$ID, buildTimeSeriesFromDataFrame,
>       env = e1, simplify = TRUE)
>   return(get("xx", e1))
> }
>
> Time01 <- replicate(100,
>   system.time(fooBy(X,
>     X$ID, buildTimeSeriesFromDataFrame,
>     simplify = TRUE))[[1]])
>
> median(Time01)
> hist(Time01)
>
> library(xts)
>
> buildXtsFromDataFrame <- function(x, env)
> {
>   {
>     if(exists("xx", envir = env))
>       assign("xx",
>         cbind(get("xx", env), xts(x$VALUE,
>           as.POSIXct(x$DATE, format = '%Y-%m-%d %H:%M:%S'),
>           tzone = 'GMT')),
>         envir = env)
>     else
>       assign("xx",
>         xts(x$VALUE, as.POSIXct(x$DATE, format = '%Y-%m-%d %H:%M:%S'),
>           tzone = 'GMT'),
>         envir = env)
>
>     return(TRUE)
>   }
> }
>
> fooBy <- function(...)
> {
>   e1 <- new.env(parent = baseenv())
>   res <- by(X, X$ID, buildXtsFromDataFrame,
>       env = e1, simplify = TRUE)
>   return(get("xx", e1))
> }
>
> Time02 <- replicate(100,
>   system.time(fooBy(X,
>     X$ID, buildTimeSeriesFromDataFrame,
>     simplify = TRUE))[[1]])
>
> median(Time02)
> hist(Time02)
>
> plot(density(Time02), xlim = c(min(c(Time02, Time01)), max(c(Time02, Time01))))
> lines(density(Time01))
>
>
> Best regards,
> Daniele Amberti
>
> ORS Srl
>
> Via Agostino Morando 1/3 12060 Roddi (Cn) - Italy
> Tel. +39 0173 620211
> Fax. +39 0173 620299 / +39 0173 433111
> Web Site www.ors.it
>
> ------------------------------------------------------------------------------------------------------------------------
> Qualsiasi utilizzo non autorizzato del presente messaggio e dei suoi allegati è vietato e potrebbe costituire reato.
> Se lei avesse ricevuto erroneamente questo messaggio, Le saremmo grati se provvedesse alla distruzione dello stesso
> e degli eventuali allegati.
> Opinioni, conclusioni o altre informazioni riportate nella e-mail, che non siano relative alle attività e/o
> alla missione aziendale di O.R.S. Srl si intendono non  attribuibili alla società stessa, né la impegnano in alcun modo.
> ______________________________________________
> R-help at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
>

-- 
Assistant Professor / Dobelman Family Junior Chair
Department of Statistics / Rice University
http://had.co.nz/