[R] dplyr - add/expand rows

Tóth Dénes toth.denes at kogentum.hu
Wed Nov 29 22:15:58 CET 2017


Hi,

A benchmarking study with an additional (data.table-based) solution. 
Enjoy! ;)

Cheers,
Denes


--------------------------


## packages ##########################

library(dplyr)
library(data.table)
library(IRanges)
library(microbenchmark)

## prepare example dataset ###########

## use Bert's example, with 2000 stations instead of 2
d_df <- data.frame( station = rep(rep(c("one","two"),c(5,4)), 1000L),
                     from = as.integer(c(60,61,71,72,76,60,65,82,83)),
                     to = as.integer(c(60,70,71,76,83,64, 81, 82,83)),
                     record = c("A","B","C","B","D","B","B","D","E"),
                     stringsAsFactors = FALSE)
stations <- rle(d_df$station)
stations$value <- gsub(
   " ", "0",
   paste0("station", format(1:length(stations$value), width = 6)))
d_df$station <- rep(stations$value, stations$lengths)

## prepare tibble and data.table versions
d_tbl <- as_tibble(d_df)
d_dt <- as.data.table(d_df)

## solutions ##########################

## Bert - by
fun_bert <- function(d) {
   out <- by(
     d, d$station, function(x) with(x, {
       i <- to - from +1
       data.frame(record =rep(record,i),
                  year =sequence(i) -1 + rep(from,i),
                  stringsAsFactors = FALSE)
     }))
   data.frame(station = rep(names(out), sapply(out,nrow)),
              do.call(rbind,out),
              row.names = NULL,
              stringsAsFactors = FALSE)
}

## Bill - transform
fun_bill <- function(d) {
   i <- rep(seq_len(nrow(d)), d$to-d$from+1)
   j <- sequence(d$to-d$from+1)
   transform(d[i,], year=from+j-1, from=NULL, to=NULL)
}

## Michael - IRanges
fun_michael <- function(d) {
   df <- with(d, DataFrame(station, record, year=IRanges(from, to)))
   expand(df, "year")
}

## Jim - dplyr
fun_jim <- function(d) {
   d %>%
     rowwise() %>%
     do(tibble(station = .$station,
               record = .$record,
               year = seq(.$from, .$to))
     )
}

## Martin - Map
fun_martin <- function(d) {
   d$year <- with(d, Map(seq, from, to))
   res0 <- with(d, Map(data.frame,
                       station=station,
                       record=record,
                       year=year,
                       MoreArgs = list(stringsAsFactors = FALSE)))
   do.call(rbind, unname(res0))
}

## Denes - simple data.table
fun_denes <- function(d) {
   out <- d[, .(year = from:to), by = .(station, from, record)]
   out[, from := NULL]
}

## Check equality ################################
all.equal(fun_bill(d_df), fun_bert(d_df),
           check.attributes = FALSE)
all.equal(fun_bill(d_df), fun_martin(d_df),
           check.attributes = FALSE)
all.equal(fun_bill(d_df), as.data.frame(fun_michael(d_df)),
           check.attributes = FALSE)
all.equal(fun_bill(d_df), as.data.frame(fun_denes(d_dt)),
           check.attributes = FALSE)
# Be prepared: this solution is super slow
all.equal(fun_bill(d_df), as.data.frame(fun_jim(d_tbl)),
           check.attributes = FALSE)

## Benchmark #####################################

## Martin
print(system.time(fun_martin(d_df)))

## Bert
print(system.time(fun_bert(d_df)))

## Top 3
print(
   microbenchmark(
     fun_bill(d_df),
     fun_michael(d_df),
     fun_denes(d_dt),
     times = 100L
   )
)


-------------------------

On 11/28/2017 06:49 PM, Michael Lawrence wrote:
> Or with the Bioconductor IRanges package:
> 
> df <- with(input, DataFrame(station, year=IRanges(from, to), record))
> expand(df, "year")
> 
> DataFrame with 24 rows and 3 columns
>          station     year      record
>      <character> <integer> <character>
> 1       07EA001      1960         QMS
> 2       07EA001      1961         QMC
> 3       07EA001      1962         QMC
> 4       07EA001      1963         QMC
> 5       07EA001      1964         QMC
> ...         ...       ...         ...
> 20      07EA001      1979         QRC
> 21      07EA001      1980         QRC
> 22      07EA001      1981         QRC
> 23      07EA001      1982         QRC
> 24      07EA001      1983         QRC
> 
> If you tell the computer more about your data, it can do more things for
> you.
> 
> Michael
> 
> On Tue, Nov 28, 2017 at 7:34 AM, Martin Morgan <
> martin.morgan at roswellpark.org> wrote:
> 
>> On 11/26/2017 08:42 PM, jim holtman wrote:
>>
>>> try this:
>>>
>>> ##########################################
>>>
>>> library(dplyr)
>>>
>>> input <- tribble(
>>>     ~station, ~from, ~to, ~record,
>>>    "07EA001" ,    1960  ,  1960  , "QMS",
>>>    "07EA001"  ,   1961 ,   1970  , "QMC",
>>>    "07EA001" ,    1971  ,  1971  , "QMM",
>>>    "07EA001" ,    1972  ,  1976  , "QMC",
>>>    "07EA001" ,    1977  ,  1983  , "QRC"
>>> )
>>>
>>> result <- input %>%
>>>     rowwise() %>%
>>>     do(tibble(station = .$station,
>>>               year = seq(.$from, .$to),
>>>               record = .$record)
>>>     )
>>>
>>> ###########################
>>>
>>
>> In a bit more 'base R' mode I did
>>
>>    input$year <- with(input, Map(seq, from, to))
>>    res0 <- with(input, Map(data.frame, station=station, year=year,
>>        record=record))
>>     as_tibble(do.call(rbind, unname(res0)))# A tibble: 24 x 3
>>
>> resulting in
>>
>>> as_tibble(do.call(rbind, unname(res0)))# A tibble: 24 x 3
>>     station  year record
>>      <fctr> <int> <fctr>
>>   1 07EA001  1960    QMS
>>   2 07EA001  1961    QMC
>>   3 07EA001  1962    QMC
>>   4 07EA001  1963    QMC
>>   5 07EA001  1964    QMC
>>   6 07EA001  1965    QMC
>>   7 07EA001  1966    QMC
>>   8 07EA001  1967    QMC
>>   9 07EA001  1968    QMC
>> 10 07EA001  1969    QMC
>> # ... with 14 more rows
>>
>> I though I should have been able to use `tibble` in the second step, but
>> that leads to a (cryptic) error
>>
>>> res0 <- with(input, Map(tibble, station=station, year=year,
>> record=record))Error in captureDots(strict = `__quosured`) :
>>    the argument has already been evaluated
>>
>> The 'station' and 'record' columns are factors, so different from the
>> original input, but this seems the appropriate data type for theses columns.
>>
>> It's interesting to compare the 'specialized' knowledge needed for each
>> approach -- rowwise(), do(), .$ for tidyverse, with(), do.call(), maybe
>> rbind() and Map() for base R.
>>
>> Martin
>>
>>
>>
>>>
>>>
>>> Jim Holtman
>>> Data Munger Guru
>>>
>>> What is the problem that you are trying to solve?
>>> Tell me what you want to do, not how you want to do it.
>>>
>>> On Sun, Nov 26, 2017 at 2:10 PM, Bert Gunter <bgunter.4567 at gmail.com>
>>> wrote:
>>>
>>> To David W.'s point about lack of a suitable reprex ("reproducible
>>>> example"), Bill's solution seems to be for only one station.
>>>>
>>>> Here is a reprex and modification that I think does what was requested
>>>> for
>>>> multiple stations, again using base R and data frames, not dplyr and
>>>> tibbles.
>>>>
>>>> First the reprex with **two** stations:
>>>>
>>>> d <- data.frame( station = rep(c("one","two"),c(5,4)),
>>>>>
>>>>                  from = c(60,61,71,72,76,60,65,82,83),
>>>>                   to = c(60,70,71,76,83,64, 81, 82,83),
>>>>                   record = c("A","B","C","B","D","B","B","D","E"))
>>>>
>>>> d
>>>>>
>>>>     station from to record
>>>> 1     one   60 60      A
>>>> 2     one   61 70      B
>>>> 3     one   71 71      C
>>>> 4     one   72 76      B
>>>> 5     one   76 83      D
>>>> 6     two   60 64      B
>>>> 7     two   65 81      B
>>>> 8     two   82 82      D
>>>> 9     two   83 83      E
>>>>
>>>> ## Now the conversion code using base R, especially by():
>>>>
>>>> out <- by(d, d$station, function(x) with(x, {
>>>>>
>>>> +    i <- to - from +1
>>>> +    data.frame(YEAR =sequence(i) -1 +rep(from,i), RECORD =rep(record,i))
>>>> + }))
>>>>
>>>>
>>>> out <- data.frame(station =
>>>>>
>>>> rep(names(out),sapply(out,nrow)),do.call(rbind,out), row.names = NULL)
>>>>
>>>>
>>>> out
>>>>>
>>>>      station YEAR RECORD
>>>> 1      one   60      A
>>>> 2      one   61      B
>>>> 3      one   62      B
>>>> 4      one   63      B
>>>> 5      one   64      B
>>>> 6      one   65      B
>>>> 7      one   66      B
>>>> 8      one   67      B
>>>> 9      one   68      B
>>>> 10     one   69      B
>>>> 11     one   70      B
>>>> 12     one   71      C
>>>> 13     one   72      B
>>>> 14     one   73      B
>>>> 15     one   74      B
>>>> 16     one   75      B
>>>> 17     one   76      B
>>>> 18     one   76      D
>>>> 19     one   77      D
>>>> 20     one   78      D
>>>> 21     one   79      D
>>>> 22     one   80      D
>>>> 23     one   81      D
>>>> 24     one   82      D
>>>> 25     one   83      D
>>>> 26     two   60      B
>>>> 27     two   61      B
>>>> 28     two   62      B
>>>> 29     two   63      B
>>>> 30     two   64      B
>>>> 31     two   65      B
>>>> 32     two   66      B
>>>> 33     two   67      B
>>>> 34     two   68      B
>>>> 35     two   69      B
>>>> 36     two   70      B
>>>> 37     two   71      B
>>>> 38     two   72      B
>>>> 39     two   73      B
>>>> 40     two   74      B
>>>> 41     two   75      B
>>>> 42     two   76      B
>>>> 43     two   77      B
>>>> 44     two   78      B
>>>> 45     two   79      B
>>>> 46     two   80      B
>>>> 47     two   81      B
>>>> 48     two   82      D
>>>> 49     two   83      E
>>>>
>>>> Cheers,
>>>> Bert
>>>>
>>>>
>>>>
>>>>
>>>> Bert Gunter
>>>>
>>>> "The trouble with having an open mind is that people keep coming along
>>>> and
>>>> sticking things into it."
>>>> -- Opus (aka Berkeley Breathed in his "Bloom County" comic strip )
>>>>
>>>> On Sat, Nov 25, 2017 at 4:49 PM, William Dunlap via R-help <
>>>> r-help at r-project.org> wrote:
>>>>
>>>> dplyr may have something for this, but in base R I think the following
>>>>>
>>>> does
>>>>
>>>>> what you want.  I've shortened the name of your data set to 'd'.
>>>>>
>>>>> i <- rep(seq_len(nrow(d)), d$YEAR_TO-d$YEAR_FROM+1)
>>>>> j <- sequence(d$YEAR_TO-d$YEAR_FROM+1)
>>>>> transform(d[i,], YEAR=YEAR_FROM+j-1, YEAR_FROM=NULL, YEAR_TO=NULL)
>>>>>
>>>>>
>>>>> Bill Dunlap
>>>>> TIBCO Software
>>>>> wdunlap tibco.com
>>>>>
>>>>> On Sat, Nov 25, 2017 at 11:18 AM, Hutchinson, David (EC) <
>>>>> david.hutchinson at canada.ca> wrote:
>>>>>
>>>>> I have a returned tibble of station operational record similar to the
>>>>>> following:
>>>>>>
>>>>>> data.collection
>>>>>>>
>>>>>> # A tibble: 5 x 4
>>>>>>     STATION_NUMBER YEAR_FROM YEAR_TO RECORD
>>>>>>              <chr>     <int>   <int>  <chr>
>>>>>> 1        07EA001      1960    1960    QMS
>>>>>> 2        07EA001      1961    1970    QMC
>>>>>> 3        07EA001      1971    1971    QMM
>>>>>> 4        07EA001      1972    1976    QMC
>>>>>> 5        07EA001      1977    1983    QRC
>>>>>>
>>>>>> I would like to reshape this to one operational record (row) per year
>>>>>>
>>>>> per
>>>>
>>>>> station. Something like:
>>>>>>
>>>>>> 07EA001              1960      QMS
>>>>>> 07EA001              1961      QMC
>>>>>> 07EA001              1962      QMC
>>>>>> 07EA001              1963      QMC
>>>>>> ...
>>>>>> 07EA001              1971      QMM
>>>>>>
>>>>>> Can this be done in dplyr easily?
>>>>>>
>>>>>> Thanks in advance,
>>>>>>
>>>>>> David
>>>>>>
>>>>>>           [[alternative HTML version deleted]]
>>>>>>
>>>>>> ______________________________________________
>>>>>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>>>>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>>>>> PLEASE do read the posting guide http://www.R-project.org/
>>>>>> posting-guide.html
>>>>>> and provide commented, minimal, self-contained, reproducible code.
>>>>>>
>>>>>>
>>>>>           [[alternative HTML version deleted]]
>>>>>
>>>>> ______________________________________________
>>>>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>>>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>>>> PLEASE do read the posting guide http://www.R-project.org/
>>>>> posting-guide.html
>>>>> and provide commented, minimal, self-contained, reproducible code.
>>>>>
>>>>>
>>>>           [[alternative HTML version deleted]]
>>>>
>>>> ______________________________________________
>>>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>>> PLEASE do read the posting guide http://www.R-project.org/
>>>> posting-guide.html
>>>> and provide commented, minimal, self-contained, reproducible code.
>>>>
>>>>
>>>          [[alternative HTML version deleted]]
>>>
>>> ______________________________________________
>>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>>> https://stat.ethz.ch/mailman/listinfo/r-help
>>> PLEASE do read the posting guide http://www.R-project.org/posti
>>> ng-guide.html
>>> and provide commented, minimal, self-contained, reproducible code.
>>>
>>>
>>
>> This email message may contain legally privileged and/or...{{dropped:2}}
>>
>>
>> ______________________________________________
>> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
>> https://stat.ethz.ch/mailman/listinfo/r-help
>> PLEASE do read the posting guide http://www.R-project.org/posti
>> ng-guide.html
>> and provide commented, minimal, self-contained, reproducible code.
>>
> 
> 	[[alternative HTML version deleted]]
> 
> ______________________________________________
> R-help at r-project.org mailing list -- To UNSUBSCRIBE and more, see
> https://stat.ethz.ch/mailman/listinfo/r-help
> PLEASE do read the posting guide http://www.R-project.org/posting-guide.html
> and provide commented, minimal, self-contained, reproducible code.
> 

-- 
Dr. Tóth Dénes ügyvezető
Kogentum Kft.
Tel.: 06-30-2583723
Web: www.kogentum.hu



More information about the R-help mailing list