[R] matching a sequence in a vector?

Thu Feb 16 10:51:16 CET 2012

On 16-02-2012, at 09:01, Petr Savicky wrote:

> 
> Hi.
> 
> There were several solutions in this thread. Their speed differs
> quite significantly. Here is a comparison.
> 
>  patrn <- 1:4
>  exmpl <- sample(1:4, 10000, replace=TRUE)
> 
>  occur1 <- function(patrn, exmpl)
>  {
>    m <- length(patrn)
>    n <- length(exmpl)
>    candidate <- seq.int(length=n-m+1)
>    for (i in seq.int(length=m)) {
>        candidate <- candidate[patrn[i] == exmpl[candidate + i - 1]]
>    }
>    candidate
>  }
> 
>  occur2 <- function(patrn, exmpl)
>  {
>    patrn.rev <- rev(patrn)
>    w <- embed(exmpl,length(patrn))
>    which(apply(w,1,function(r) all(r == patrn.rev)))
>  }
> 
>  occur3 <- function(patrn, exmpl)
>  {
>    patrn.rev <- rev(patrn)
>    w <- embed(exmpl,length(patrn))
>    which(rowSums(w == rep(rev(patrn), each=nrow(w))) == ncol(w))
>  }
> 
>  occur4 <- function(patrn, exmpl)
>  {
>    # requires patrn without duplicates
>    n = length(patrn)
>    r = rle(diff(match(exmpl, patrn)) == 1L)
>    cumsum(r$length)[r$values & r$length == (n - 1L)] - (n - 2L)
>  }
> 
>  occur5 <- function(patrn, exmpl)
>  {
>    which( sapply( 1:(length(exmpl)-length(patrn)+1), function(i) isTRUE( all.equal( patrn, exmpl[i + 0:(length(patrn)-1) ] ) ) ) )
>  }
> 
>  occur6 <- function(patrn, exmpl)
>  {
>    indx <- embed(rev(seq_along(exmpl)), length(patrn))
>    matches <- apply(indx, 1, function(.indx){
>        all(exmpl[.indx] == patrn)
>    })
>    rev(indx[matches, 1L])
>  }
> 
>  occur7 <- function(patrn, exmpl)
>  {
>    which(rollapply(exmpl, length(patrn), identical, patrn, fill = FALSE, align = "left"))
>  }
> 
>  library(zoo)
> 
>  t1 <- system.time( out1 <- occur1(patrn, exmpl) )
>  t2 <- system.time( out2 <- occur2(patrn, exmpl) )
>  t3 <- system.time( out3 <- occur3(patrn, exmpl) )
>  t4 <- system.time( out4 <- occur4(patrn, exmpl) )
>  t5 <- system.time( out5 <- occur5(patrn, exmpl) )
>  t6 <- system.time( out6 <- occur6(patrn, exmpl) )
>  t7 <- system.time( out7 <- occur7(patrn, exmpl) )
> 
>  print(identical(out1, out2))
>  print(identical(out1, out3))
>  print(identical(out1, out4))
>  print(identical(out1, out5))
>  print(identical(out1, out6))
>  print(identical(out1, out7))
>  print(rbind(t1, t2, t3, t4, t5, t6, t7))
> 
> The output was
> 
>  [1] TRUE
>  [1] TRUE
>  [1] TRUE
>  [1] TRUE
>  [1] TRUE
>  [1] TRUE
>     user.self sys.self elapsed user.child sys.child
>  t1     0.001        0   0.001          0         0
>  t2     0.062        0   0.061          0         0
>  t3     0.002        0   0.002          0         0
>  t4     0.001        0   0.001          0         0
>  t5     1.749        0   1.749          0         0
>  t6     0.068        0   0.068          0         0
>  t7     0.172        0   0.172          0         0

And by modifying occur5 to this

 occur5 <- function(patrn, exmpl)
 {
   which( sapply( 1:(length(exmpl)-length(patrn)+1), 
          function(i) identical( patrn, exmpl[i + 0:(length(patrn)-1) ] ) ) )
 }

occur5 can be made a lot faster.
On my computer instead of

   user.self sys.self elapsed user.child sys.child
t1     0.001    0.000   0.001          0         0
t2     0.061    0.007   0.068          0         0
t3     0.002    0.001   0.002          0         0
t4     0.001    0.000   0.002          0         0
t5     1.640    0.037   1.677          0         0
t6     0.079    0.004   0.084          0         0
t7     0.256    0.004   0.260          0         0

I got

   user.self sys.self elapsed user.child sys.child
t1     0.000    0.000   0.001          0         0
t2     0.060    0.004   0.065          0         0
t3     0.002    0.001   0.003          0         0
t4     0.001    0.000   0.002          0         0
t5     0.070    0.002   0.071          0         0
t6     0.076    0.000   0.077          0         0
t7     0.246    0.006   0.252          0         0

Berend