[Rd] indexing and regression testing

Thu Aug 23 12:30:30 CEST 2007

Dear all,

It was a pleasure to meet you at Iowa State University. Two days ago I submitted two experimental packages to CRAN (I hope they will be there soon):

rindex: quick indexing of large objects (currently only character, see ?index)
regtest: some first support for automated regression testing (heavily used in \dontshow{} section of ?index)

With rindex you can for example

# build the index once from the row names
i <- index(rownames(someDataFrame))
# then this
someDataFrame[match(fewRowNames, i),]
# is much faster than that
someDataFrame[match(fewRowNames, rownames(someDataFrame)),]
# which still is much faster than
someDataFrame[fewRowNames, ]
# the latter is surprisingly slow, given that it can be worked around with match() (even without indexing)

I'd appreciate any comments, especially on the "Open Questions" section in ?index, in order to
1) extend to all atomic types and be UTF8-safe
2) extend to packages R.huge and/or ff
3) optimally integrate with sort (OK), order (no generic yet) and match (no generic, and dispatch would rather be needed on the second argument)

Both packages were developed under 2.5.1 and I plan to publish them under a GPL2-QA, a GPL2 clone that obliges users to participate in some kind of regression testing (to be defined).

Best regards

Jens Oehlschlägel

P.S. 

> # some timings using timefactor(regtest)
> library(rindex)
> 
> Vec1 <- c(NA, paste('a', 1:1000000, sep=""))
> Vec2 <- c(paste('a', 1:1000000, sep=""), NA)
> iVec1 <- index(Vec1, verbose=TRUE)
     user.self sys.self elapsed
sort      7.85     0.01    7.86
tree      0.00     0.00    0.00
> iVec2 <- index(Vec2, verbose=TRUE)
finalized tree
     user.self sys.self elapsed
sort      7.78        0    7.79
tree      0.00        0    0.00
> 
> timefactor(any(Vec1=="a999"), any(iVec1=="a999"), 10, 100)
finalized tree
            nom  denom   factor
user.self 0.041 0.0036 11.38889
sys.self  0.000 0.0000      NaN
elapsed   0.041 0.0036 11.38889
> timefactor(any(Vec2=="a999"), any(iVec2=="a999"), 10, 100)
            nom  denom   factor
user.self 0.040 0.0036 11.11111
sys.self  0.000 0.0000      NaN
elapsed   0.041 0.0036 11.38889
> 
> timefactor(length(match("a999", Vec1)), length(match("a999", iVec1)), 10, 1000)
            nom   denom   factor
user.self 0.125 0.00017 735.2941
sys.self  0.000 0.00000      NaN
elapsed   0.125 0.00018 694.4444
> timefactor(length(match("a999", Vec2)), length(match("a999", iVec2)), 10, 1000)
            nom   denom   factor
user.self 0.123 0.00015 820.0000
sys.self  0.003 0.00000      Inf
elapsed   0.127 0.00015 846.6667
> 
> timefactor(length(match(c("a9","a99","a999","a9999"), Vec1)), length(match(c("a9","a99","a999","a9999"), iVec1)), 10, 1000)
            nom   denom   factor
user.self 0.126 0.00022 572.7273
sys.self  0.000 0.00000      NaN
elapsed   0.127 0.00022 577.2727
> timefactor(length(match(c("a9","a99","a999","a9999"), Vec2)), length(match(c("a9","a99","a999","a9999"), iVec2)), 10, 1000)
            nom   denom   factor
user.self 0.125 0.00022 568.1818
sys.self  0.002 0.00000      Inf
elapsed   0.126 0.00022 572.7273
> 
> # and not too bad with respect to the recent hasNA() / anyNA() thread (once the index has been built)
> timefactor(any(is.na(Vec1)), any(is.na(iVec1)), 50, 50)
             nom denom   factor
user.self 0.0040 0.003 1.333333
sys.self  0.0004 0.000      Inf
elapsed   0.0044 0.003 1.466667
> timefactor(any(is.na(Vec2)), any(is.na(iVec2)), 50, 50)
             nom  denom   factor
user.self 0.0040 0.0036 1.111111
sys.self  0.0008 0.0000      Inf
elapsed   0.0046 0.0034 1.352941
> 
> # especially if using a better function for this
> timefactor(any(is.na(Vec1)), length(match(NA, iVec1)), 50, 1000)
             nom   denom   factor
user.self 0.0044 0.00013 33.84615
sys.self  0.0002 0.00000      Inf
elapsed   0.0046 0.00013 35.38462
> timefactor(any(is.na(Vec2)), length(match(NA, iVec2)), 50, 1000)
             nom   denom   factor
user.self 0.0044 0.00014 31.42857
sys.self  0.0000 0.00000      NaN
elapsed   0.0042 0.00014 30.00000
> 
> # or ask directly :-)
> timefactor(any(is.na(Vec1)), iVec1$nNA, 50, 10000)
             nom denom   factor
user.self 0.0044 9e-06 488.8889
sys.self  0.0000 0e+00      NaN
elapsed   0.0044 9e-06 488.8889
> timefactor(any(is.na(Vec2)), iVec2$nNA, 50, 10000)
             nom denom   factor
user.self 0.0046 1e-05 460.0000
sys.self  0.0000 0e+00      NaN
elapsed   0.0046 9e-06 511.1111
> 
> version
               _                           
platform       i386-pc-mingw32             
arch           i386                        
os             mingw32                     
system         i386, mingw32               
status                                     
major          2                           
minor          5.1                         
year           2007                        
month          06                          
day            27                          
svn rev        42083                       
language       R                           
version.string R version 2.5.1 (2007-06-27)

# some timings using timefactor(regtest)
# NOTE(review): only rindex is attached here; timefactor() is attributed to
# regtest above, so attaching rindex presumably makes it available -- confirm.
library(rindex)

# Two character vectors of 1,000,001 elements each: "a1" .. "a1000000" plus a
# single NA, placed first in Vec1 and last in Vec2.
Vec1 <- c(NA, paste('a', 1:1000000, sep=""))
Vec2 <- c(paste('a', 1:1000000, sep=""), NA)
# Build one index per vector. In the recorded session above, verbose=TRUE
# printed a small timing table with "sort" and "tree" rows.
iVec1 <- index(Vec1, verbose=TRUE)
iVec2 <- index(Vec2, verbose=TRUE)

# Each timefactor() call below compares an expression on the plain vector
# (first argument) with the same query on the index (second argument). The
# recorded output labels the columns nom / denom / factor, where factor is
# nom divided by denom.
# NOTE(review): the two trailing numbers are presumably repetition counts for
# the first and second expression -- confirm in ?timefactor.

# Equality test against a single value.
timefactor(any(Vec1=="a999"), any(iVec1=="a999"), 10, 100)
timefactor(any(Vec2=="a999"), any(iVec2=="a999"), 10, 100)

# match() with a single lookup value.
timefactor(length(match("a999", Vec1)), length(match("a999", iVec1)), 10, 1000)
timefactor(length(match("a999", Vec2)), length(match("a999", iVec2)), 10, 1000)

# match() with four lookup values.
timefactor(length(match(c("a9","a99","a999","a9999"), Vec1)), length(match(c("a9","a99","a999","a9999"), iVec1)), 10, 1000)
timefactor(length(match(c("a9","a99","a999","a9999"), Vec2)), length(match(c("a9","a99","a999","a9999"), iVec2)), 10, 1000)

# and not too bad with respect to the recent hasNA() / anyNA() thread (once the index has been built)
timefactor(any(is.na(Vec1)), any(is.na(iVec1)), 50, 50)
timefactor(any(is.na(Vec2)), any(is.na(iVec2)), 50, 50)

# especially if using a better function for this
# (here: looking up NA in the index via match() instead of any(is.na(...)))
timefactor(any(is.na(Vec1)), length(match(NA, iVec1)), 50, 1000)
timefactor(any(is.na(Vec2)), length(match(NA, iVec2)), 50, 1000)

# or ask directly :-)
# (reads the nNA element of the index object; presumably the stored NA count -- confirm in ?index)
timefactor(any(is.na(Vec1)), iVec1$nNA, 50, 10000)
timefactor(any(is.na(Vec2)), iVec2$nNA, 50, 10000)

# Print the R version / platform details the timings were taken on.
version

--