[R] segfault when using data.table package in conjunction with foreach

Uwe Ligges ligges at statistik.tu-dortmund.de
Fri Feb 24 16:29:47 CET 2012


0. Read the posting guide! It tells you to

1. Do not cross post!

2. Do try a recent version of R and all packages you have in use.

3. If it still fails, send a reproducible example.


Uwe Ligges





On 23.02.2012 18:04, Matthew Keller wrote:
> Hi all,
>
> I'm trying to use the package data.table within a foreach loop. I'm
> grabbing 500M rows of data at a time from two different files and then
> doing an aggregate/tapply-like operation in data.table after that. I
> had planned on doing a foreach loop 39 times at once for the 39 files
> I have, but obviously that won't work until I figure out why the
> segfault is occurring. The sessionInfo, code, and error are pasted
> below. If you have any ideas, I would love to hear them. (I have no
> control over the version of R - 2.13.0 - being used). Best
>
> Matt
>
>
> SESSION INFO:
>
>> sessionInfo()
> R version 2.13.0 (2011-04-13)
> Platform: x86_64-unknown-linux-gnu (64-bit)
>
> locale:
>   [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C
> LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=C
>   [6] LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C
>                 LC_ADDRESS=C               LC_TELEPHONE=C
> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
>
> attached base packages:
> [1] stats     graphics  grDevices utils     datasets  methods   base
>
> other attached packages:
> [1] data.table_1.7.10 doMC_1.2.2        multicore_0.1-5
> foreach_1.3.2     codetools_0.2-8   iterators_1.0.3
>
>
>
> MY CODE:
>
> computeAllPairSums<- function(filename, nbindiv,nrows.to.read)
> {
>     con<- file(filename, open="r")
>     on.exit(close(con))
>     ans<- matrix(numeric(nbindiv * nbindiv), nrow=nbindiv)
>     chunk<- 0L
>     while (TRUE) {
>         #read.table faster than scan
>         df0<- read.table(con,col.names=c("ID1", "ID2", "ignored", "sharing"),
>                  colClasses=c("integer", "integer", "NULL",
> "numeric"),nrows=nrows.to.read,comment.char="")
>
>         DT<- data.table(df0)
>         setkey(DT,ID1,ID2)
>         ss<- DT[,sum(sharing),by="ID1,ID2"]
>
>         if (nrow(df0) == 0L)
>             break
>
>         chunk<- chunk + 1L
>         cat("Processing chunk", chunk, "... ")
>
>        idd<- as.matrix(subset(ss,select=1:2))
>        newvec<- as.vector(as.matrix(subset(ss,select=3)))
>        ans[idd]<- ans[idd] + newvec
>
>           cat("OK\n")
>       }
>     ans
>   }
>
>
>
> require(foreach)
> require(doMC)
> registerDoMC(cores=2)
>
>
> num<- 8891
> nr<-  500000000L   #500 million rows at a time
>
>
> MMM<-  foreach(IT = 1:2) %dopar% {
>    require(data.table)
>    if (IT==1){ x<- system.time({computeAllPairSums(
> paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on
> regular file PID 6489, 24 gb
>    if (IT==2){ z<- system.time({computeAllPairSums.gz(
> paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz
> file PID 6490, 24 gb
> }
>
>
> MY R OUTPUT/ERROR:
>
> MMM<-  foreach(IT = 1:2) %dopar% {
> +   require(data.table)
> +   if (IT==1){ x<- system.time({computeAllPairSums(
> paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on
> regular file PID 6053, 5.9 gb
> +   if (IT==2){ z<- system.time({computeAllPairSums.gz(
> paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz
> file PID 6054, 4 gb
> + }
>
> Loading required package: data.table
> Loading required package: data.table
> data.table 1.7.10  For help type: help("data.table")
> data.table 1.7.10  For help type: help("data.table")
>
>   *** caught segfault ***
> address 0x2ae93df90000, cause 'memory not mapped'
>
> Traceback:
>   1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj,
>    byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE],
> if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch),
> verbose, PACKAGE = "data.table")
>   2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2")
>   3: DT[, sum(sharing), by = "ID1,ID2"]
>   4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep =
> ""),     num, nr)
>   5: system.time({    computeAllPairSums(paste(GERMLINE,
> "bc.chr22.q.20.file",         sep = ""), num, nr)})
>   6: eval(expr, envir, enclos)
>   7: eval(c.expr, envir = args, enclos = envir)
>   8: doTryCatch(return(expr), name, parentenv, handler)
>   9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
> 10: tryCatchList(expr, classes, parentenv, handlers)
> 11: tryCatch(eval(c.expr, envir = args, enclos = envir), error = function(e) e)
> 12: FUN(X[[1L]], ...)
> 13: lapply(S, FUN, ...)
> 14: doTryCatch(return(expr), name, parentenv, handler)
> 15: tryCatchOne(expr, names, parentenv, handlers[[1L]])
> 16: tryCatchList(expr, classes, parentenv, handlers)
> 17: tryCatch(expr, error = function(e) {    call<- conditionCall(e)
>   if (!is.null(call)) {        if (identical(call[[1L]],
> quote(doTryCatch)))             call<- sys.call(-4L)        dcall<-
> deparse(call)[1L]        prefix<- paste("Error in", dcall, ": ")
>    LONG<- 75L        msg<- conditionMessage(e)        sm<-
> strsplit(msg, "\n")[[1L]]        w<- 14L + nchar(dcall, type = "w") +
> nchar(sm[1L], type = "w")        if (is.na(w))             w<- 14L +
> nchar(dcall, type = "b") + nchar(sm[1L],                 type = "b")
>       if (w>  LONG)             prefix<- paste(prefix, "\n  ", sep =
> "")    }    else prefix<- "Error : "    msg<- paste(prefix,
> conditionMessage(e), "\n", sep = "")
> .Internal(seterrmessage(msg[1L]))    if (!silent&&
> identical(getOption("show.error.messages"),         TRUE)) {
> cat(msg, file = stderr())        .Internal(printDeferredWarnings())
> }    invisible(structure(msg, class = "try-error"))})
> 18: try(lapply(S, FUN, ...), silent = TRUE)
> 19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE))
> 20: FUN(1:2[[1L]], ...)
> 21: lapply(1:cores, inner.do)
> 22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed
> = set.seed,     mc.silent = silent, mc.cores = cores)
> 23: e$fun(obj, substitute(ex), parent.frame(), e$data)
> 24: foreach(IT = 1:2) %dopar% {    require(data.table)    if (IT == 1)
> {        x<- system.time({
> computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file",
>    sep = ""), num, nr)        })    }    if (IT == 2) {        z<-
> system.time({            computeAllPairSums.gz(paste(GERMLINE,
> "bc.chr22.q.20.gz",                 sep = ""), num, nr)        })
> }}
>
> Possible actions:
> 1: abort (with core dump, if enabled)
> 2: normal R exit
> 3: exit R without saving workspace
> 4: exit R saving workspace
>
>
>



More information about the R-help mailing list