[BioC] segfault when using data.table package in conjunction with foreach
Matthew Keller
mckellercran at gmail.com
Thu Feb 23 18:04:59 CET 2012
Hi all,
I'm trying to use the package data.table within a foreach loop. I'm
grabbing 500M rows of data at a time from two different files and then
doing an aggregate/tapply-like function in data.table after that. I
had planned on doing a foreach loop 39 times at once for the 39 files
I have, but obviously that won't work until I figure out why the
segfault is occurring. The sessionInfo, code, and error are pasted
below. If you have any ideas, would love to hear them. (I have no
control over the version of R - 2.13.0 - being used). Best
Matt
SESSION INFO:
> sessionInfo()
R version 2.13.0 (2011-04-13)
Platform: x86_64-unknown-linux-gnu (64-bit)
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=C
[6] LC_MESSAGES=en_US.UTF-8 LC_PAPER=en_US.UTF-8 LC_NAME=C
LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] data.table_1.7.10 doMC_1.2.2 multicore_0.1-5
foreach_1.3.2 codetools_0.2-8 iterators_1.0.3
MY CODE:
# Sum the "sharing" column for every (ID1, ID2) pair found in a file, reading
# the file in chunks through an open connection.
#
# Args:
#   filename:      path of the input file. Four columns are read per row:
#                  ID1 (integer), ID2 (integer), a skipped column, and
#                  sharing (numeric).
#   nbindiv:       dimension of the square result matrix. ID1/ID2 are used
#                  directly as row/column indices, so they are assumed to lie
#                  in 1..nbindiv -- TODO confirm against the data.
#   nrows.to.read: maximum number of rows pulled from the connection per chunk.
#
# Returns:
#   An nbindiv x nbindiv numeric matrix whose [i, j] cell accumulates the
#   summed sharing of all rows with ID1 == i and ID2 == j.
computeAllPairSums <- function(filename, nbindiv, nrows.to.read)
{
  con <- file(filename, open = "r")
  on.exit(close(con))
  ans <- matrix(numeric(nbindiv * nbindiv), nrow = nbindiv)
  chunk <- 0L
  while (TRUE) {
    # read.table faster than scan
    df0 <- read.table(con,
                      col.names = c("ID1", "ID2", "ignored", "sharing"),
                      colClasses = c("integer", "integer", "NULL", "numeric"),
                      nrows = nrows.to.read, comment.char = "")
    # A zero-row read is the loop's end-of-input signal. Test it before any
    # data.table work so an empty chunk is never keyed or grouped.
    if (nrow(df0) == 0L)
      break
    chunk <- chunk + 1L
    cat("Processing chunk", chunk, "... ")
    DT <- data.table(df0)
    # df0 is not used again in this iteration; drop it so the chunk is not
    # held in memory twice while the grouping runs.
    rm(df0)
    setkey(DT, ID1, ID2)
    ss <- DT[, sum(sharing), by = "ID1,ID2"]
    # Columns are taken by position (ID1, ID2, sum), as the original did with
    # subset(select = ...), so the code does not depend on the name given to
    # the unnamed sum column.
    idd <- cbind(ss[[1L]], ss[[2L]])
    ans[idd] <- ans[idd] + ss[[3L]]
    cat("OK\n")
  }
  ans
}
# Driver: time the plain-text reader and the gzip reader in parallel, one
# foreach iteration per core.
# GERMLINE (used as a path prefix) and computeAllPairSums.gz() are not defined
# in this snippet and must already exist in the session.
library(foreach)
library(doMC)
registerDoMC(cores = 2)
num <- 8891
nr <- 500000000L  # 500 million rows at a time
# Each iteration's value is its system.time() result, so MMM ends up holding
# both timings. Assigning them to locals inside the loop body would leave the
# first iteration evaluating to NULL.
# paste(..., sep = "") is kept rather than paste0() because the target R
# (2.13.0) predates paste0().
MMM <- foreach(IT = 1:2) %dopar% {
  library(data.table)
  if (IT == 1) {
    # Run it on regular file (PID 6489, 24 gb)
    system.time({
      computeAllPairSums(
        paste(GERMLINE, "bc.chr22.q.20.file", sep = ""), num, nr)
    })
  } else {
    # Run it on gz file (PID 6490, 24 gb)
    system.time({
      computeAllPairSums.gz(
        paste(GERMLINE, "bc.chr22.q.20.gz", sep = ""), num, nr)
    })
  }
}
MY R OUTPUT/ERROR:
MMM <- foreach(IT = 1:2) %dopar% {
+ require(data.table)
+ if (IT==1){ x <- system.time({computeAllPairSums(
paste(GERMLINE,"bc.chr22.q.20.file",sep=''),num,nr)}) } #Run it on
regular file PID 6053, 5.9 gb
+ if (IT==2){ z <- system.time({computeAllPairSums.gz(
paste(GERMLINE,"bc.chr22.q.20.gz",sep=''),num,nr)}) } #Run it on gz
file PID 6054, 4 gb
+ }
Loading required package: data.table
Loading required package: data.table
data.table 1.7.10 For help type: help("data.table")
data.table 1.7.10 For help type: help("data.table")
*** caught segfault ***
address 0x2ae93df90000, cause 'memory not mapped'
Traceback:
1: .Call("dogroups", x, xcols, o__, f__, len__, jsub, SDenv, testj,
byretn, byval, i, as.integer(icols), i[1, ivars, with = FALSE],
if (length(ivars)) paste("i.", ivars, sep = ""), is.na(nomatch),
verbose, PACKAGE = "data.table")
2: `[.data.table`(DT, , sum(sharing), by = "ID1,ID2")
3: DT[, sum(sharing), by = "ID1,ID2"]
4: computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file", sep =
""), num, nr)
5: system.time({ computeAllPairSums(paste(GERMLINE,
"bc.chr22.q.20.file", sep = ""), num, nr)})
6: eval(expr, envir, enclos)
7: eval(c.expr, envir = args, enclos = envir)
8: doTryCatch(return(expr), name, parentenv, handler)
9: tryCatchOne(expr, names, parentenv, handlers[[1L]])
10: tryCatchList(expr, classes, parentenv, handlers)
11: tryCatch(eval(c.expr, envir = args, enclos = envir), error = function(e) e)
12: FUN(X[[1L]], ...)
13: lapply(S, FUN, ...)
14: doTryCatch(return(expr), name, parentenv, handler)
15: tryCatchOne(expr, names, parentenv, handlers[[1L]])
16: tryCatchList(expr, classes, parentenv, handlers)
17: tryCatch(expr, error = function(e) { call <- conditionCall(e)
if (!is.null(call)) { if (identical(call[[1L]],
quote(doTryCatch))) call <- sys.call(-4L) dcall <-
deparse(call)[1L] prefix <- paste("Error in", dcall, ": ")
LONG <- 75L msg <- conditionMessage(e) sm <-
strsplit(msg, "\n")[[1L]] w <- 14L + nchar(dcall, type = "w") +
nchar(sm[1L], type = "w") if (is.na(w)) w <- 14L +
nchar(dcall, type = "b") + nchar(sm[1L], type = "b")
if (w > LONG) prefix <- paste(prefix, "\n ", sep =
"") } else prefix <- "Error : " msg <- paste(prefix,
conditionMessage(e), "\n", sep = "")
.Internal(seterrmessage(msg[1L])) if (!silent &&
identical(getOption("show.error.messages"), TRUE)) {
cat(msg, file = stderr()) .Internal(printDeferredWarnings())
} invisible(structure(msg, class = "try-error"))})
18: try(lapply(S, FUN, ...), silent = TRUE)
19: sendMaster(try(lapply(S, FUN, ...), silent = TRUE))
20: FUN(1:2[[1L]], ...)
21: lapply(1:cores, inner.do)
22: mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed
= set.seed, mc.silent = silent, mc.cores = cores)
23: e$fun(obj, substitute(ex), parent.frame(), e$data)
24: foreach(IT = 1:2) %dopar% { require(data.table) if (IT == 1)
{ x <- system.time({
computeAllPairSums(paste(GERMLINE, "bc.chr22.q.20.file",
sep = ""), num, nr) }) } if (IT == 2) { z <-
system.time({ computeAllPairSums.gz(paste(GERMLINE,
"bc.chr22.q.20.gz", sep = ""), num, nr) })
}}
Possible actions:
1: abort (with core dump, if enabled)
2: normal R exit
3: exit R without saving workspace
4: exit R saving workspace
--
Matthew C Keller
Asst. Professor of Psychology
University of Colorado at Boulder
www.matthewckeller.com
More information about the Bioconductor
mailing list