[R-sig-hpc] Rmpi Segmentation Fault

Lyman, Mark Mark.Lyman at atk.com
Wed Apr 29 18:34:22 CEST 2009


I am running R 2.9.0 on a SLES 9 cluster with Rmpi 0.5-7. The cluster
has 115 nodes with 4 CPUs each. Lately, I have occasionally been getting
a segmentation fault. For example, I see the crash below when I run a
small test job with the simPI demo included in Rmpi.
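
The Rmpi_test.R driver is essentially the sequence of commands echoed in
the transcript, roughly as follows (Rmpi itself appears to be attached
elsewhere, e.g. from the site Rprofile, since only rlecuyer shows up in
the echo):

    # rlecuyer provides the RNG streams used by mpi.setup.rngstream()
    library(rlecuyer)

    # ask every slave to identify itself
    mpi.remote.exec(paste("I am", mpi.comm.rank(), "of", mpi.comm.size()))

    # run the pi-estimation demo shipped with Rmpi, then shut down
    demo("simPI")
    mpi.close.Rslaves()
    mpi.exit()

Running it under PBS, I get the following: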

Invoking R - mpirun -np 4 -machinefile /var/spool/PBS/aux/118794.head
/apps/R/R290/bin/R --save --restore --quiet --file=Rmpi_test.R
master (rank 0, comm 1) of size 4 is running on: n040
slave1 (rank 1, comm 1) of size 4 is running on: n040
slave2 (rank 2, comm 1) of size 4 is running on: n040
slave3 (rank 3, comm 1) of size 4 is running on: n040
> # Load the R MPI package if it is not already loaded.
> library(rlecuyer)

> # Tell all slaves to return a message identifying themselves
> mpi.remote.exec(paste("I am",mpi.comm.rank(),"of",mpi.comm.size()))
$slave1
[1] "I am 1 of 4"

$slave2
[1] "I am 2 of 4"

$slave3
[1] "I am 3 of 4"

>
> demo("simPI")


        demo(simPI)
        ---- ~~~~~

> #require rsprng package and use mpi.init.sprng to initialize SPRNG.
>  simslave <- function (){
+     request <-1
+     result <-2
+     chunksize <- 1000 #job size for slaves
+     anytag <- mpi.any.tag()
+     while (1) { #slaves do their jobs first and wait for instructions from master
+       x <- runif(chunksize)
+       y <- runif(chunksize)
+       incirvec <- (x-.5)^2 + (y-.5)^2 < .25
+       xy <- c(x[incirvec],y[incirvec])
+       #send the result to master
+       mpi.send(xy, 2, dest=0, tag=result,comm=.comm)
+       #receive instructions from master
+       mpi.recv(integer(1),type=1,source=0,tag=anytag,comm=.comm)
+       tag <- mpi.get.sourcetag()[2]
+       if (tag!=request)
+           break
+     }
+ }

> simPI <- function (n,epsilon=1e-4,comm=1)
+ {
+     tsize <- mpi.comm.size(comm)
+     if (tsize < 2)
+       stop("It seems no slaves running")
+
+     #send the function simslave to all slaves
+     mpi.bcast.Robj2slave(simslave, comm=comm)
+
+     #let slaves run the function simslave
+     mpi.bcast.cmd(simslave(), comm=comm)
+
+     chunksize <- 1000
+     request <-1
+     result <- 2
+     count <- 0
+     totalincir <- 0
+     anysrc <- mpi.any.source()
+
+    #prepare an empty plot
+     plot(c(0,0),xlim=c(0,1),ylim=c(0,1), ylab="",type="n")
+
+     while (1) {
+       #receive done job from slaves
+       xy<-mpi.recv(double(2*chunksize),type=2,source=anysrc,
+                       tag=result,comm=comm)
+       #receive buffer is bigger than the actual data, so get the real length
+       incir <- mpi.get.count(2)/2
+       src <- mpi.get.sourcetag()[1]
+       totalincir <- incir + totalincir
+       count <- count+chunksize
+       mypi <- 4*totalincir/count
+       x <- xy[1:incir]
+       y<- xy[(incir+1):(2*incir)]
+       #add incircle points to the plot
+       points(x,y,pch=".",cex=.2,col=src)
+       err <- abs(mypi-pi)
Loading required package: rlecuyer
Loading required package: rlecuyer
Loading required package: rlecuyer
+       if (err > epsilon && count < n)
+           mpi.send(integer(1),type=1,dest=src,tag=request,comm=comm)
+       else {
+           #tell slaves to stop with tag=0
+           mpi.send(integer(1),type=1,dest=src,tag=0,comm=comm)
+           break
+         }
+     }
+     #only one slave has been stopped, so the others have to be told to stop as well
+     if (tsize > 2){
+     for (i in 1:(tsize-2)){
+       #continue receiving other slaves jobs
+       xy<-mpi.recv(double(2*chunksize),type=2,source=anysrc,
+                       tag=result,comm=comm)
+       incir <-mpi.get.count(2)/2
+       src <- mpi.get.sourcetag()[1]
+       totalincir <- incir + totalincir
+       count <- count+chunksize
+       mypi <- 4*totalincir/count
+       x <- xy[1:incir]
+       y<- xy[(incir+1):(2*incir)]
+       points(x,y,pch=".",cex=.2,col=src)
+       #tell other slaves to stop
+       mpi.send(integer(1),type=1,dest=src,tag=0,comm=comm)
+     }
+     }
+     mypi
+ }
>
> mpi.setup.rngstream()
> simPI(1000000000)
[1] 3.141579
> mpi.close.Rslaves()
[1] 1
> mpi.exit()
[1] "Detaching Rmpi. Rmpi cannot be used unless relaunching R."
>
[n040:06512] *** Process received signal ***
[n040:06512] Signal: Segmentation fault (11)
[n040:06512] Signal code: Address not mapped (1)
[n040:06512] Failing at address: 0x90028
[n040:06512] [ 0] /lib64/tls/libc.so.6 [0x2a95c84500]
[n040:06512] [ 1] /lib64/ld-linux-x86-64.so.2 [0x2a9555d334]
[n040:06512] [ 2] /lib64/ld-linux-x86-64.so.2 [0x2a9555d724]
[n040:06512] [ 3] /lib64/ld-linux-x86-64.so.2 [0x2a9556119f]
[n040:06512] [ 4] /lib64/ld-linux-x86-64.so.2 [0x2a95560ef2]
[n040:06512] [ 5] /usr/lib64/libvapi.so(vipul_cleanup+0x50)
[0x2a996c34c0]
[n040:06512] *** End of error message ***
mpirun noticed that job rank 0 with PID 6512 on node n040c exited on
signal 11 (Segmentation fault).
Cleaning up after job

Has anyone seen this behavior before? Or have any idea what could cause
this? Everything seems to work alright, but this doesn't seem very
stable.
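
From the trace, the crash seems to happen during process teardown, after
mpi.exit() has already detached Rmpi: the failing frames are in the
dynamic loader and in libvapi's vipul_cleanup rather than in R or Rmpi
itself. One thing I may try is ending the script with mpi.quit() instead
of mpi.exit(); mpi.quit() finalizes MPI and quits R in one step, which
might sidestep the cleanup path that faults here (just a guess, not
verified):

    mpi.close.Rslaves()    # shut down the slaves as before
    mpi.quit(save = "no")  # finalize MPI and quit R, replacing mpi.exit()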

Mark Lyman


