[R] Kosaraju's SCC Algorithm Running

Megan megan.fight at gmail.com
Mon Nov 7 05:12:38 CET 2016


To whom it may concerns,

We encountered stack overflow issues when we implemented DFS(depth first search) algorithm on a directed graph having 800,000+ vertices and millions of edges.  The purpose of running DFS is to use Kosaraju Algorithm to calculate the size of SCC(strongly connected componment) in the graph. This is an assignment from Coursea.org <http://coursea.org/>. We know the maximum size of SCC in the graph is 434,821, which means the maximum recursion depth during code running is 434,821. We know it is a large calculation behind the scene, therefore, we’ve done the below pre-setup hopefully to solve the stack overflow issue. However, we have not got the luck yet. We’ve tried running on both R and RStudio.

We would really appreciate if someone in your team can help to investigate. We can’t wait to see if our code is working in R. 

System Information:
Model Name:	MacBook Pro
  Model Identifier:	MacBookPro11,1
  Processor Name:	Intel Core i5
  Processor Speed:	2.4 GHz
  Number of Processors:	1
  Total Number of Cores:	2
  L2 Cache (per Core):	256 KB
  L3 Cache:	3 MB
  Memory:	8 GB

System settings we have tried:
1. ulimit- s 65000 (to increase stack size before starting up R)
2. R --max-ppsize =500000 (start R with max ppsize)
3. options(expression=500000)


The data we used can be found on 
https://d18ky98rnyall9.cloudfront.net/_410e934e6553ac56409b2cb7096a44aa_SCC.txt?Expires=1478649600&Signature=YC0OjTn4hmzWpzAMw3WkQjXgxkAc0qhZrCd07JAPtud3dGQqywpQgkAASf-bWX6qgGafVvObciJ3ww-7wbNqY0YhWxwcg-IxmCnz1xJu1SORdDobiVKmhhfSaqgTyullX1hFmcxAA4y6Ud33hY1dhIP~UTAlW~IV8Y-zSliAts8_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A <https://d18ky98rnyall9.cloudfront.net/_410e934e6553ac56409b2cb7096a44aa_SCC.txt?Expires=1478649600&Signature=YC0OjTn4hmzWpzAMw3WkQjXgxkAc0qhZrCd07JAPtud3dGQqywpQgkAASf-bWX6qgGafVvObciJ3ww-7wbNqY0YhWxwcg-IxmCnz1xJu1SORdDobiVKmhhfSaqgTyullX1hFmcxAA4y6Ud33hY1dhIP~UTAlW~IV8Y-zSliAts8_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A>

Data discription provided as follow:
https://www.coursera.org/learn/algorithm-design-analysis/exam/rOtFq/programming-assignment-4 <https://www.coursera.org/learn/algorithm-design-analysis/exam/rOtFq/programming-assignment-4> 

Below is our code:

options(expressions=500000)
#Prepare input test file  
g=read.table("Downloads/scc.txt")
x<<-as.matrix(g)
remove(g)
vector.is.empty<<-function(x) {return(length(x)==0)}
'%!in%' <- function(x,y)!('%in%'(x,y))
#g<-c(2,1,3,1,3,4,4,2,5,4,5,6,6,7,7,8,8,9,2,3)
#x<<-matrix(g,nrow=10,ncol=2, byrow=TRUE)
#g<-c(1,4,2,8,3,6,4,7,5,2,6,9,7,1,8,5,8,6,9,3,9,7,10,2)
#x<<-matrix(g,nrow=12,ncol=2, byrow=TRUE)
#g<-c(1,2,2,3,2,4,3,4,3,5,4,1,4,13,5,6,6,7,7,8,8,9,9,6,10,9,10,11,11,8,11,12,12,13,12,14,13,10,14,15)
#x<<-matrix(g,20,2,byrow=TRUE)

u1<-unique(x[,1])
u2<-unique(x[,2])
u<-c(u1,u2)
n<<-length(unique(u))
remove(u1,u2,u)

G <<- vector("list", length=n)
G_REV <<- vector("list", length=n)
P = numeric(n)
FT = numeric(n)

for (i in 1:nrow(x)) {
  a = x[i,1]
  b = x[i,2]
  #for G
  if (is.null(G[[a]])) {
    G[[a]] = c(b)
  } else {
    G[[a]] = c(G[[a]], b)
  }
  if (is.null(G[[b]])) {
    G[[b]] = numeric()
  }
  #for G_VEV
  if (is.null(G_REV[[b]])) {
    G_REV[[b]] = c(a)
  } else {
    G_REV[[b]] = c(G_REV[[b]], a)
  }
  if (is.null(G_REV[[a]])) {
    G_REV[[a]] = numeric()
  }
}

G_TEMP <<- G_REV

#G_REV<<-x[,c(2,1)]

#P<<-matrix(c(1:n,rep(0,2*n)),nrow=n,ncol=3) #tracking whether vertex is explored or not

#colnames(P)<-c("node","f","parent_vertex")
explore<<-numeric()
assign("time",0,envir=globalenv())
#Algorithm -DFS

DFS_LOOP<<-function(G){
  counter = n 
  for(i in n : 1){
    if (i < counter) {
      counter = counter - 1000;
      print(i);
    }
    if (i %in% explore){next}
    else {
      DFS(i)
    }
  }
}

DFS<<-function(i){
  #if(time>=n){return(p)}
  #else{
  #print(c("i=",i))
  explore<<-c(explore,i)
  #print(c("explore",explore))
  
  #P[i,2] <<- 1 # gray
  v=G_TEMP[[i]]
  
  #print(c("v=",v))
  if (vector.is.empty(v) ==TRUE){
    len=1
    v=i
  }
  
  if(vector.is.empty(v)==FALSE){
    len=length(v)
  }
  
  for(j in 1: len){
    if(v[j] %!in% explore){
      DFS(v[j])
      #P[v[j],3] <<-i
      P[v[j]] <<- i
      # print(c("child",j,"parent",i))
    }
  }
  
  time<<-time + 1
  FT[i] <<- time
  #P[i,2] <<- time
  #P[i,2] <<- 2 #black
  # } <<-else
}
print('Starting DFS_loop on G_REV')
DFS_LOOP(G_REV)
###################################################
#temp0<-matrix(1:n,n,1)
# temp1<-P[,c(1,2)]
# colnames(temp1)<-c("before","after")
# temp1<-as.data.frame(temp1)
# colnames(x)<-c("vertex","edge")
# X<-as.data.frame(x)
# X_NEW<-merge(x=X,y=temp1,by.x="vertex",by.y="before")
# remove(X)
# names(X_NEW)[names(X_NEW)=="after"]<-"vertex_new"
# X_NEW2<-merge(x=X_NEW,y=temp1,by.x="edge",by.y="before")
# remove(X_NEW,temp1)
# names(X_NEW2)[names(X_NEW2)=="after"]<-"edge_new"
# G2<-as.matrix(X_NEW2)
# remove(X_NEW2)
# G2<-G2[,c(3,4)]
# u1<-unique(G2[,1])
# u2<-unique(G2[,2])
# u<-c(u1,u2)
# n<<-length(unique(u))
# remove(u1,u2,u)

FT_SORTED = numeric(n)
for (i in length(FT):1) {
  finish_time = FT[i]
  FT_SORTED[finish_time] = i
}

P = numeric(n)
FT = numeric(n)

#P<<-matrix(c(1:n,rep(0,2*n)),nrow=n,ncol=3) #tracking whether vertex is explored or not
#colnames(P)<-c("vertex","f","parent_vertex")
explore<<-numeric()
assign("time",0,envir=globalenv())

print('Starting DFS_loop on G')
#DFS_LOOP(G2)#2nd DFS
explore<<-numeric()

G_TEMP <<- G

for (i in length(FT_SORTED):1) { 
  k = FT_SORTED[i]
  if (k %!in% explore) {
    DFS(k)
  }
}

mscc_temp = which(P==0)
scc_temp=FT[mscc_temp]
#scc_temp<-P[P[,3]==0,2]
scc_temp=sort(scc_temp,decreasing=TRUE)
m=length(scc_temp)
scc=numeric()
for (i in 1:(m-1)){
  scc[i]=scc_temp[i]-scc_temp[i+1]
}
scc[m]<-scc_temp[m]
scc_top_5<-tail(sort(scc),5)


Thanks,
Jing
	[[alternative HTML version deleted]]



More information about the R-help mailing list