[R] Cluster Analysis: Density-Based Method

Thu Oct 21 15:51:02 CEST 2004

Dear Fernando,

below you find a DBSCAN function I wrote for my own purposes.
It comes with no warranty and without proper documentation, but I followed
the notation of the original KDD-96 DBSCAN paper.
For large data sets, it may be slow.

Best,
Christian

On Thu, 21 Oct 2004, Fernando Prass wrote:

> No, kmeans is a partition method. I need a model-based method, like DBSCAN or
> DENCLUE algorithm...
> 
> Fernando Prass

distvector <- function(x,data){
  ddata <- t(data)-x
  dv <- apply(ddata^2,2,sum)
}

# data may be nxp or distance matrix
# eps is the dbscan distance cutoff parameter
# MinPts is the minimum size of a cluster
# scale: Should the data be scaled?
# distances: has to be TRUE if data is a distance matrix
# showplot: Should the computation process be visualized? 
# countmode: dbscan gives messages when processing point no. (countmode)
dbscan <- function(data,eps,MinPts=5, scale=FALSE, distances=FALSE,
                   showplot=FALSE,
                   countmode=c(1,2,3,5,10,100,1000,5000,10000,50000)){
  data <- as.matrix(data)
  n <- nrow(data)
  if (scale) data <- scale(data)
  unregpoints <- rep(0,n)
  e2 <- eps^2
  cv <- rep(0,n)
  cn <- 0
  i <- 1
  for (i in 1:n){
    if (i %in% countmode) cat("Processing point ", i," of ",n, ".\n")
    unclass <- cv<1
    if (cv[i]==0){
      if (distances) seeds <- data[i,]<=eps
      else{
        seeds <- rep(FALSE,n)
        seeds[unclass] <- distvector(data[i,],data[unclass,])<=e2
      }
      if (sum(seeds)+unregpoints[i]<MinPts) cv[i] <- (-1)
      else{
        cn <- cn+1
        cv[i] <- cn
        seeds[i] <- unclass[i] <- FALSE
        unregpoints[seeds] <- unregpoints[seeds]+1
        while (sum(seeds)>0){
          if (showplot) plot(data,col=1+cv)
          unclass[seeds] <- FALSE
          cv[seeds] <- cn
          ap <- (1:n)[seeds]
#          print(ap)
          seeds <- rep(FALSE,n)          
          for (j in ap){
#            if (showplot) plot(data,col=1+cv)
            jseeds <- rep(FALSE,n)          
            if (distances) jseeds[unclass] <- data[j,unclass]<=eps
            else{
              jseeds[unclass] <- distvector(data[j,],data[unclass,])<=e2
            }
            unregpoints[jseeds] <- unregpoints[jseeds]+1
#            if (cn==1)
#              cat(j," sum seeds=",sum(seeds)," unreg=",unregpoints[j],
#                  " newseeds=",sum(cv[jseeds]==0),"\n")
            if (sum(jseeds)+unregpoints[j]>=MinPts){              
              seeds[jseeds] <- cv[jseeds]==0
              cv[jseeds & cv<0] <- cn
            }
          } # for j
        } # while sum seeds>0
      } # else (sum seeds + ... >= MinPts)
    } # if cv==0
  } # for i
  if (sum(cv==(-1))>0){
    noisenumber <- cn+1
    cv[cv==(-1)] <- noisenumber
  }
  else
    noisenumber <- FALSE
  out <- list(classification=cv, noisenumber=noisenumber,
              eps=eps, MinPts=MinPts, unregpoints=unregpoints)
  out
} # dbscan
# classification: classification vector
# noisenumber: number in the classification vector indicating noise points
# unregpoints: ignore...

***********************************************************************
Christian Hennig
Fachbereich Mathematik-SPST/ZMS, Universitaet Hamburg
hennig at math.uni-hamburg.de, http://www.math.uni-hamburg.de/home/hennig/
#######################################################################
ich empfehle www.boag-online.de