[R] text vector clustering
Stefan Th. Gries
stgries at gmail.com
Fri Jan 23 17:28:16 CET 2009
Hans-Joerg Bibiko's function levenshtein() would help; see below for an
example (the matrix is filled rather clumsily with two nested loops, but
you can replace those with the apply family of functions).
HTH,
STG
levenshtein <- function(string1, string2, case=TRUE, map=NULL) {
  ########
  # levenshtein algorithm in R
  #
  # Author : Hans-Joerg Bibiko
  # Date : 29/06/2006
  # Contact : bibiko at eva.mpg.de
  ########
  # string1, string2 := strings to compare (only the first element of each
  #                     argument is used)
  # case = TRUE := case sensitivity; case = FALSE := case insensitivity
  # map := character vector of
  #        c(regexp1, replacement1, regexp2, replacement2, ...)
  #        The rules are applied with gsub() one after the other, each one
  #        operating on the output of the previous rule.
  # example:
  #   map <- c("[aeiou]", "V", "[^aeiou]", "C")
  #   levenshtein("Bank", "Bond", map=map) => 0
  #   (the second rule also matches the "V" inserted by the first one, so
  #   with this map every character ends up as "C")
  #
  # returns := the edit distance (insertions, deletions and substitutions,
  #            each with cost 1) between the two (mapped) strings
  ########

  # Case folding is the identity when the comparison is case sensitive.
  fold <- if (case) identity else tolower

  s <- fold(c(as.character(string1)[1], as.character(string2)[1]))

  if (!is.null(map)) {
    m <- matrix(map, ncol = 2, byrow = TRUE)
    # seq_len() keeps the loop from running on an empty rule set.
    for (i in seq_len(nrow(m))) s <- gsub(m[i, 1], m[i, 2], s)
    # Replacement text may contain upper-case letters; fold again so that
    # a case-insensitive comparison stays case insensitive.
    s <- fold(s)
  }

  if (s[1] == s[2]) return(0)

  # A leading blank gives the dynamic-programming table its "empty prefix"
  # row and column, so that index k corresponds to the first k-1 characters.
  s1 <- strsplit(paste0(" ", s[1]), NULL)[[1]]
  s2 <- strsplit(paste0(" ", s[2]), NULL)[[1]]
  l1 <- length(s1)
  l2 <- length(s2)

  d <- matrix(0, nrow = l1, ncol = l2)
  d[, 1] <- seq_len(l1) - 1
  d[1, ] <- seq_len(l2) - 1

  # seq_len(l - 1) + 1 is 2..l, but empty when l == 1 (i.e. when one of the
  # strings is empty); 2:l would count downwards and index out of bounds.
  for (i in seq_len(l1 - 1) + 1) {
    for (j in seq_len(l2 - 1) + 1) {
      substitution <- if (s1[i] == s2[j]) 0 else 1
      d[i, j] <- min(d[i - 1, j] + 1,
                     d[i, j - 1] + 1,
                     d[i - 1, j - 1] + substitution)
    }
  }
  d[l1, l2]
} # end of function Hans-Joerg Bibiko's levenshtein
# generate names: n.names random lower-case strings of 4 to 10 letters each
set.seed(1)
n.names <- 10
all.names <- character(n.names)
for (i in seq_len(n.names)) {
  # draw the length first, then the letters (same RNG order as a nested call)
  name.length <- sample(4:10, 1)
  all.names[i] <- paste(sample(letters, name.length, replace = TRUE),
                        collapse = "")
}
all.names

# generate matrix: symmetric distance matrix with a zero diagonal
sims <- matrix(0, nrow = n.names, ncol = n.names)
dimnames(sims) <- list(all.names, all.names)

# fill matrix (clumsy): compute each unordered pair once and mirror it
for (j in seq_len(n.names - 1)) {
  for (k in (j + 1):n.names) {
    sims[j, k] <- sims[k, j] <- levenshtein(all.names[j], all.names[k])
  }
}

# hierarchical clustering of the names on their Levenshtein distances
plot(hclust(as.dist(sims)))
More information about the R-help
mailing list