# [R] Removing duplicated rows within a matrix, with missing data as wildcards

Dimitris Rizopoulos dimitris.rizopoulos at med.kuleuven.be
Fri Mar 9 16:14:29 CET 2007

```you could also try something like the following:

# Remove rows of a matrix that duplicate another row, where an NA acts as a
# wildcard that may stand for any value in `wildcardVals`.
#
# Arguments:
#   x             a matrix; NA entries are treated as wildcards.
#   wildcardVals  vector of values an NA is allowed to stand for.
#
# Value:
#   A matrix with the unique complete rows of `x` (in their original order)
#   followed by the incomplete rows that were kept. An incomplete row is
#   dropped when one of its possible completions equals a complete row or a
#   completion of an incomplete row that was kept earlier, so of two mutually
#   compatible incomplete rows only the first survives.
removeWildcardDupRows <- function(x, wildcardVals) {
  stopifnot(is.matrix(x))
  if (nrow(x) <= 1L) {
    return(x)
  }
  wildcardVals <- unique(wildcardVals[!is.na(wildcardVals)])

  ind <- complete.cases(x)
  # drop = FALSE keeps both pieces as matrices even when a single row (or no
  # row at all) is selected.
  complete <- x[ind, , drop = FALSE]
  partial <- x[!ind, , drop = FALSE]
  if (nrow(partial) == 0L || length(wildcardVals) == 0L) {
    # Nothing can be matched through a wildcard; only exact duplicates go.
    return(unique(rbind(complete, partial)))
  }

  # One string key per row, so rows can be looked up in a hashed environment.
  rowKeys <- function(m) {
    vapply(seq_len(nrow(m)),
           function(i) paste(m[i, ], collapse = "\r"),
           character(1))
  }
  seen <- new.env(hash = TRUE, parent = emptyenv())
  remember <- function(keys) {
    for (k in keys) {
      assign(k, TRUE, envir = seen)
    }
  }
  remember(rowKeys(complete))

  keepPartial <- logical(nrow(partial))
  for (i in seq_len(nrow(partial))) {
    holes <- which(is.na(partial[i, ]))
    # Every combination of wildcard values over the NA positions, so rows
    # with several NAs are compared against all of their completions.
    fills <- as.matrix(expand.grid(rep(list(wildcardVals), length(holes)),
                                   KEEP.OUT.ATTRS = FALSE,
                                   stringsAsFactors = FALSE))
    cand <- partial[rep(i, nrow(fills)), , drop = FALSE]
    cand[, holes] <- fills
    keys <- rowKeys(cand)
    matched <- any(vapply(keys, exists, logical(1),
                          envir = seen, inherits = FALSE))
    if (!matched) {
      # Only completions of rows that are actually kept can knock out later
      # rows; a dropped row must not remove anything itself.
      keepPartial[i] <- TRUE
      remember(keys)
    }
  }
  unique(rbind(complete, partial[keepPartial, , drop = FALSE]))
}

x <- matrix(c(1, NA, 3, NA, 2, 3, 1, 3, 2, 2, 1, 3, 1, NA, 2, 2, 1, 3),
            ncol = 3, byrow = TRUE)

wildcardVals <- 1:3 # possible wildcard values
removeWildcardDupRows(x, wildcardVals)

I hope it works ok.

> > x <- matrix(c(1, NA, 3, NA, 2, 3), ncol=3, byrow=TRUE)
> > x
>      [,1] [,2] [,3]
> [1,]    1   NA    3
> [2,]   NA    2    3
>
> I would want to delete either x[1,] or x[2,] but not both.
>
> removeLooseDupRows <- function(x)
> {
>   if (nrow(x) <= 1)
>       return(x)
>   ii <- do.call("order",
>                 args=lapply(seq_len(ncol(x)),
>                             function(col) x[ , col]))
>   dup_index <- logical(nrow(x))
>   i0 <- -1
>   for (k in 1:length(ii)) {
>       i <- ii[k]
>       if (any(is.na(x[i, ]))) {
>           if (i0 == -1)
>               next
>           if (any(x[i, ] != x[i0, ], na.rm=TRUE))
>               next
>           dup_index[i] <- TRUE
>       } else {
>           i0 <- i
>       }
>   }
>   x[!dup_index, ]
> }
>
> should leave no such ambiguous cases for my data, as the nrow(x) are
> very high with few NA in each x.  For example, a row of (1, 2, 3) is
> very likely to exist in my data.
>
> However, to find the row numbers of any remaining ambiguous matches,
> should they exist, using example:
>
>> x <- matrix(c(1, NA, 3, NA, 2, 3, 1, 3, 2, 2, 1, 3, 1, NA, 2, 2, 1,
>> 3), ncol=3, byrow=TRUE)
>> x
>     [,1] [,2] [,3]
> [1,]    1   NA    3
> [2,]   NA    2    3
> [3,]    1    3    2
> [4,]    2    1    3
> [5,]    1   NA    2
> [6,]    2    1    3
>
>
>> removeLooseDupRows(x)
>     [,1] [,2] [,3]
> [1,]    1   NA    3
> [2,]   NA    2    3
> [3,]    1    3    2
> [4,]    2    1    3
> [5,]    2    1    3
>
>> q <- removeLooseDupRows(unique(x))
>> q
>     [,1] [,2] [,3]
> [1,]    1   NA    3
> [2,]   NA    2    3
> [3,]    1    3    2
> [4,]    2    1    3
>
> I could
>
>> # ambiguous matches in matrix form
>> apply(q, 1, function(row1) apply(q, 1, function(row2)
>> all(is.na(row1) | is.na(row2) | row1==row2)))
>
>      [,1]  [,2]  [,3]  [,4]
> [1,]  TRUE  TRUE FALSE FALSE
> [2,]  TRUE  TRUE FALSE FALSE
> [3,] FALSE FALSE  TRUE FALSE
> [4,] FALSE FALSE FALSE  TRUE
>
>> # indices of ambiguous matches
>> m <- which(apply(q, 1, function(row1) apply(q, 1, function(row2)
>> all(is.na(row1) | is.na(row2) | row1==row2))), arr=T)
>> m
>     row col
> [1,]   1   1
> [2,]   2   1
> [3,]   1   2
> [4,]   2   2
> [5,]   3   3
> [6,]   4   4
>
>> #put in order and omit duplicates
>> m2 <- unique(t(apply(m, 1, sort)))
>> m2
>     [,1] [,2]
> [1,]    1    1
> [2,]    1    2
> [3,]    2    2
> [4,]    3    3
> [5,]    4    4
>
>> # show the ambiguous matches
>> m2[m2[,1]!=m2[,2], drop=F]
> [1] 1 2
>
> ...and proceed from there.
>
> This solution came from another helpful "R-help" respondent to my
> poorly-defined problem.
>
> Appreciative thanks to everyone for your instructive help.
>
> Cheers,
> stacey
>
