[R] how to ignore NA with "NA" or "NULL"

jeff6868 geoffrey_klein at etu.u-bourgogne.fr
Wed Jun 6 15:41:06 CEST 2012


Ok Jeff, but then it'll be a big one. I'm working on a list of files and my
problem depends on different functions used previously. So it's very hard
for me to summarize to reproduct my error. But here is the reproductible
example with the error at the last line of the code (just copy and paste
it).
You'll notice that the data.frame with only NAs is set to NULL in "refill",
and I just want to have it unchanged in output (so the same as input).
The aim of the function is to fill the NAs of my data.frames. It'll not work
in this example because there're only big NA gaps which are my problem for
the moment. But maybe now you can have an idea where the problem is (change
NULL for "only NA DF" in output to the same DF as in input).
For the example, we are just testing for "x1".
Hope you have understood my problem now :)
Thanks Jeff, Rui or everyone else!

# my data for example
DF1 <- data.frame(x1=rnorm(1:20),x2=c(31:50))
write.table(DF1,"ST001_2008.csv",sep=";")
DF2 <-
data.frame(x1=c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,rnorm(1:10)),x2=c(1:20))
write.table(DF2,"ST002_2008.csv",sep=";")
DF3 <- data.frame(x1=rnorm(81:100),x2=NA)
write.table(DF3,"ST003_2008.csv",sep=";")
DF4 <- data.frame(x1=c(21:40),x2=rnorm(1:20))
write.table(DF4,"ST004_2008.csv",sep=";")

    #list my data
    filenames <- list.files(pattern="\\_2008.csv$")

    Sensors <- paste("x", 1:2,sep="")

    Stations <-substr(filenames,1,5)

    nsensors <- length(Sensors)
    nstations <- length(Stations)

    nobs <- nrow(read.table(filenames[1], header=TRUE))

    yr2008 <- array(NA, dim=c(nobs, nsensors, nstations))

    for(i in seq_len(nstations)){
    tmp <- read.table(filenames[i], header=TRUE, sep=";")
    yr2008[ , , i] <- as.matrix(tmp[, Sensors])
    }

    dimnames(yr2008) <- list(seq.int(nobs), Sensors, Stations)

    yr2008capt1hiver<-yr2008[1:10,1,]
    yr2008capt1hiver <- as.data.frame(yr2008capt1hiver)

    #correlation between my data for x1 (for the example)
    corhiver2008capt1 <- cor(yr2008capt1hiver,use="pairwise.complete.obs")

    capt1hiver <- c(1:length(yr2008capt1hiver))

    for(i in 1:length(capt1hiver))
    {
   
if(sum(!is.na(yr2008capt1hiver[,capt1hiver[i]]))<(length(yr2008capt1hiver[[capt1hiver[i]]])/2))
    {
         corhiver2008capt1[i,]=NA
         corhiver2008capt1[,i]=NA
      }
    }


    lst <- lapply(list.files(pattern="\\_2008.csv$"), read.table,sep=";",
header=TRUE, stringsAsFactors=FALSE)
    names(lst) <- Stations

    # searching the highest correlation for each data.Frame
    get.max.cor <- function(station, mat){
     mat[row(mat) == col(mat)] <- -Inf
     m <- max(mat[station, ],na.rm=TRUE)
     if (is.finite(m)) {return(which( mat[station, ] == m ))}
     else {return(NA)}
    }

    # fill the data.frame with the data.frame which has the highest
correlation coefficient
    na.fill <- function(x, y){
     if(all(!is.finite(y[1:10,1])))  return(y)
     i <- is.na(x[1:10,1])
     xx <- y[1:10,1]
     new <- data.frame(xx=xx)
     x[1:10,1][i] <- predict(lm(x[1:10,1]~xx, na.action=na.exclude),new)[i]
     x
    }

    process.all <- function(df.list, mat){

        f <- function(station)
             na.fill(df.list[[ station ]], df.list[[ max.cor[station] ]])

        g <- function(station){
        x <- df.list[[station]]
        if(any(!is.finite(x[1:10,1]))){
            mat[row(mat) == col(mat)] <- -Inf
            nas <- which(is.na(x[1:10,1]))
            ord <- order(mat[station, ], decreasing = TRUE)[-c(1,
ncol(mat))]
            for(y in ord){
                if(all(!is.na(df.list[[y]][1:10,1][nas]))){
                    xx <- df.list[[y]][1:10,1]
                    new <- data.frame(xx=xx)
                    x[1:10,1][nas] <- predict(lm(x[1:10,1]~xx,
na.action=na.exclude), new)[nas]
                    break
                }
            }
        }
        x
    }

        n <- length(df.list)
        nms <- names(df.list)
        max.cor <- sapply(seq.int(n), get.max.cor, corhiver2008capt1)
        df.list <- lapply(seq.int(n), f)
        df.list <- lapply(seq.int(n), g)
        names(df.list) <- nms
        df.list
    }

    refill <- process.all(lst, corhiver2008capt1)
    refill <- as.data.frame(refill)                                              
########## HERE IS THE PROBLEM ######
    head(refill)

--
View this message in context: http://r.789695.n4.nabble.com/how-to-ignore-NA-with-NA-or-NULL-tp4632287p4632527.html
Sent from the R help mailing list archive at Nabble.com.



More information about the R-help mailing list