[R-es] Encontrar la primera columna no NA

Jue Oct 27 19:16:39 CEST 2016

Tengo que comprobar si todos hacen exactamente lo mismo, pero los resultados
no dejan de sorprenderme. Carlos, ya nadie programa en R base y empiezo a
sospechar que igual nos equivocamos.

===================================================================================================
# Unit: seconds
#   expr        min         lq          mean        median      uq          max         neval  Lift
#   JVG         0,6716004   0,7210757   1,0513104   0,9597415   1,1624642   2,0997470   10     3,17
#   Olivier     3,0642166   3,424266    3,7383201   3,745616    3,9909474   4,4795947   10     12,3
#   Olivier2    1,2263557   1,340338    1,5031451   1,5140908   1,6264349   1,7548450   10     5,00
#   Adolfo      0,3401764   0,3425798   0,446328    0,3992639   0,5313764   0,7357900   10     1,32
#   Olivier3    0,3684704   0,3875006   0,5157852   0,4959741   0,6414696   0,6954977   10     1,64
#   GilBellosta 0,2089104   0,265419    0,3599796   0,3023052   0,4038109   0,7859248   10     1
===================================================================================================

# ===================================================================================================
# Código
# ===================================================================================================
library(data.table)
library(dplyr)
library(microbenchmark)
library(tidyr)
# Number of repetitions microbenchmark runs for each candidate.
N <- 1e1

# `numero` (rows per simulated table) is read by every benchmark below but is
# never assigned in this script; fall back to a default only when the calling
# session has not defined it already.
# NOTE(review): 1e5 is an assumed size -- confirm against the original run.
if (!exists("numero")) {
  numero <- 1e5
}

# Column names of the simulated table, each paired with the divisor that
# fixes how many NAs are mixed into the column (numero / divisor of them).
divisores_na <- c(Uno = 2e0, dos = 1e1, tres = 2e1,
                  cuatro = 1e2, cinco = 2e2, seis = 1e3)

# Simulate the six month columns used by every benchmark.
#
# Each column is `numero` values sampled without replacement from `numero`
# uniform draws pooled with `numero / divisor` NAs. Columns are generated in
# the order of `divisores_na`, i.e. Uno, dos, tres, cuatro, cinco, seis.
#
# Returns a named list of numeric vectors, ready for as.data.table() or
# as.data.frame().
simular_columnas <- function(numero) {
  lapply(divisores_na, function(divisor) {
    sample(c(runif(numero), rep(NA, numero / divisor)), size = numero)
  })
}

# Every candidate simulates its own table inside the timed expression, so the
# reported times include the cost of building the data.
tabla <-
  microbenchmark(
    # Disabled dplyr-pipe variant of JVG, kept for reference.
    # JVG_dplyr ={
    #   dat %>%
    #     apply( MARGIN = 1, FUN =
    #              function(x){
    #                which( !is.na(x)  ) %>%  min( na.rm = TRUE ) %>% return()
    #                }
    #     )
    #   dat[ , First_month := First_month]
    #   N_for <- length( unique(First_month ))
    #   for( j in 1:N_for){
    #     x <- dat[  First_month == j,  j,  with = FALSE]
    #     dat[ First_month == j , Value_First_month := x ]
    #   }
    # },
    JVG = {
      dat <- as.data.table(simular_columnas(numero))

      # Position of the first non-NA column in each row (NA when the whole
      # row is NA). The result has to be kept: the steps below depend on it.
      primer_mes <- apply(X = dat, MARGIN = 1, FUN = function(x) {
        which(!is.na(x))[1L]
      })
      dat[, First_month := primer_mes]

      # Visit the column positions that actually occur (sort() drops NA)
      # instead of 1..number-of-distinct-values, which skips positions when
      # the observed values are not contiguous.
      for (mes in sort(unique(primer_mes))) {
        filas <- which(primer_mes == mes)
        valores <- dat[[mes]][filas]
        dat[filas, Value_First_month := valores]
      }
    },
    Olivier = {
      dat <- as.data.table(simular_columnas(numero))

      # Remember the month columns before any result column is added.
      meses <- names(dat)
      dat[, First_month := apply(X = .SD, MARGIN = 1, FUN = function(x) {
        colnames(.SD)[which(!is.na(x))[1L]]
      }), .SDcols = meses]
      # Restrict .SD to the month columns: otherwise the character column
      # First_month makes apply() coerce every value to character.
      dat[, Value_First_month := apply(X = .SD, MARGIN = 1, FUN = function(x) {
        x[which(!is.na(x))[1L]]
      }), .SDcols = meses]
    },
    Olivier2 = {
      dat <- as.data.table(simular_columnas(numero))

      # Long format: one row per (player, month), then locate the first
      # non-NA month within each player.
      dat[, jugador := seq_len(.N)]
      dat2 <- melt(dat, id.vars = "jugador")
      setkey(dat2, jugador)
      dat2[, index := which(!is.na(value))[1L], by = jugador]
      dat3 <- dat2[, list(First_month_Olivier = variable[index[1]],
                          Value_First_month_Olivier = value[index[1]]),
                   by = jugador]
      setkey(x = dat, jugador)
      dat0 <- merge(x = dat, y = dat3, all.x = TRUE, all.y = FALSE)
    },

    Adolfo = {
      dat <- as.data.table(simular_columnas(numero))

      # 1) Add a column identifying the player; there is one player per row,
      #    so the row number is used.
      step1 <- dat %>%
        mutate(player = seq_len(nrow(dat)))

      # 2) Turn the month columns (Uno, dos, tres, ...) into two columns,
      #    month and number of games. This assumes the columns are ordered as
      #    in the example (Uno, dos, tres and not tres, Uno, dos).
      step2 <- gather(step1, month, games, -player)

      # 3) Drop the NA months and keep the first remaining row per player.
      #    The table is no longer printed here: console output inside the
      #    timed expression distorts the measurement.
      step3 <- step2 %>%
        filter(!is.na(games)) %>%
        group_by(player) %>%
        slice(1)
    },

    Olivier3 = {
      dat <- as.data.table(simular_columnas(numero))

      M <- as.matrix(dat)
      # Zero-based linear positions of the non-NA cells; integer division and
      # remainder by nrow(M) recover the column and the row of each cell.
      index <- which(!is.na(M)) - 1
      meses <- colnames(M)
      M2 <- data.table(columna = index %/% nrow(M) + 1L,
                       jugador = index %% nrow(M) + 1L,
                       valor = M[index + 1L])
      # Sorting by (jugador, columna) puts each player's first month first.
      setkey(M2, jugador, columna)
      M2[, .(First_month = meses[columna[1]], Value_First_month = valor[1]),
         by = jugador]
    },
    GilBellosta = {
      dat <- as.data.frame(simular_columnas(numero))

      tmp <- as.matrix(dat)
      # Matrix of column numbers; NA cells are pushed to Inf so the row-wise
      # minimum is the first non-NA column.
      # NOTE(review): a row with no non-NA value yields Inf here; the matrix
      # indexing below is not guarded against that case.
      cols <- col(tmp)
      cols[is.na(tmp)] <- Inf
      my.cols <- apply(cols, 1, min)
      my.values <- tmp[cbind(seq_len(nrow(tmp)), my.cols)]
      # The leftover difftime(Sys.time(), t) call was dropped: `t` is never
      # assigned in this script, so it resolved to base::t and failed.
    },

    times = N, unit = "s")

print(tabla)
beepr::beep(3)

# Unit: seconds
#        expr       min        lq      mean    median        uq       max neval
# JVG         0.6716004 0.7210757 1.0513104 0.9597415 1.1624642 2.0997470    10
# Olivier     3.0642166 3.4242660 3.7383201 3.7456160 3.9909474 4.4795947    10
# Olivier2    1.2263557 1.3403380 1.5031451 1.5140908 1.6264349 1.7548450    10
# Adolfo      0.3401764 0.3425798 0.4463280 0.3992639 0.5313764 0.7357900    10
# Olivier3    0.3684704 0.3875006 0.5157852 0.4959741 0.6414696 0.6954977    10
# GilBellosta 0.2089104 0.2654190 0.3599796 0.3023052 0.4038109 0.7859248    10

	[[alternative HTML version deleted]]