[R] Filtering using multiple rows in dplyr
Jeff Newmiller
jdnewm|| @end|ng |rom dcn@d@v|@@c@@u@
Thu May 31 19:55:02 CEST 2018
Yes this can be done using dplyr. One of the main advantages of doing so
is that it is possible to develop and modify code to handle fairly
complicated requirements easily, but it may not always be best from a
performance or memory usage perspective. The example below walks you
through some instructive examples using both base R and dplyr and shows
that base R seems faster for this particular example.
For reference: your failure to send your request in plain text lead to
some minor corruption when passed through the mailing list. (See the
included text below.) I have seen cases where what was received was
essentially unreadable, so please figure out your email program's plain
text setting for next time. The mailing list posting guide warns you about
this and other pitfalls, though due to the wide variety of operating
systems and email programs the details of how to circumvent these pitfalls
are mostly left to the user to track down.
#####################################################
dta0 <- read.table( text=
" subject ageGrp ear hearingGrp sex freq L2 Ldp Phidp NF SNR
1 HALAF032 A L A F 2 0 -23.54459 55.56005 -43.08282 19.538232
2 HALAF032 A L A F 2 2 -32.64881 86.22040 -23.31558 -9.333224
3 HALAF032 A L A F 2 4 -18.91058 42.12168 -35.60250 16.691919
4 HALAF032 A L A F 2 6 -23.85937 297.94499 -20.70452 -3.154846
5 HALAF032 A L A F 2 8 -14.45381 181.75329 -24.17094 9.717128
6 HALAF032 A L A F 2 10 -20.42384 67.12998 -35.77357 15.349728
", header=TRUE )
set.seed( 21 )
dta <- expand.grid( subject = sprintf( "HALAF%s", 30:35 )
, freq = 2:4
, L2 = seq( 0, 10, by=2 )
)
dta$Ldp <- rnorm( nrow( dta ), -20, 5 )
dta$SNR <- rnorm( nrow( dta ), 5, 10 )
dta1 <- dta[ order( dta$subject, dta$freq, dta$L2 ), ]
# verify whether subset of data meets criteria
# assumes DF has L2 in sorted order (and only one subject and freq)
testSNR <- function( SNR, SNRlimit, limitCount ) {
qual <- SNR > SNRlimit
consec <- cumsum( qual )
any( limitCount < consec )
}
dta1[ 1:6, "SNR" ]
#> [1] 15.720845 -3.846874 -1.650308 3.353974 5.533430 8.709555
testSNR( dta1[ 1:6, "SNR" ], SNRlimit = 3, limitCount = 3 )
#> [1] TRUE
# one row output per group
aggregate( list( isclean = dta1$SNR )
, dta1[ , c( "subject", "freq" ) ]
, FUN=testSNR
, SNRlimit=3
, limitCount=3
)
#> subject freq isclean
#> 1 HALAF30 2 TRUE
#> 2 HALAF31 2 FALSE
#> 3 HALAF32 2 FALSE
#> 4 HALAF33 2 FALSE
#> 5 HALAF34 2 FALSE
#> 6 HALAF35 2 TRUE
#> 7 HALAF30 3 FALSE
#> 8 HALAF31 3 TRUE
#> 9 HALAF32 3 FALSE
#> 10 HALAF33 3 TRUE
#> 11 HALAF34 3 FALSE
#> 12 HALAF35 3 TRUE
#> 13 HALAF30 4 FALSE
#> 14 HALAF31 4 FALSE
#> 15 HALAF32 4 TRUE
#> 16 HALAF33 4 FALSE
#> 17 HALAF34 4 TRUE
#> 18 HALAF35 4 TRUE
# results are repeated to the length of their group
# result of ave function is always numeric
ave( dta1$SNR
, dta1$subject
, dta1$freq
, FUN=function(x) { testSNR( x, 3, 3 ) }
)
#> [1] 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0
#> [36] 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0
#> [71] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
#> [106] 1 1 1
# this is the calculation you asked for
dta1$clean <- ifelse( ave( dta1$SNR
, dta1$subject
, dta1$freq
, FUN=function(x) { testSNR( x, 3, 3 ) }
)
, "Y"
, "N"
)
dta2 <- dta1[ "Y" == dta1$clean, ]
# but yes, all this can also be accomplished using dplyr operations
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# create sample data set using dplyr
set.seed( 21 )
dtaT <- ( expand.grid( subject = sprintf( "HALAF%s", 30:35 )
, freq = 2:4
, L2 = seq( 0, 10, by=2 )
)
%>% mutate( Ldp = rnorm( n(), -20, 5 )
, SNR = rnorm( n(), 5, 10 )
)
)
# sort sample dataset using dplyr
dta1T <- ( dtaT
%>% arrange( subject, freq, L2 )
%>% group_by( subject, freq )
%>% mutate( clean = ifelse( testSNR( SNR
, SNRlimit = 3
, limitCount = 3
)
, "Y"
, "N"
)
)
%>% ungroup
)
# per your request
dta2T <- ( dta1T
%>% filter( "Y" == clean )
)
# or avoid unnecessary comparisons
dta1T2 <- ( dtaT
%>% arrange( subject, freq, L2 )
%>% group_by( subject, freq )
%>% mutate( isclean = testSNR( SNR, SNRlimit = 3, limitCount = 3 ) )
%>% ungroup
)
# per your request
dta2T2 <- ( dta1T2
%>% filter( isclean )
)
# confirm that results are equivalent
all.equal( dta2$Ldp, dta2T$Ldp )
#> [1] TRUE
all.equal( dta2$Ldp, dta2T2$Ldp )
#> [1] TRUE
library(microbenchmark)
fn_base <- function( dta1 ) {
dta1$clean <- ifelse( ave( dta1$SNR
, dta1$subject
, dta1$freq
, FUN=function(x) { testSNR( x, 3, 3 ) }
)
, "Y"
, "N"
)
dta1[ "Y" == dta1$clean, ]
}
fn_dplyr1 <- function( dta1 ) {
( dta1
%>% group_by( subject, freq )
%>% mutate( clean = ifelse( testSNR( SNR, SNRlimit = 3, limitCount = 3 )
, "Y"
, "N"
)
)
%>% ungroup
%>% filter( "Y" == clean )
)
}
fn_dplyr2 <- function( dta1 ) {
( dta1
%>% group_by( subject, freq )
%>% mutate( isclean = testSNR( SNR, SNRlimit = 3, limitCount = 3 ) )
%>% ungroup
%>% filter( isclean )
)
}
microbenchmark( fn_base( dta1 ), fn_dplyr1( dta1 ), fn_dplyr2( dta1 ) )
#> Unit: microseconds
#> expr min lq mean median uq max
#> fn_base(dta1) 756.604 797.154 932.1924 943.983 960.054 3928.349
#> fn_dplyr1(dta1) 4106.170 4180.185 4386.9290 4212.907 4253.640 7855.908
#> fn_dplyr2(dta1) 4148.434 4221.252 4421.7373 4249.389 4288.841 7519.803
#> neval
#> 100
#> 100
#> 100
#' <details><summary>Session info</summary>
devtools::session_info()
#> Session info
-------------------------------------------------------------
#> setting value
#> version R version 3.4.4 (2018-03-15)
#> system x86_64, linux-gnu
#> ui X11
#> language en_US:en
#> collate en_US.UTF-8
#> tz America/Los_Angeles
#> date 2018-05-31
#> Packages
-----------------------------------------------------------------
#> package * version date source
#> assertthat 0.2.0 2017-04-11 CRAN (R 3.4.0)
#> backports 1.1.2 2017-12-13 CRAN (R 3.4.4)
#> base * 3.4.4 2018-03-16 local
#> bindr 0.1.1 2018-03-13 CRAN (R 3.4.4)
#> bindrcpp * 0.2.2 2018-03-29 CRAN (R 3.4.4)
#> compiler 3.4.4 2018-03-16 local
#> datasets * 3.4.4 2018-03-16 local
#> devtools 1.13.5 2018-02-18 CRAN (R 3.4.4)
#> digest 0.6.15 2018-01-28 CRAN (R 3.4.4)
#> dplyr * 0.7.5 2018-05-19 CRAN (R 3.4.4)
#> evaluate 0.10.1 2017-06-24 CRAN (R 3.4.1)
#> glue 1.2.0 2017-10-29 CRAN (R 3.4.4)
#> graphics * 3.4.4 2018-03-16 local
#> grDevices * 3.4.4 2018-03-16 local
#> htmltools 0.3.6 2017-04-28 CRAN (R 3.4.1)
#> knitr 1.20 2018-02-20 CRAN (R 3.4.4)
#> magrittr 1.5 2014-11-22 CRAN (R 3.4.0)
#> memoise 1.1.0 2017-04-21 CRAN (R 3.4.1)
#> methods * 3.4.4 2018-03-16 local
#> microbenchmark * 1.4-4 2018-01-24 CRAN (R 3.4.4)
#> pillar 1.2.3 2018-05-25 CRAN (R 3.4.4)
#> pkgconfig 2.0.1 2017-03-21 CRAN (R 3.4.1)
#> purrr 0.2.5 2018-05-29 CRAN (R 3.4.4)
#> R6 2.2.2 2017-06-17 CRAN (R 3.4.1)
#> Rcpp 0.12.17 2018-05-18 CRAN (R 3.4.4)
#> rlang 0.2.1 2018-05-30 CRAN (R 3.4.4)
#> rmarkdown 1.9 2018-03-01 CRAN (R 3.4.4)
#> rprojroot 1.3-2 2018-01-03 CRAN (R 3.4.4)
#> stats * 3.4.4 2018-03-16 local
#> stringi 1.2.2 2018-05-02 CRAN (R 3.4.4)
#> stringr 1.3.1 2018-05-10 CRAN (R 3.4.4)
#> tibble 1.4.2 2018-01-22 CRAN (R 3.4.4)
#> tidyselect 0.2.4 2018-02-26 CRAN (R 3.4.4)
#> tools 3.4.4 2018-03-16 local
#> utils * 3.4.4 2018-03-16 local
#> withr 2.1.2 2018-03-15 CRAN (R 3.4.4)
#> yaml 2.1.19 2018-05-01 CRAN (R 3.4.4)
#' </details>
#####################################################
On Wed, 30 May 2018, Sumitrajit Dhar wrote:
> Hi Folks,
>
> I have just started using dplyr and could use some help getting unstuck.
> It could well be that dplyr is not the package to be using, but let me
> just pose the question and seek your advice.
>
> Here is my basic data frame.
>
> head(h)
> subject ageGrp ear hearingGrp sex freq L2 Ldp Phidp NF SNR
> 1 HALAF032 A L A F 2 0 -23.54459 55.56005 -43.08282 19.538232
> 2 HALAF032 A L A F 2 2 -32.64881 86.22040 -23.31558 -9.333224
> 3 HALAF032 A L A F 2 4 -18.91058 42.12168 -35.60250 16.691919
> 4 HALAF032 A L A F 2 6 -23.85937 297.94499 -20.70452 -3.154846
> 5 HALAF032 A L A F 2 8 -14.45381 181.75329 -24.17094 9.717128
> 6 HALAF032 A L A F 2 10 -20.42384 67.12998 -35.77357 15.349728
>
> ?subject? and ?freq? together make a set of data and I am interested in
> how the last four columns vary as a function of L2. So I grouped by
> ?subject? and ?freq? and can look at basic summaries.
>
>
> h_byFunc <- h %>% group_by(subject, freq)
>
>> h_byFunc %>% summarize(l = mean(Ldp), s = sd(Ldp) )
>
> # A tibble: 1,175 x 4
> # Groups: subject [?]
> subject freq l s
> <fct> <int> <dbl> <dbl>
> 1 HALAF032 2 -13.8 8.39
> 2 HALAF032 4 -15.8 11.0
> 3 HALAF032 8 -23.4 6.51
> 4 HALAF033 2 -14.2 9.64
> 5 HALAF033 4 -12.3 8.92
> 6 HALAF033 8 -6.55 12.3
> 7 HALAF036 2 -14.9 12.6
> 8 HALAF036 4 -16.7 11.2
> 9 HALAF036 8 -21.7 6.56
> 10 HALAF039 2 0.242 12.4
> # ... with 1,165 more rows
>
> What I would like to do is filter some groups out based on various
> criteria. For example, if SNR > 3 in three consecutive L2 within a
> group, that group qualifies and I would add a column, say ?clean? and
> assign it a value ?Y.? Is there a way to do this in dplyr or should I be
> looking at a different way.
>
> Thanks in advance for your help.
>
> Regards,
> Sumit
>
>
>
>
---------------------------------------------------------------------------
Jeff Newmiller The ..... ..... Go Live...
DCN:<jdnewmil using dcn.davis.ca.us> Basics: ##.#. ##.#. Live Go...
Live: OO#.. Dead: OO#.. Playing
Research Engineer (Solar/Batteries O.O#. #.O#. with
/Software/Embedded Controllers) .OO#. .OO#. rocks...1k
More information about the R-help
mailing list