[Rd] invert argument in grep

Fri Nov 10 11:04:04 CET 2006

On 11/9/2006 5:14 AM, Romain Francois wrote:
> Hello,
> 
> What about an `invert` argument in grep, to return elements that are 
> *not* matching a regular expression :
> 
> R> grep("pink", colors(), invert = TRUE, value = TRUE)
> 
> would essentially return the same as :
> 
> R> colors() [ - grep("pink", colors()) ]
> 
> 
> I'm attaching the files that I modified (against today's tarball) for 
> that purpose.

I think a more generally useful change would be to be able to return a 
logical vector with TRUE for a match and FALSE for a non-match, so a 
simple !grep(...) does the inversion.  (This is motivated by the recent 
R-help discussion of the fact that x[-selection] doesn't always invert 
the selection when it's a vector of indices.)

A way to do that without expanding the argument list would be to allow

value="logical"

as well as value=TRUE and value=FALSE.

This would make boolean operations easy, e.g.

colors()[grep("dark", colors(), value="logical")
       & !grep("blue", colors(), value="logical")]

to select the colors that contain "dark" but not "blue". (In this case 
the RE to select that subset is rather simple because "dark" always 
precedes "blue", but if that wasn't true, it would be a lot messier.)

Duncan Murdoch
> 
> Cheers,
> 
> Romain
> 
> 
> 
> ------------------------------------------------------------------------
> 
> grep <-
> function(pattern, x, ignore.case = FALSE, extended = TRUE, perl = FALSE,
>          value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE)
> {
>     pattern <- as.character(pattern)
>     ## when value = TRUE we return names
>     if(!is.character(x)) x <- structure(as.character(x), names=names(x))
>     ## behaves like == for NA pattern
>     if (is.na(pattern)) {
>         if(value)
>             return(structure(rep.int(as.character(NA), length(x)),
>                              names = names(x)))
>         else
>             return(rep.int(NA, length(x)))
>     }
> 
>     if(perl)
>         .Internal(grep.perl(pattern, x, ignore.case, value, useBytes, invert))
>     else
>         .Internal(grep(pattern, x, ignore.case, extended, value, fixed,
>                        useBytes, invert))
> }
> 
> sub <-
> function(pattern, replacement, x, ignore.case = FALSE, extended = TRUE,
>          perl = FALSE, fixed = FALSE, useBytes = FALSE)
> {
>     pattern <- as.character(pattern)
>     replacement <- as.character(replacement)
>     if(!is.character(x)) x <- as.character(x)
>     if (is.na(pattern))
>         return(rep.int(as.character(NA), length(x)))
> 
>     if(perl)
>         .Internal(sub.perl(pattern, replacement, x, ignore.case, useBytes))
>     else
>         .Internal(sub(pattern, replacement, x, ignore.case,
>                       extended, fixed, useBytes))
> }
> 
> gsub <-
> function(pattern, replacement, x, ignore.case = FALSE, extended = TRUE,
>          perl = FALSE, fixed = FALSE, useBytes = FALSE)
> {
>     pattern <- as.character(pattern)
>     replacement <- as.character(replacement)
>     if(!is.character(x)) x <- as.character(x)
>     if (is.na(pattern))
>         return(rep.int(as.character(NA), length(x)))
> 
>     if(perl)
>         .Internal(gsub.perl(pattern, replacement, x, ignore.case, useBytes))
>     else
>         .Internal(gsub(pattern, replacement, x, ignore.case,
>                        extended, fixed, useBytes))
> }
> 
> regexpr <-
> function(pattern, text, extended = TRUE, perl = FALSE,
>          fixed = FALSE, useBytes = FALSE)
> {
>     pattern <- as.character(pattern)
>     text <- as.character(text)
>     if(perl)
>         .Internal(regexpr.perl(pattern, text, useBytes))
>     else
>         .Internal(regexpr(pattern, text, extended, fixed, useBytes))
> }
> 
> gregexpr <-
> function(pattern, text, extended = TRUE, perl = FALSE,
>          fixed = FALSE, useBytes = FALSE)
> {
>     pattern <- as.character(pattern)
>     text <- as.character(text)
>     if(perl)
>       .Internal(gregexpr.perl(pattern, text, useBytes))
>     else
>       .Internal(gregexpr(pattern, text, extended, fixed, useBytes))
> }
> 
> agrep <-
> function(pattern, x, ignore.case = FALSE, value = FALSE,
>          max.distance = 0.1)
> {
>     pattern <- as.character(pattern)
>     if(!is.character(x)) x <- as.character(x)
>     ## behaves like == for NA pattern
>     if (is.na(pattern)){
>         if (value)
>             return(structure(rep.int(as.character(NA), length(x)),
>                              names = names(x)))
>         else
>             return(rep.int(NA, length(x)))
>     }
> 
>     if(!is.character(pattern)
>        || (length(pattern) < 1)
>        || ((n <- nchar(pattern)) == 0))
>         stop("'pattern' must be a non-empty character string")
> 
>     if(!is.list(max.distance)) {
>         if(!is.numeric(max.distance) || (max.distance < 0))
>             stop("'max.distance' must be non-negative")
>         if(max.distance < 1)            # transform percentages
>             max.distance <- ceiling(n * max.distance)
>         max.insertions <- max.deletions <- max.substitutions <-
>             max.distance
>     } else {
>         ## partial matching
>         table <- c("all", "deletions", "insertions", "substitutions")
>         ind <- pmatch(names(max.distance), table)
>         if(any(is.na(ind)))
>             warning("unknown match distance components ignored")
>         max.distance <- max.distance[!is.na(ind)]
>         names(max.distance) <- table[ind]
>         ## sanity checks
>         comps <- unlist(max.distance)
>         if(!all(is.numeric(comps)) || any(comps < 0))
>             stop("'max.distance' components must be non-negative")
>         ## extract restrictions
>         if(is.null(max.distance$all))
>             max.distance$all <- 0.1
>         max.insertions <- max.deletions <- max.substitutions <-
>             max.distance$all
>         if(!is.null(max.distance$deletions))
>             max.deletions <- max.distance$deletions
>         if(!is.null(max.distance$insertions))
>             max.insertions <- max.distance$insertions
>         if(!is.null(max.distance$substitutions))
>             max.substitutions <- max.distance$substitutions
>         max.distance <- max.distance$all
>         ## transform percentages
>         if(max.distance < 1)
>             max.distance <- ceiling(n * max.distance)
>         if(max.deletions < 1)
>             max.deletions <- ceiling(n * max.deletions)
>         if(max.insertions < 1)
>             max.insertions <- ceiling(n * max.insertions)
>         if(max.substitutions < 1)
>             max.substitutions <- ceiling(n * max.substitutions)
>     }
> 
>     .Internal(agrep(pattern, x, ignore.case, value, max.distance,
>                     max.deletions, max.insertions, max.substitutions))
> }
> 
> 
> ------------------------------------------------------------------------
> 
> \name{grep}
> \title{Pattern Matching and Replacement}
> \alias{grep}
> \alias{sub}
> \alias{gsub}
> \alias{regexpr}
> \alias{gregexpr}
> \description{
>   \code{grep} searches for matches to \code{pattern} (its first
>   argument) within the character vector \code{x} (second argument).
>   \code{regexpr} and \code{gregexpr} do too, but return more detail in
>   a different format.
> 
>   \code{sub} and \code{gsub} perform replacement of matches determined
>   by regular expression matching.
> }
> \usage{
> grep(pattern, x, ignore.case = FALSE, extended = TRUE,
>      perl = FALSE, value = FALSE, fixed = FALSE, 
>      useBytes = FALSE, invert = FALSE)
> 
> sub(pattern, replacement, x,
>     ignore.case = FALSE, extended = TRUE, perl = FALSE,
>     fixed = FALSE, useBytes = FALSE)
> 
> gsub(pattern, replacement, x,
>      ignore.case = FALSE, extended = TRUE, perl = FALSE,
>      fixed = FALSE, useBytes = FALSE)
> 
> regexpr(pattern, text, extended = TRUE, perl = FALSE,
>         fixed = FALSE, useBytes = FALSE)
> 
> gregexpr(pattern, text, extended = TRUE, perl = FALSE,
>          fixed = FALSE, useBytes = FALSE)
> }
> \arguments{
>   \item{pattern}{character string containing a \link{regular expression}
>     (or character string for \code{fixed = TRUE}) to be matched
>     in the given character vector.  Coerced by
>     \code{\link{as.character}} to a character string if possible.}
>   \item{x, text}{a character vector where matches are sought, or an
>     object which can be coerced by \code{as.character} to a character vector.}
>   \item{ignore.case}{if \code{FALSE}, the pattern matching is \emph{case
>       sensitive} and if \code{TRUE}, case is ignored during matching.}
>   \item{extended}{if \code{TRUE}, extended regular expression matching
>     is used, and if \code{FALSE} basic regular expressions are used.}
>   \item{perl}{logical. Should perl-compatible regexps be used?
>     Has priority over \code{extended}.}
>   \item{value}{if \code{FALSE}, a vector containing the (\code{integer})
>     indices of the matches determined by \code{grep} is returned, and if
>     \code{TRUE}, a vector containing the matching elements themselves is
>     returned.}
>   \item{fixed}{logical.  If \code{TRUE}, \code{pattern} is a string to be
>     matched as is.  Overrides all conflicting arguments.}
>   \item{useBytes}{logical.  If \code{TRUE} the matching is done
>     byte-by-byte rather than character-by-character.  See Details.}
>   \item{replacement}{a replacement for matched pattern in \code{sub} and
>     \code{gsub}.  Coerced to character if possible.  For \code{fixed =
>       FALSE} this can include backreferences \code{"\\1"} to
>     \code{"\\9"} to parenthesized subexpressions of \code{pattern}.  For
>     \code{perl = TRUE} only, it can also contain \code{"\\U"} or
>     \code{"\\L"} to convert the rest of the replacement to upper or
>     lower case.
>   }
>   \item{invert}{logical. If \code{TRUE}, then the match is reversed 
>   in order to get the items (or the values) that are \emph{not} matching the 
>   pattern}
> }
> \details{
>   Arguments which should be character strings or character vectors are
>   coerced to character if possible.
> 
>   The two \code{*sub} functions differ only in that \code{sub} replaces
>   only the first occurrence of a \code{pattern} whereas \code{gsub}
>   replaces all occurrences.
> 
>   For \code{regexpr} it is an error for \code{pattern} to be \code{NA},
>   otherwise \code{NA} is permitted and matches only itself.
> 
>   The regular expressions used are those specified by POSIX 1003.2,
>   either extended or basic, depending on the value of the
>   \code{extended} argument, unless \code{perl = TRUE} when they are
>   those of PCRE, \url{http://www.pcre.org/}.
>   (The exact set of patterns supported may depend on the version of
>   PCRE installed on the system in use, if \R was configured to use the
>   system PCRE.  \R's internal copy used PCRE 6.7.)
> 
>   \code{useBytes} is only used if \code{fixed = TRUE} or \code{perl = TRUE}.
>   For \code{grep} its main effect is to avoid errors/warnings about
>   invalid inputs, but for \code{regexpr} it changes the interpretation
>   of the output.
> }
> \value{
>   For \code{grep} a vector giving either the indices of the elements of
>   \code{x} that yielded a match or, if \code{value} is \code{TRUE}, the
>   matched elements of \code{x} (after coercion, preserving names but no
>   other attributes).
> 
>   For \code{sub} and \code{gsub} a character vector of the same length
>   and with the same attributes as \code{x} (after possible coercion).
> 
>   For \code{regexpr} an integer vector of the same length as \code{text}
>   giving the starting position of the first match, or \eqn{-1} if there
>   is none, with attribute \code{"match.length"} giving the length of the
>   matched text (or \eqn{-1} for no match).  In a multi-byte locale these
>   quantities are in characters rather than bytes unless
>   \code{useBytes = TRUE} is used with \code{fixed = TRUE} or
>   \code{perl = TRUE}.
> 
>   For \code{gregexpr} a list of the same length as \code{text} each
>   element of which is an integer vector as in \code{regexpr}, except
>   that the starting positions of every match are given.
> 
>   If in a multi-byte locale the pattern or replacement is not a valid
>   sequence of bytes, an error is thrown.  An invalid string in \code{x}
>   or \code{text} is a non-match with a warning for \code{grep} or
>   \code{regexpr}, but an error for \code{sub} or \code{gsub}.
> }
> \section{Warning}{
>   The standard regular-expression code has been reported to be very slow
>   when applied to extremely long character strings
>   (tens of thousands of characters or more): the code used when
>   \code{perl = TRUE} seems much faster and more reliable for such
>   usages.
> 
>   The standard version of \code{gsub} does not substitute correctly
>   repeated word-boundaries (e.g. \code{pattern = "\\b"}).
>   Use \code{perl = TRUE} for such matches.
> 
>   The \code{perl = TRUE} option is only implemented for single-byte and
>   UTF-8 encodings, and will warn if used in a non-UTF-8 multi-byte
>   locale (unless \code{useBytes = TRUE}).
> }
> \references{
>   Becker, R. A., Chambers, J. M. and Wilks, A. R. (1988)
>   \emph{The New S Language}.
>   Wadsworth \& Brooks/Cole (\code{grep})
> }
> \seealso{
>   \link{regular expression} (aka \code{\link{regexp}}) for the details
> % the `aka' above is for ESS (and ?reg....) where a space is problematic
>   of the pattern specification.
> 
>   \code{\link{glob2rx}} to turn wildcard matches into regular expressions.
>   
>   \code{\link{agrep}} for approximate matching.
> 
>   \code{\link{tolower}}, \code{\link{toupper}} and \code{\link{chartr}}
>   for character translations.
>   \code{\link{charmatch}}, \code{\link{pmatch}}, \code{\link{match}}.
>   \code{\link{apropos}} uses regexps and has nice examples.
> }
> \examples{
> grep("[a-z]", letters)
> 
> txt <- c("arm","foot","lefroo", "bafoobar")
> if(any(i <- grep("foo",txt)))
>    cat("'foo' appears at least once in\n\t",txt,"\n")
> i # 2 and 4
> txt[i]
> 
> ## Double all 'a' or 'b's;  "\\" must be escaped, i.e., 'doubled'
> %% and escaped even once more in this *.Rd file!
> gsub("([ab])", "\\\\1_\\\\1_", "abc and ABC")
> 
> txt <- c("The", "licenses", "for", "most", "software", "are",
>   "designed", "to", "take", "away", "your", "freedom",
>   "to", "share", "and", "change", "it.",
>    "", "By", "contrast,", "the", "GNU", "General", "Public", "License",
>    "is", "intended", "to", "guarantee", "your", "freedom", "to",
>    "share", "and", "change", "free", "software", "--",
>    "to", "make", "sure", "the", "software", "is",
>    "free", "for", "all", "its", "users")
> ( i <- grep("[gu]", txt) ) # indices
> stopifnot( txt[i] == grep("[gu]", txt, value = TRUE) )
> 
> ## Note that in locales such as en_US this includes B as the
> ## collation order is aAbBcCdEe ...
> (ot <- sub("[b-e]",".", txt))
> txt[ot != gsub("[b-e]",".", txt)]#- gsub does "global" substitution
> 
> txt[gsub("g","#", txt) !=
>     gsub("g","#", txt, ignore.case = TRUE)] # the "G" words
> 
> regexpr("en", txt)
> 
> gregexpr("e", txt)
> 
> ## trim trailing white space
> str = 'Now is the time      '
> sub(' +$', '', str)  ## spaces only
> sub('[[:space:]]+$', '', str) ## white space, POSIX-style
> sub('\\\\s+$', '', str, perl = TRUE) ## Perl-style white space
> 
> ## capitalizing
> gsub("(\\\\w)(\\\\w*)", "\\\\U\\\\1\\\\L\\\\2", "a test of capitalizing", perl=TRUE)
> gsub("\\\\b(\\\\w)", "\\\\U\\\\1", "a test of capitalizing", perl=TRUE)
> }
> \keyword{character}
> \keyword{utilities}
> 
> 
> ------------------------------------------------------------------------
> 
> ______________________________________________
> R-devel at r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-devel