[R] Extracting Comments from Functions/Packages

Thu Oct 7 18:30:24 CEST 2021

Dear R Users,

I wrote a minimal parser to extract strings and comments from the 
function definitions.

The string extraction works fine. But there are no comments:

a.) Are the comments stripped from the compiled packages?

b.) Alternatively: Is the deparse() not suited for this task?

b.2.) Is deparse() parsing the function/expression itself?

[see code for extract.str.fun() function below]

### All strings in "base"
extract.str.pkg("base")
# type = 2 for Comments:
extract.str.pkg("base", type=2)
extract.str.pkg("sp", type=2)
extract.str.pkg("NetLogoR", type=2)

The code for the 2 functions (extract.str.pkg & extract.str.fun) and the 
code for the parse.simple() parser are below.

Sincerely,

Leonard

=======

The latest code is on GitHub:

https://github.com/discoleo/R/blob/master/Stat/Tools.Formulas.R

### Code to process functions in packages:
extract.str.fun = function(fn, pkg, type=1, strip=TRUE) {
     fn = as.symbol(fn); pkg = as.symbol(pkg);
     fn = list(substitute(pkg ::: fn));
     # deparse
     s = paste0(do.call(deparse, fn), collapse="");
     npos = parse.simple(s);
     extract.str(s, npos[[type]], strip=strip)
}
extract.str.pkg = function(pkg, type=1, exclude.z = TRUE, strip=TRUE) {
     nms = ls(getNamespace(pkg));
     l = lapply(nms, function(fn) extract.str.fun(fn, pkg, type=type, 
strip=strip));
     if(exclude.z) {
         hasStr = sapply(l, function(s) length(s) >= 1);
         nms = nms[hasStr];
         l = l[hasStr];
     }
     names(l) = nms;
     return(l);
}

### minimal Parser:
# - proof of concept;
# - may be useful to process non-conformant R "code", e.g.:
#   "{\"abc\" + \"bcd\"} {FUN}"; (still TODO)
# Warning:
# - not thoroughly checked &
#   may be a little buggy!

parse.simple = function(x, eol="\n") {
     len = nchar(x);
     n.comm = list(integer(0), integer(0));
     n.str  = list(integer(0), integer(0));
     is.hex = function(ch) {
         # Note: only for 1 character!
         return((ch >= "0" && ch <= "9") ||
             (ch >= "A" && ch <= "F") ||
             (ch >= "a" && ch <= "f"));
     }
     npos = 1;
     while(npos <= len) {
         s = substr(x, npos, npos);
         # State: COMMENT
         if(s == "#") {
             n.comm[[1]] = c(n.comm[[1]], npos);
             while(npos < len) {
                 npos = npos + 1;
                 if(substr(x, npos, npos) == eol) break;
             }
             n.comm[[2]] = c(n.comm[[2]], npos);
             npos = npos + 1; next;
         }
         # State: STRING
         if(s == "\"" || s == "'") {
             n.str[[1]] = c(n.str[[1]], npos);
             while(npos < len) {
                 npos = npos + 1;
                 se = substr(x, npos, npos);
                 if(se == "\\") {
                     npos = npos + 1;
                     # simple escape vs Unicode:
                     if(substr(x, npos, npos) != "u") next;
                     len.end = min(len, npos + 4);
                     npos = npos + 1;
                     isAllHex = TRUE;
                     while(npos <= len.end) {
                         se = substr(x, npos, npos);
                         if( ! is.hex(se)) { isAllHex = FALSE; break; }
                         npos = npos + 1;
                     }
                     if(isAllHex) next;
                 }
                 if(se == s) break;
             }
             n.str[[2]] = c(n.str[[2]], npos);
             npos = npos + 1; next;
         }
         npos = npos + 1;
     }
     return(list(str = n.str, comm = n.comm));
}

extract.str = function(s, npos, strip=FALSE) {
     if(length(npos[[1]]) == 0) return(character(0));
     strip.FUN = if(strip) {
             function(id) {
                 if(npos[[1]][[id]] + 1 < npos[[2]][[id]]) {
                     nStart = npos[[1]][[id]] + 1;
                     nEnd = npos[[2]][[id]] - 1; # TODO: Error with 
malformed string
                     return(substr(s, nStart, nEnd));
                 } else {
                     return("");
                 }
             }
         } else function(id) substr(s, npos[[1]][[id]], npos[[2]][[id]]);
     sapply(seq(length(npos[[1]])), strip.FUN);
}