[Rd] chartr better

Ei-ji Nakama nakama at ki.rim.or.jp
Thu Sep 13 09:29:30 CEST 2007


For example, the following changes are necessary when i convert a
Japanese hiragana into katakana in chattr.

R code:
> chartr("\u3041-\u3093","\u30a1-\u30f3","\u3084\u3063\u305f\u30fc")

--- R-alpha.orig/src/main/character.c   2007-09-05 07:13:27.000000000 +0900
+++ R-alpha/src/main/character.c        2007-09-13 16:10:21.000000000 +0900
@@ -2041,6 +2041,16 @@
     return(c);
 }

+typedef struct { wchar_t c_old, c_new; } xtable_t;
+static inline int xtable_comp(const xtable_t *a, const xtable_t *b)
+{
+    return a->c_old - b->c_old;
+}
+static inline int xtable_key_comp(const wchar_t *a, const xtable_t *b)
+{
+    return *a - b->c_old;
+}
+
 SEXP attribute_hidden do_chartr(SEXP call, SEXP op, SEXP args, SEXP env)
 {
     SEXP old, _new, x, y;
@@ -2064,14 +2074,18 @@
 #ifdef SUPPORT_MBCS
     if(mbcslocale) {
         int j, nb, nc;
-        wchar_t xtable[65536 + 1], c_old, c_new, *wc;
+        xtable_t *xtable;
+        int       xtable_cnt;
+        wchar_t c_old, c_new, *wc;
         const char *xi, *s;
         struct wtr_spec *trs_old, **trs_old_ptr;
         struct wtr_spec *trs_new, **trs_new_ptr;
-
-        for(i = 0; i <= UCHAR_MAX; i++) xtable[i] = i;
+        struct wtr_spec *trs_cnt, **trs_cnt_ptr;

         /* Initialize the old and new wtr_spec lists. */
+        trs_cnt = Calloc(1, struct wtr_spec);
+        trs_cnt->type = WTR_INIT;
+        trs_cnt->next = NULL;
         trs_old = Calloc(1, struct wtr_spec);
         trs_old->type = WTR_INIT;
         trs_old->next = NULL;
@@ -2084,6 +2098,7 @@
         if(nc < 0) error(_("invalid multibyte string 'old'"));
         wc = (wchar_t *) R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
         mbstowcs(wc, s, nc + 1);
+        wtr_build_spec(wc, trs_cnt); /* use count only */
         wtr_build_spec(wc, trs_old);

        s = translateChar(STRING_ELT(_new, 0));
@@ -2096,38 +2111,54 @@
         /* Initialize the pointers for walking through the old and new
            wtr_spec lists and retrieving the next chars from the lists.
         */
+        trs_cnt_ptr = Calloc(1, struct wtr_spec *);
+        *trs_cnt_ptr = trs_cnt->next;
+       for( xtable_cnt = 0 ; wtr_get_next_char_from_spec(trs_cnt_ptr)
;xtable_cnt++ );
+       Free(trs_cnt_ptr);
+       xtable = (xtable_t *)R_alloc(xtable_cnt+1,sizeof(xtable_t));
+
         trs_old_ptr = Calloc(1, struct wtr_spec *);
         *trs_old_ptr = trs_old->next;
         trs_new_ptr = Calloc(1, struct wtr_spec *);
         *trs_new_ptr = trs_new->next;
-        for(;;) {
+        for(i=0;;i++) {
             c_old = wtr_get_next_char_from_spec(trs_old_ptr);
             c_new = wtr_get_next_char_from_spec(trs_new_ptr);
             if(c_old == '\0')
                 break;
             else if(c_new == '\0')
                 error(_("'old' is longer than 'new'"));
-            else
-                xtable[c_old] = c_new;
+            else{
+                xtable[i].c_old=c_old;
+                xtable[i].c_new=c_new;
+           }
         }
+
         /* Free the memory occupied by the wtr_spec lists. */
         wtr_free_spec(trs_old);
         wtr_free_spec(trs_new);
         Free(trs_old_ptr); Free(trs_new_ptr);

+        qsort(xtable, xtable_cnt, sizeof(xtable_t),
+              (int(*)(const void *, const void *))xtable_comp);
+
         n = LENGTH(x);
         PROTECT(y = allocVector(STRSXP, n));
         for(i = 0; i < n; i++) {
             if (STRING_ELT(x,i) == NA_STRING)
                 SET_STRING_ELT(y, i, NA_STRING);
             else {
+                xtable_t *tbl;
                 xi = translateChar(STRING_ELT(x, i));
                 nc = mbstowcs(NULL, xi, 0);
                 if(nc < 0)
                     error(_("invalid input multibyte string %d"), i+1);
                 wc = (wchar_t *)
R_AllocStringBuffer((nc+1)*sizeof(wchar_t), &cbuff);
                 mbstowcs(wc, xi, nc + 1);
-                for(j = 0; j < nc; j++) wc[j] = xtable[wc[j]];
+                for(j = 0; j < nc; j++)
+                    if (tbl = bsearch(&wc[j], xtable, xtable_cnt,
sizeof(xtable_t),
+                                      (int(*)(const void *, const
void *))xtable_key_comp))
+                        wc[j]=tbl->c_new;
                 nb = wcstombs(NULL, wc, 0);
                 cbuf = CallocCharBuf(nb);
                 wcstombs(cbuf, wc, nb + 1);

-- 
EI-JI Nakama  <nakama at ki.rim.or.jp>
"\u4e2d\u9593\u6804\u6cbb"  <nakama at ki.rim.or.jp>



More information about the R-devel mailing list