[Rd] "bug" and patch: quadratic running time for strsplit(..., fixed=TRUE) (PR#9902)

jbrzusto at fastmail.fm jbrzusto at fastmail.fm
Fri Sep 7 15:47:26 CEST 2007


Full_Name: John Brzustowski
Version: R-devel-trunk, R-2.4.0
OS: linux, gcc 4.0.3
Submission from: (NULL) (206.248.157.184)


This isn't a bug, but an easily-remedied performance issue.

SYMPTOM

> for (i in 1000 * (1:20)) {
   y <- paste(rep("asdf", times=i), collapse=" ")
   t <- system.time(strsplit(y, " ", fixed=TRUE))
   cat(sprintf("i=%5d time=%5d msec\n",i, round(1000*t[1])))
}

i= 1000 time=    2 msec
i= 2000 time=    9 msec
i= 3000 time=   20 msec
i= 4000 time=   34 msec
i= 5000 time=   57 msec
i= 6000 time=   77 msec
i= 7000 time=  107 msec
i= 8000 time=  136 msec
i= 9000 time=  177 msec
i=10000 time=  230 msec
i=11000 time=  275 msec
i=12000 time=  308 msec
i=13000 time=  371 msec
i=14000 time=  446 msec
i=15000 time=  544 msec
i=16000 time=  639 msec
i=17000 time=  726 msec
i=18000 time=  864 msec
i=19000 time=  944 msec
i=20000 time= 1106 msec

DIAGNOSIS

strsplit() uses strlen() in the bounds check clause of a for(;;)
statement, which forces a full scan of the source string for each
character in the source string.  Unlike R's LENGTH() macro, strlen for
C strings is an expensive operation, and in this case (at least),
gcc 4.0.3's -O2 level optimizer is not able to recognize the call as a loop
invariant, despite the declaration "const char *buf".

REMEDIED BEHAVIOUR

i= 1000 time=    0 msec
i= 2000 time=    1 msec
i= 3000 time=    1 msec
i= 4000 time=    0 msec
i= 5000 time=    1 msec
i= 6000 time=    1 msec
i= 7000 time=    1 msec
i= 8000 time=    2 msec
i= 9000 time=    2 msec
i=10000 time=    2 msec
i=11000 time=    2 msec
i=12000 time=    2 msec
i=13000 time=    2 msec
i=14000 time=    2 msec
i=15000 time=    4 msec
i=16000 time=    3 msec
i=17000 time=    3 msec
i=18000 time=    4 msec
i=19000 time=    3 msec
i=20000 time=    4 msec

RELATED ISSUES

A simple search turns up other instances of this usage in R's source.
For completeness, I'm submitting patches for all of them, but have not
tested whether they in fact cause a detectable performance problem.
In the case of modules/X11/dataentry.c, the patch also fixes a presumably
ineffectual "bug".

$ grep -nR "for *([^;]*;[^;]*strlen *(" *

main/rlocale.c:137:		for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
main/printutils.c:486:		for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
main/sysutils.c:493:		for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
modules/X11/rotated.c:608:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:856:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1399:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1797:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2045:	for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2339:	for(i=0; i<strlen(text)-1; i++)
modules/X11/dataentry.c:1358:   for(i = 0; i < strlen(text); i++) *bufp++ =
text[i];


PATCHES (against trunk)
Only the first is required to fix strsplit().

Index: src/main/character.c
===================================================================
--- src/main/character.c	(revision 42792)
+++ src/main/character.c	(working copy)
@@ -357,7 +357,7 @@
     int i, j, len, tlen, ntok, slen;
     int extended_opt, cflags, fixed_opt, perl_opt;
     char *pt = NULL;
-    const char *buf, *split = "", *bufp, *laststart;
+    const char *buf, *split = "", *bufp, *laststart, *ebuf = NULL;
     regex_t reg;
     regmatch_t regmatch[1];
     pcre *re_pcre = NULL;
@@ -419,7 +419,8 @@
 	    if(fixed_opt) {
 		/* This is UTF-8 safe since it compares whole strings */
 		laststart = buf;
-		for(bufp = buf; bufp - buf < strlen(buf); bufp++) {
+		ebuf = buf + strlen(buf);
+		for(bufp = buf; bufp < ebuf; bufp++) {
 		    if((slen == 1 && *bufp != *split) ||
 		       (slen > 1 && strncmp(bufp, split, slen))) continue;
 		    ntok++;
@@ -480,7 +481,7 @@
 		    /* This is UTF-8 safe since it compares whole strings,
 		       but it would be more efficient to skip along by chars.
 		     */
-		    for(; bufp - buf < strlen(buf); bufp++) {
+		    for(; bufp < ebuf; bufp++) {
 			if((slen == 1 && *bufp != *split) ||
 			   (slen > 1 && strncmp(bufp, split, slen))) continue;
 			if(slen) {
Index: src/main/rlocale.c
===================================================================
--- src/main/rlocale.c	(revision 42792)
+++ src/main/rlocale.c	(working copy)
@@ -127,14 +127,14 @@
 int Ri18n_wcwidth(wchar_t c)
 {
     char lc_str[128];
-    unsigned int i;
+    unsigned int i, j;
 
     static char *lc_cache = "";
     static int lc = 0;
 
     if (0 != strcmp(setlocale(LC_CTYPE, NULL), lc_cache)) {
 	strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str));
-	for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
+	for (i = 0, j = strlen(lc_str); i < j && i < sizeof(lc_str); i++)
 	    lc_str[i] = toupper(lc_str[i]);
 	for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t)); 
 	     i++) {
Index: src/main/printutils.c
===================================================================
--- src/main/printutils.c	(revision 42792)
+++ src/main/printutils.c	(working copy)
@@ -483,7 +483,8 @@
 			else
 #endif
 			    snprintf(buf, 11, "\\u%04x", k);
-			for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
+			memcpy(q, buf, j = strlen(buf));
+			q += j;
 			p += res;
 		    }
 		    i += (res - 1);
Index: src/main/sysutils.c
===================================================================
--- src/main/sysutils.c	(revision 42792)
+++ src/main/sysutils.c	(working copy)
@@ -490,8 +490,9 @@
 			R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
 			goto top_of_loop;
 		    }
-		    for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
-		    outb -= strlen(sub);
+		    memcpy(outbuf, sub, j = strlen(sub)); 
+		    outbuf += j;
+		    outb -= j;
 		}
 		inbuf++; inb--;
 		goto next_char;
Index: src/modules/X11/rotated.c
===================================================================
--- src/modules/X11/rotated.c	(revision 42792)
+++ src/modules/X11/rotated.c	(working copy)
@@ -605,7 +605,7 @@
 
     /* count number of sections in string */
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 
@@ -853,7 +853,7 @@
     /* count number of sections in string */
     item->nl=1;
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		item->nl++;
 
@@ -1396,7 +1396,7 @@
     /* count number of sections in string */
     nl=1;
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 
@@ -1794,7 +1794,7 @@
 
     /* count number of sections in string */
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 
@@ -2042,7 +2042,7 @@
     /* count number of sections in string */
     item->nl=1;
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		item->nl++;
 
@@ -2336,7 +2336,7 @@
     /* count number of sections in string */
     nl=1;
     if(align!=NONE)
-	for(i=0; i<strlen(text)-1; i++)
+	for(i=strlen(text)-2; i >= 0; i--)
 	    if(text[i]=='\n')
 		nl++;
 
Index: src/modules/X11/dataentry.c
===================================================================
--- src/modules/X11/dataentry.c	(revision 42792)
+++ src/modules/X11/dataentry.c	(working copy)
@@ -1232,7 +1232,7 @@
    do as we get a char at a time */
 static void handlechar(DEstruct DE, char *text)
 {
-    int i, c = text[0];
+    int i, c = text[0], j;
 #ifdef USE_FONTSET
     wchar_t wcs[BOOSTED_BUF_SIZE];
 
@@ -1355,9 +1355,13 @@
 	goto donehc;
     }
 
-    for(i = 0; i < strlen(text); i++) *bufp++ = text[i];
-    *(bufp+1) = '\0';
-    clength += strlen(text);
+    /* as originally written, this left an undefined byte at
+       the end of bufp, followed by a zero byte; luckily, the storage
+       pointed to by bufp had already been zeroed, so the undefined
+       byte was in fact zero.  */
+    strcpy(bufp, text);
+    bufp += (j = strlen(text));
+    clength += j;
     printstring(DE, buf, clength, DE->crow, DE->ccol, 1);
     return;



More information about the R-devel mailing list