[Rd] "bug" and patch: quadratic running time for strsplit(..., fixed=TRUE) (PR#9902)
jbrzusto at fastmail.fm
jbrzusto at fastmail.fm
Fri Sep 7 15:47:26 CEST 2007
Full_Name: John Brzustowski
Version: R-devel-trunk, R-2.4.0
OS: linux, gcc 4.0.3
Submission from: (NULL) (206.248.157.184)
This isn't a bug, but an easily-remedied performance issue.
SYMPTOM
> for (i in 1000 * (1:20)) {
y <- paste(rep("asdf", times=i), collapse=" ")
t <- system.time(strsplit(y, " ", fixed=TRUE))
cat(sprintf("i=%5d time=%5d msec\n",i, round(1000*t[1])))
}
i= 1000 time= 2 msec
i= 2000 time= 9 msec
i= 3000 time= 20 msec
i= 4000 time= 34 msec
i= 5000 time= 57 msec
i= 6000 time= 77 msec
i= 7000 time= 107 msec
i= 8000 time= 136 msec
i= 9000 time= 177 msec
i=10000 time= 230 msec
i=11000 time= 275 msec
i=12000 time= 308 msec
i=13000 time= 371 msec
i=14000 time= 446 msec
i=15000 time= 544 msec
i=16000 time= 639 msec
i=17000 time= 726 msec
i=18000 time= 864 msec
i=19000 time= 944 msec
i=20000 time= 1106 msec
DIAGNOSIS
strsplit() uses strlen() in the bounds check clause of a for(;;)
statement, which forces a full scan of the source string for each
character in the source string. Unlike R's LENGTH() macro, strlen for
C strings is an expensive operation, and in this case (at least),
gcc 4.0.3's -O2 level optimizer is not able to recognize the call as a loop
invariant, despite the declaration "const char *buf".
REMEDIED BEHAVIOUR
i= 1000 time= 0 msec
i= 2000 time= 1 msec
i= 3000 time= 1 msec
i= 4000 time= 0 msec
i= 5000 time= 1 msec
i= 6000 time= 1 msec
i= 7000 time= 1 msec
i= 8000 time= 2 msec
i= 9000 time= 2 msec
i=10000 time= 2 msec
i=11000 time= 2 msec
i=12000 time= 2 msec
i=13000 time= 2 msec
i=14000 time= 2 msec
i=15000 time= 4 msec
i=16000 time= 3 msec
i=17000 time= 3 msec
i=18000 time= 4 msec
i=19000 time= 3 msec
i=20000 time= 4 msec
RELATED ISSUES
A simple search turns up other instances of this usage in R's source.
For completeness, I'm submitting patches for all of them, but have not
tested whether they in fact cause a detectable performance problem.
In the case of modules/X11/dataentry.c, the patch also fixes a presumably
ineffectual "bug".
$ grep -nR "for *([^;]*;[^;]*strlen *(" *
main/rlocale.c:137: for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
main/printutils.c:486: for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
main/sysutils.c:493: for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
modules/X11/rotated.c:608: for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:856: for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1399: for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:1797: for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2045: for(i=0; i<strlen(text)-1; i++)
modules/X11/rotated.c:2339: for(i=0; i<strlen(text)-1; i++)
modules/X11/dataentry.c:1358: for(i = 0; i < strlen(text); i++) *bufp++ =
text[i];
PATCHES (against trunk)
Only the first is required to fix strsplit().
Index: src/main/character.c
===================================================================
--- src/main/character.c (revision 42792)
+++ src/main/character.c (working copy)
@@ -357,7 +357,7 @@
int i, j, len, tlen, ntok, slen;
int extended_opt, cflags, fixed_opt, perl_opt;
char *pt = NULL;
- const char *buf, *split = "", *bufp, *laststart;
+ const char *buf, *split = "", *bufp, *laststart, *ebuf = NULL;
regex_t reg;
regmatch_t regmatch[1];
pcre *re_pcre = NULL;
@@ -419,7 +419,8 @@
if(fixed_opt) {
/* This is UTF-8 safe since it compares whole strings */
laststart = buf;
- for(bufp = buf; bufp - buf < strlen(buf); bufp++) {
+ ebuf = buf + strlen(buf);
+ for(bufp = buf; bufp < ebuf; bufp++) {
if((slen == 1 && *bufp != *split) ||
(slen > 1 && strncmp(bufp, split, slen))) continue;
ntok++;
@@ -480,7 +481,7 @@
/* This is UTF-8 safe since it compares whole strings,
but it would be more efficient to skip along by chars.
*/
- for(; bufp - buf < strlen(buf); bufp++) {
+ for(; bufp < ebuf; bufp++) {
if((slen == 1 && *bufp != *split) ||
(slen > 1 && strncmp(bufp, split, slen))) continue;
if(slen) {
Index: src/main/rlocale.c
===================================================================
--- src/main/rlocale.c (revision 42792)
+++ src/main/rlocale.c (working copy)
@@ -127,14 +127,14 @@
int Ri18n_wcwidth(wchar_t c)
{
char lc_str[128];
- unsigned int i;
+ unsigned int i, j;
static char *lc_cache = "";
static int lc = 0;
if (0 != strcmp(setlocale(LC_CTYPE, NULL), lc_cache)) {
strncpy(lc_str, setlocale(LC_CTYPE, NULL), sizeof(lc_str));
- for (i = 0; i < strlen(lc_str) && i < sizeof(lc_str); i++)
+ for (i = 0, j = strlen(lc_str); i < j && i < sizeof(lc_str); i++)
lc_str[i] = toupper(lc_str[i]);
for (i = 0; i < (sizeof(cjk_locale_name)/sizeof(cjk_locale_name_t));
i++) {
Index: src/main/printutils.c
===================================================================
--- src/main/printutils.c (revision 42792)
+++ src/main/printutils.c (working copy)
@@ -483,7 +483,8 @@
else
#endif
snprintf(buf, 11, "\\u%04x", k);
- for(j = 0; j < strlen(buf); j++) *q++ = buf[j];
+ memcpy(q, buf, j = strlen(buf));
+ q += j;
p += res;
}
i += (res - 1);
Index: src/main/sysutils.c
===================================================================
--- src/main/sysutils.c (revision 42792)
+++ src/main/sysutils.c (working copy)
@@ -490,8 +490,9 @@
R_AllocStringBuffer(2*cbuff.bufsize, &cbuff);
goto top_of_loop;
}
- for(j = 0; j < strlen(sub); j++) *outbuf++ = sub[j];
- outb -= strlen(sub);
+ memcpy(outbuf, sub, j = strlen(sub));
+ outbuf += j;
+ outb -= j;
}
inbuf++; inb--;
goto next_char;
Index: src/modules/X11/rotated.c
===================================================================
--- src/modules/X11/rotated.c (revision 42792)
+++ src/modules/X11/rotated.c (working copy)
@@ -605,7 +605,7 @@
/* count number of sections in string */
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
nl++;
@@ -853,7 +853,7 @@
/* count number of sections in string */
item->nl=1;
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
item->nl++;
@@ -1396,7 +1396,7 @@
/* count number of sections in string */
nl=1;
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
nl++;
@@ -1794,7 +1794,7 @@
/* count number of sections in string */
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
nl++;
@@ -2042,7 +2042,7 @@
/* count number of sections in string */
item->nl=1;
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
item->nl++;
@@ -2336,7 +2336,7 @@
/* count number of sections in string */
nl=1;
if(align!=NONE)
- for(i=0; i<strlen(text)-1; i++)
+ for(i=strlen(text)-2; i >= 0; i--)
if(text[i]=='\n')
nl++;
Index: src/modules/X11/dataentry.c
===================================================================
--- src/modules/X11/dataentry.c (revision 42792)
+++ src/modules/X11/dataentry.c (working copy)
@@ -1232,7 +1232,7 @@
do as we get a char at a time */
static void handlechar(DEstruct DE, char *text)
{
- int i, c = text[0];
+ int i, c = text[0], j;
#ifdef USE_FONTSET
wchar_t wcs[BOOSTED_BUF_SIZE];
@@ -1355,9 +1355,13 @@
goto donehc;
}
- for(i = 0; i < strlen(text); i++) *bufp++ = text[i];
- *(bufp+1) = '\0';
- clength += strlen(text);
+ /* as originally written, this left an undefined byte at
+ the end of bufp, followed by a zero byte; luckily, the storage
+ pointed to by bufp had already been zeroed, so the undefined
+ byte was in fact zero. */
+ strcpy(bufp, text);
+ bufp += (j = strlen(text));
+ clength += j;
printstring(DE, buf, clength, DE->crow, DE->ccol, 1);
return;
More information about the R-devel
mailing list