[Rd] choose(n,k) when n is almost integer
Petr Savicky
savicky at cs.cas.cz
Tue Dec 22 14:25:15 CET 2009
On Sat, Dec 19, 2009 at 10:02:49PM +0100, Martin Maechler wrote:
> {I've changed the subject; it's really no longer about
> lchoose()'s definition}
>
[.......................]
>
> Thanks a lot, Petr, for your explorations.
> I agree that at least something like your conservative change
> should happen.
>
> Personally, I'd agree with you and prefer the "progressive"
> (non-conservative) change,
> basically getting rid of all R_IS_INT(n)
> tests, which seem kludgey.
>
> OTOH, the R core members who did start using R_IS_INT(n)
> must have had good reasons with another set examples.
>
> Have you tried 'make check-devel' (or 'check-all') also with the
> progressive change?
After some tests, i changed the two suggested patches. The new versions
will be called A and B.
Let us consider computing choose(n, k) with an integer k and a general n.
Patch A is a minimal one. It rounds n to be an exact integer, if we assume
this after the test R_IS_INT(n).
Patch B corresponds to the progressive older suggestion. It tries to avoid
rounding of n, if possible. Rounding is not needed, is k < 30 or if none
of the factors in the product n(n-1)...(n-k+1) is close to zero.
Patch A (without indentation in attachment):
--- R-devel-orig-sse/src/nmath/choose.c 2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-1-sse/src/nmath/choose.c 2009-12-22 09:06:32.000000000 +0100
@@ -109,6 +109,7 @@
MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
if (k < k_small_max) {
int j;
+ if (R_IS_INT(n)) n = floor(n + 0.5);
if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
if (k < 0) return 0.;
if (k == 0) return 1.;
@@ -126,6 +127,7 @@
return r;
}
else if (R_IS_INT(n)) {
+ n = floor(n + 0.5);
if(n < k) return 0.;
if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
return floor(exp(lfastchoose(n, k)) + 0.5);
Path B (without indentation in attachment):
--- R-devel-orig-sse/src/nmath/choose.c 2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-2-sse/src/nmath/choose.c 2009-12-22 13:49:33.000000000 +0100
@@ -93,13 +93,14 @@
return lfastchoose(n, k);
}
+#define IS_INT(x) ((x) == floor(x))
#define k_small_max 30
/* 30 is somewhat arbitrary: it is on the *safe* side:
* both speed and precision are clearly improved for k < 30.
*/
double choose(double n, double k)
{
- double r, k0 = k;
+ double r, k0 = k, n1;
k = floor(k + 0.5);
#ifdef IEEE_754
/* NaNs propagated correctly */
@@ -109,14 +110,14 @@
MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
if (k < k_small_max) {
int j;
- if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
+ if(n-k < k && n >= 0 && IS_INT(n)) k = n-k; /* <- Symmetry */
if (k < 0) return 0.;
if (k == 0) return 1.;
/* else: k >= 1 */
r = n;
for(j = 2; j <= k; j++)
- r *= (n-j+1)/j;
- return R_IS_INT(n) ? floor(r + 0.5) : r;
+ r *= (n-(j-1))/j;
+ return IS_INT(n) ? floor(r + 0.5) : r;
/* might have got rounding errors */
}
/* else: k >= k_small_max */
@@ -125,12 +126,15 @@
if (ODD(k)) r = -r;
return r;
}
- else if (R_IS_INT(n)) {
+ else if (IS_INT(n)) {
if(n < k) return 0.;
if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
return floor(exp(lfastchoose(n, k)) + 0.5);
}
/* else non-integer n >= 0 : */
+ n1 = floor(n + 0.5);
+ if (n1 <= k-1 && fabs(n1 - n) <= 1e-7)
+ return 0.;
if (n < k-1) {
int s_choose;
r = lfastchoose2(n, k, /* -> */ &s_choose);
@@ -140,5 +144,6 @@
}
#undef ODD
+#undef IS_INT
#undef R_IS_INT
#undef k_small_max
Both patches solve the main problems of the original implementation, namely
# for k < k_small_max
choose(5 - 1e-9, 5) # [1] 0
choose(5 + 1e-9, 5) # [1] 5
# for k >= k_small_max
choose(30 - 1e-9, 30) # [1] 0
choose(60 - 1e-9, 30) # Error: segfault from C stack overflow
and we get in 7 digit precision
# for k < k_small_max
choose(5 - 1e-9, 5) # [1] 1
choose(5 + 1e-9, 5) # [1] 1
# for k >= k_small_max
choose(30 - 1e-9, 30) # [1] 1
choose(60 - 1e-9, 30) # [1] 1.182646e+17
The patches do not modify lchoose(). I think, this should be done after
a decision how to modify choose().
Both patches go through make check-devel until the test of Tcl/Tk, which fails,
since i do not have Tcl/Tk on the machine, which i have now available. I will
perform the whole test later.
Both patches and also the original implementation sometimes generate a warning
of the following type
> choose(19 - 2e-7, 30)
[1] -3.328339e-16
Warning message:
In choose(19 - 2e-07, 30) :
full precision may not have been achieved in 'lgamma'
These warnings are generated in functions called from choose() and the intention
of this patch is not to change this. Most of these warnings are eliminated
in the original implementation by treating numbers, which differ from an
integer by less than 1e-7, as integers. However, there are some such cases
even if the difference is slightly larger than 1e-7 like the above 2e-7.
The cases, when the above warning is generated, are the same for both patches A and B
in the tests, which i did.
Patch B is significantly more accurate in cases, when the result is more than 1.
In order to verify these two claims, i used the test script at
http://www.cs.cas.cz/~savicky/R-devel/test_choose_0.R
Its outputs with A and B are presented below. The list of cases with warning
Cases with warning
[,1] [,2]
[1,] 6715 31
[2,] 152 33
[3,] 152 34
[4,] 6393 37
[5,] 9388 39
[6,] 6059 40
[7,] 8773 40
[8,] 5058 44
[9,] 5531 44
[10,] 6670 46
[11,] 652 47
[12,] 2172 49
[13,] 3371 49
[14,] 1913 50
is the same in all four tests. So, i only include the tables of errors, which
are different. The table contains columns "bound" and "error". The error is
the maximum error achieved in cases, where the compared values are at most the
bound on the same row and more than the bounds on the previous rows.
Patch A on Intel extended arithmetic
minimum log2() of a wrong result for integer n : 53.95423
maximum error for real n :
bound error
[1,] 0e+00 1.206026e-08
[2,] 1e-20 2.699943e-08
[3,] 1e-15 3.444711e-08
[4,] 1e-10 2.806709e-08
[5,] 1e-05 8.395916e-09
[6,] 1e+00 3.695634e-07
[7,] Inf 2.990987e-07
Patch A on SSE arithmetic
minimum log2() of a wrong result for integer n : 49.94785
maximum error for real n :
bound error
[1,] 0e+00 1.206026e-08
[2,] 1e-20 2.699943e-08
[3,] 1e-15 3.444709e-08
[4,] 1e-10 2.806707e-08
[5,] 1e-05 8.395951e-09
[6,] 1e+00 3.695634e-07
[7,] Inf 2.990987e-07
Patch B on Intel extended arithmetic
minimum log2() of a wrong result for integer n : 53.95423
maximum error for real n :
bound error
[1,] 0e+00 2.047988e-09
[2,] 1e-20 2.699943e-08
[3,] 1e-15 3.444711e-08
[4,] 1e-10 2.806709e-08
[5,] 1e-05 8.395916e-09
[6,] 1e+00 1.207856e-11
[7,] Inf 1.509903e-14
Patch B on SSE arithmetic
minimum log2() of a wrong result for integer n : 49.94785
maximum error for real n :
bound error
[1,] 0e+00 2.047988e-09
[2,] 1e-20 2.699943e-08
[3,] 1e-15 3.444709e-08
[4,] 1e-10 2.806707e-08
[5,] 1e-05 8.395951e-09
[6,] 1e+00 1.205369e-11
[7,] Inf 2.731149e-14
We can see that B has smaller error, if the output is more than 1.
For integer n, we can see that the smallest output values, where we
do not get the exact result is the same for A and B. There is a difference
between Intel extended arithmetic and SSE for clear reasons.
I suggest to use B and i appreciate comments.
Petr Savicky.
-------------- next part --------------
--- R-devel-orig-sse/src/nmath/choose.c 2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-1-sse/src/nmath/choose.c 2009-12-22 09:06:32.000000000 +0100
@@ -109,6 +109,7 @@
MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
if (k < k_small_max) {
int j;
+ if (R_IS_INT(n)) n = floor(n + 0.5);
if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
if (k < 0) return 0.;
if (k == 0) return 1.;
@@ -126,6 +127,7 @@
return r;
}
else if (R_IS_INT(n)) {
+ n = floor(n + 0.5);
if(n < k) return 0.;
if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
return floor(exp(lfastchoose(n, k)) + 0.5);
-------------- next part --------------
--- R-devel-orig-sse/src/nmath/choose.c 2009-12-17 17:52:39.000000000 +0100
+++ R-devel-work-copy-2-sse/src/nmath/choose.c 2009-12-22 13:49:33.000000000 +0100
@@ -93,13 +93,14 @@
return lfastchoose(n, k);
}
+#define IS_INT(x) ((x) == floor(x))
#define k_small_max 30
/* 30 is somewhat arbitrary: it is on the *safe* side:
* both speed and precision are clearly improved for k < 30.
*/
double choose(double n, double k)
{
- double r, k0 = k;
+ double r, k0 = k, n1;
k = floor(k + 0.5);
#ifdef IEEE_754
/* NaNs propagated correctly */
@@ -109,14 +110,14 @@
MATHLIB_WARNING2(_("'k' (%.2f) must be integer, rounded to %.0f"), k0, k);
if (k < k_small_max) {
int j;
- if(n-k < k && n >= 0 && R_IS_INT(n)) k = n-k; /* <- Symmetry */
+ if(n-k < k && n >= 0 && IS_INT(n)) k = n-k; /* <- Symmetry */
if (k < 0) return 0.;
if (k == 0) return 1.;
/* else: k >= 1 */
r = n;
for(j = 2; j <= k; j++)
- r *= (n-j+1)/j;
- return R_IS_INT(n) ? floor(r + 0.5) : r;
+ r *= (n-(j-1))/j;
+ return IS_INT(n) ? floor(r + 0.5) : r;
/* might have got rounding errors */
}
/* else: k >= k_small_max */
@@ -125,12 +126,15 @@
if (ODD(k)) r = -r;
return r;
}
- else if (R_IS_INT(n)) {
+ else if (IS_INT(n)) {
if(n < k) return 0.;
if(n - k < k_small_max) return choose(n, n-k); /* <- Symmetry */
return floor(exp(lfastchoose(n, k)) + 0.5);
}
/* else non-integer n >= 0 : */
+ n1 = floor(n + 0.5);
+ if (n1 <= k-1 && fabs(n1 - n) <= 1e-7)
+ return 0.;
if (n < k-1) {
int s_choose;
r = lfastchoose2(n, k, /* -> */ &s_choose);
@@ -140,5 +144,6 @@
}
#undef ODD
+#undef IS_INT
#undef R_IS_INT
#undef k_small_max
More information about the R-devel
mailing list