[RsR] Problem re-implementing whimed from package robustbase

Fri Jun 29 14:55:34 CEST 2012

Replacing

     w_tot=w.sum();

by

     w_tot=w.head(n).sum();

in the whimed function did the trick.
(~a classic)

On 06/24/2012 11:05 PM, Andreas Alfons wrote:
> I only had a quick look at the code and did not have time to try it
> myself, but you made many more changes to the code besides swapping
> the C arrays to C++ Eigen data types.
>
> I would suggest one of the following approaches to go further:
>
> 1) The original C code has been used for a long time and is proved to
> be reliable. If you need to use the EIgen data types for your
> application, you could simply write a wrapper function.
>
> 2) If you really need to re-implement it, I would start by changing
> only as little as necessary to make it work with the Eigen data types.
> If you first have something that works, you can then incrementally make
> more changes to take advantage of the Eigen library.
>
> - Andreas
>
>
> On Sat, Jun 23, 2012 at 9:27 PM, Kaveh Vakili
> <Kaveh.Vakili using wis.kuleuven.be> wrote:
>> Ok, "for make benefit" of future generations, i've attached to this
>> mail the ,stand alone' version of qn_sn.c (i've removed sn and 200
>> loc of bell and whistles to make it shorter).
>>
>> I guess now i will feed both this and my implementation to a
>> debugger and see what is causing the strange divergence....
>>
>> #include <stdlib.h>
>> #include <stdbool.h>
>> #include <stdio.h>
>> #include <inttypes.h>
>> #include <R.h>
>> #include <Rmath.h> /* -> <math.h> and much more */
>>
>> double qn0(double *x, int n);
>> double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int
>> *w_cand);
>> double pull(double *a_in, int n, int k);
>> int cmp (const void *a, const void *b);
>> int main();
>>
>>
>> double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int
>> *w_cand){
>> /*
>>   Algorithm to compute the weighted high median in O(n) time.
>>
>>   The whimed is defined as the smallest a[j] such that the sum
>>   of the weights of all a[i] <= a[j] is strictly greater than
>>   half of the total weight.
>>
>>   Arguments:
>>
>>   a: double array containing the observations
>>   n: number of observations
>>   w: array of (int/double) weights of the observations.
>> */
>>
>>     int n2, i, kcand,mm=0,vv1=1;
>>     /* sum of weights: `int' do overflow when  n ~>= 1e5 */
>>     int wleft, wmid, wright, w_tot, wrest;
>>
>>     double trial,s_apsrt;
>>
>>     w_tot = 0;
>>     for (i = 0; i < n; ++i)
>>         w_tot += w[i];
>>     wrest = 0;
>>
>> /* REPEAT : */
>>     do {
>>         for (i = 0; i < n; ++i){
>>             a_psrt[i] = a[i];
>>         }
>>         n2 = n/2;/* =^= n/2 +1 with 0-indexing */
>>         trial = pull(a_psrt, n, n2);;
>>         wleft = 0;    wmid  = 0;    wright= 0;
>>         for (i = 0; i < n; ++i) {
>>             if (a[i] < trial)   wleft += w[i];
>>             else if (a[i] > trial) wright += w[i];
>>             else wmid += w[i];
>>         }
>>         /* wleft = sum_{i; a[i]  < trial}  w[i]
>>          * wmid  = sum_{i; a[i] == trial}  w[i] at least one 'i' since trial is
>> one a[]!
>>          * wright= sum_{i; a[i]  > trial}  w[i]
>>          */
>>         kcand = 0;
>>         if (2 * (wrest + wleft) > w_tot) {
>>             for (i = 0; i < n; ++i) {
>>                 if (a[i] < trial) {
>>                     a_cand[kcand] = a[i];
>>                     w_cand[kcand] = w[i];
>>                     ++kcand;
>>                 }
>>             }
>>         }
>>         else if (2 * (wrest + wleft + wmid) <= w_tot) {
>>             for (i = 0; i < n; ++i) {
>>                 if (a[i] > trial) {
>>                     a_cand[kcand] = a[i];
>>                     w_cand[kcand] = w[i];
>>                     ++kcand;
>>                 }
>>             }
>>             wrest += wleft + wmid;
>>         }
>>         else {
>>             return trial;
>>         }
>>         mm++;
>>         n = kcand;
>>         s_apsrt=0;
>>         for (i = 0;i<n;++i) {
>>             a[i] = a_cand[i];
>>             w[i] = w_cand[i];
>>             s_apsrt += a[i];
>>         }
>>
>>     } while(1);
>>
>> } /* _WHIMED_ */
>>
>> /* Main routines for C API */
>> //double qn(double *x, int n, int finite_corr);
>> /* these have no extra factors (no consistency factor & finite_corr): */
>>
>>
>> /* ----------- Implementations -----------------------------------*/
>>
>> //void Qn0(double *x, Sint *n, double *res) { *res = qn0(x, (int)*n); }
>>
>> int cmp (const void *pa, const void *pb){
>>         double a = *(const double*)pa;
>>    double b = *(const double*)pb;
>>   return (int) (a - b);
>> }
>>
>> double qn0(double *x, int n){
>> /*--------------------------------------------------------------------
>>
>>    Efficient algorithm for the scale estimator:
>>
>>        Q*_n = { |x_i - x_j|; i<j }_(k)  [= Qn without scaling ]
>>
>>                 i.e. the k-th order statistic of the |x_i - x_j|
>>
>>    Parameters of the function Qn :
>>        x  : double array containing the observations
>>        n  : number of observations (n >=2)
>>   */
>>
>>     double *y     = (double *)calloc(n, sizeof(double));
>>     double *work  = (double *)calloc(n, sizeof(double));
>>     double *a_psrt = (double *)calloc(n, sizeof(double));
>>     double *a_cand = (double *)calloc(n, sizeof(double));
>>
>>     int *left     = (int *)calloc(n, sizeof(int));
>>     int *right    = (int *)calloc(n, sizeof(int));
>>     int *p        = (int *)calloc(n, sizeof(int));
>>     int *q        = (int *)calloc(n, sizeof(int));
>>     int *weight   = (int *)calloc(n, sizeof(int));
>>
>>     double trial;
>>     bool found;
>>
>>     double w_tot=0.0;
>>
>>     int h, i, j,jj,jh,vv2=3,ll=0;
>>     /* Following should be `long long int' : they can be of order n^2 */
>>     int64_t k, knew, nl,nr, sump,sumq;
>>
>>     h = n / 2 + 1;
>>     k = (int64_t)h * (h - 1) / 2;
>>     for (i = 0; i < n; ++i) {
>>         y[i] = x[i];
>>         left [i] = n - i + 1;
>>         right[i] = (i <= h) ? n : n - (i - h);
>>         /* the n - (i-h) is from the paper; original code had `n' */
>>     }
>>   //  R_qsort(y, 1, n); /* y := sort(x) */
>>     qsort(y,n,sizeof(double),cmp);
>>     nl = (int64_t)n * (n + 1) / 2;
>>     nr = (int64_t)n * n;
>>     knew = k + nl;/* = k + (n+1 \over 2) */
>>     found = FALSE;
>> #ifdef DEBUG_qn
>>     REprintf("qn0(): h,k= %2d,%2d;  nl,nr= %d,%d\n", h,k, nl,nr);
>> #endif
>> /* L200: */
>>     while(!found && nr - nl > n) {
>>         j = 0;
>>         /* Truncation to float :
>>            try to make sure that the same values are got later (guard bits !) */
>>         for (i = 1; i < n; ++i) {
>>             if (left[i] <= right[i]) {
>>                 weight[j] = right[i] - left[i] + 1;
>>                 jh = left[i] + weight[j] / 2;
>>                 work[j] = (float)(y[i] - y[n - jh]);
>>                 ++j;
>>             }
>>         }
>>         trial = whimed_i(work, weight, j, a_cand, a_psrt, /*iw_cand*/ p);
>> #ifdef DEBUG_qn
>>         REprintf(" ..!found: whimed(");
>> #  ifdef DEBUG_long
>>         REprintf("wrk=c(");
>>         for(i=0; i < j; i++) REprintf("%s%g", (i>0)? ", " : "", work[i]);
>>         REprintf("),\n     wgt=c(");
>>         for(i=0; i < j; i++) REprintf("%s%d", (i>0)? ", " : "", weight[i]);
>>         REprintf("), j= %3d) -> trial= %7g\n", j, trial);
>> #  else
>>         REprintf("j=%3d) -> trial= %g:", j, trial);
>> #  endif
>> #endif
>>         j = 0;
>>         for (i = n - 1; i >= 0; --i) {
>>             while (j < n && ((float)(y[i] - y[n - j - 1])) < trial)
>>                 ++j;
>>             p[i] = j;
>>         }
>> #ifdef DEBUG_qn
>>         REprintf(" f_1: j=%2d", j);
>> #endif
>>         j = n + 1;
>>         for (i = 0; i < n; ++i) {
>>             while ((float)(y[i] - y[n - j + 1]) > trial)
>>                 --j;
>>             q[i] = j;
>>         }
>>         sump = 0;
>>         sumq = 0;
>>         for (i = 0; i < n; ++i) {
>>             sump += p[i];
>>             sumq += q[i] - 1;
>>         }
>> #ifdef DEBUG_qn
>>         REprintf(" f_2 -> j=%2d, sump|q= %lld,%lld", j, sump,sumq);
>> #endif
>>         if (knew <= sump) {
>>             for (i = 0; i < n; ++i)
>>                 right[i] = p[i];
>>             nr = sump;
>> #ifdef DEBUG_qn
>>             REprintf("knew <= sump =: nr , new right[]\n");
>> #endif
>>         } else if (knew > sumq) {
>>             for (i = 0; i < n; ++i)
>>                 left[i] = q[i];
>>             nl = sumq;
>> #ifdef DEBUG_qn
>>             REprintf("knew > sumq =: nl , new left[]\n");
>> #endif
>>         } else { /* sump < knew <= sumq */
>>             found = TRUE;
>> #ifdef DEBUG_qn
>>             REprintf("sump < knew <= sumq ---> FOUND\n");
>> #endif
>>         }
>>         ll++;
>>         if(ll>vv2)      found=TRUE;
>> //      printf("%lf\n",trial);
>>
>>     } /* while */
>>
>>     if (found)
>>         return trial;
>>     else {
>> #ifdef DEBUG_qn
>>         REprintf(".. not fnd -> new work[]");
>> #endif
>>         j = 0;
>>         for (i = 1; i < n; ++i) {
>>             for (jj = left[i]; jj <= right[i]; ++jj) {
>>                 work[j] = y[i] - y[n - jj];
>>                 j++;
>>             }/* j will be = sum_{i=2}^n (right[i] - left[i] + 1)_{+}  */
>>         }
>> #ifdef DEBUG_qn
>>         REprintf(" of length %d; knew-nl=%d\n", j, knew-nl);
>> #endif
>>
>>         return (pull(work, j - 1, knew - nl));
>> //      knew -= (nl + 1); /* -1: 0-indexing */
>> //      rPsort(work, j, knew);
>> //      return(work[knew]);
>>     }
>> } /* qn0 */
>>
>>
>> double pull(double *a_in, int n, int k){
>>     int j;
>>     double *a, ax;
>>   //  char* vmax = vmaxget();
>>     a = (double *)calloc(n, sizeof(double));
>>     /* Copy a[] and use copy since it will be re-shuffled: */
>>     for (j = 0; j < n; j++)
>>         a[j] = a_in[j];
>>
>>     k--; /* 0-indexing */
>>     qsort(a,n,sizeof(double),cmp);
>>     ax = a[k];
>>
>> //    vmaxset(vmax);
>>     return ax;
>> } /* pull */
>> int main(){
>>         FILE *inFile;
>>         double x[100],resu;
>>         int i,n=100,numRead=0;
>>         inFile=fopen("test.txt","r");
>>         if (!inFile)            return 1;
>>         for(i=0;i<n;i++) numRead=fscanf(inFile,"%lf",&x[i]);
>>         printf("%lf\n",x[0]);
>>         fclose(inFile);
>>         resu=qn0(x,n);
>>         printf("%lf\n",resu);
>>         return 0;
>> }
>>
>> _______________________________________________
>> R-SIG-Robust using r-project.org mailing list
>> https://stat.ethz.ch/mailman/listinfo/r-sig-robust
>
>