[RsR] Problem re-implementing whimed from package robustbase

Sun Jun 24 23:05:27 CEST 2012

I only had a quick look at the code and did not have time to try it
myself, but you made many more changes to the code besides swapping
the C arrays to C++ Eigen data types.

I would suggest one of the following approaches to go further:

1) The original C code has been used for a long time and is proved to
be reliable. If you need to use the EIgen data types for your
application, you could simply write a wrapper function.

2) If you really need to re-implement it, I would start by changing
only as little as necessary to make it work with the Eigen data types.
If you first have something that works, you can then incrementally make
more changes to take advantage of the Eigen library.

- Andreas

On Sat, Jun 23, 2012 at 9:27 PM, Kaveh Vakili
<Kaveh.Vakili using wis.kuleuven.be> wrote:
>
> Ok, "for make benefit" of future generations, i've attached to this
> mail the ,stand alone' version of qn_sn.c (i've removed sn and 200
> loc of bell and whistles to make it shorter).
>
> I guess now i will feed both this and my implementation to a
> debugger and see what is causing the strange divergence....
>
> #include <stdlib.h>
> #include <stdbool.h>
> #include <stdio.h>
> #include <inttypes.h>
> #include <R.h>
> #include <Rmath.h> /* -> <math.h> and much more */
>
> double qn0(double *x, int n);
> double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int
> *w_cand);
> double pull(double *a_in, int n, int k);
> int cmp (const void *a, const void *b);
> int main();
>
>
> double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int
> *w_cand){
> /*
>  Algorithm to compute the weighted high median in O(n) time.
>
>  The whimed is defined as the smallest a[j] such that the sum
>  of the weights of all a[i] <= a[j] is strictly greater than
>  half of the total weight.
>
>  Arguments:
>
>  a: double array containing the observations
>  n: number of observations
>  w: array of (int/double) weights of the observations.
> */
>
>    int n2, i, kcand,mm=0,vv1=1;
>    /* sum of weights: `int' do overflow when  n ~>= 1e5 */
>    int wleft, wmid, wright, w_tot, wrest;
>
>    double trial,s_apsrt;
>
>    w_tot = 0;
>    for (i = 0; i < n; ++i)
>        w_tot += w[i];
>    wrest = 0;
>
> /* REPEAT : */
>    do {
>        for (i = 0; i < n; ++i){
>            a_psrt[i] = a[i];
>        }
>        n2 = n/2;/* =^= n/2 +1 with 0-indexing */
>        trial = pull(a_psrt, n, n2);;
>        wleft = 0;    wmid  = 0;    wright= 0;
>        for (i = 0; i < n; ++i) {
>            if (a[i] < trial)   wleft += w[i];
>            else if (a[i] > trial) wright += w[i];
>            else wmid += w[i];
>        }
>        /* wleft = sum_{i; a[i]  < trial}  w[i]
>         * wmid  = sum_{i; a[i] == trial}  w[i] at least one 'i' since trial is
> one a[]!
>         * wright= sum_{i; a[i]  > trial}  w[i]
>         */
>        kcand = 0;
>        if (2 * (wrest + wleft) > w_tot) {
>            for (i = 0; i < n; ++i) {
>                if (a[i] < trial) {
>                    a_cand[kcand] = a[i];
>                    w_cand[kcand] = w[i];
>                    ++kcand;
>                }
>            }
>        }
>        else if (2 * (wrest + wleft + wmid) <= w_tot) {
>            for (i = 0; i < n; ++i) {
>                if (a[i] > trial) {
>                    a_cand[kcand] = a[i];
>                    w_cand[kcand] = w[i];
>                    ++kcand;
>                }
>            }
>            wrest += wleft + wmid;
>        }
>        else {
>            return trial;
>        }
>        mm++;
>        n = kcand;
>        s_apsrt=0;
>        for (i = 0;i<n;++i) {
>            a[i] = a_cand[i];
>            w[i] = w_cand[i];
>            s_apsrt += a[i];
>        }
>
>    } while(1);
>
> } /* _WHIMED_ */
>
> /* Main routines for C API */
> //double qn(double *x, int n, int finite_corr);
> /* these have no extra factors (no consistency factor & finite_corr): */
>
>
> /* ----------- Implementations -----------------------------------*/
>
> //void Qn0(double *x, Sint *n, double *res) { *res = qn0(x, (int)*n); }
>
> int cmp (const void *pa, const void *pb){
>        double a = *(const double*)pa;
>   double b = *(const double*)pb;
>  return (int) (a - b);
> }
>
> double qn0(double *x, int n){
> /*--------------------------------------------------------------------
>
>   Efficient algorithm for the scale estimator:
>
>       Q*_n = { |x_i - x_j|; i<j }_(k)  [= Qn without scaling ]
>
>                i.e. the k-th order statistic of the |x_i - x_j|
>
>   Parameters of the function Qn :
>       x  : double array containing the observations
>       n  : number of observations (n >=2)
>  */
>
>    double *y     = (double *)calloc(n, sizeof(double));
>    double *work  = (double *)calloc(n, sizeof(double));
>    double *a_psrt = (double *)calloc(n, sizeof(double));
>    double *a_cand = (double *)calloc(n, sizeof(double));
>
>    int *left     = (int *)calloc(n, sizeof(int));
>    int *right    = (int *)calloc(n, sizeof(int));
>    int *p        = (int *)calloc(n, sizeof(int));
>    int *q        = (int *)calloc(n, sizeof(int));
>    int *weight   = (int *)calloc(n, sizeof(int));
>
>    double trial;
>    bool found;
>
>    double w_tot=0.0;
>
>    int h, i, j,jj,jh,vv2=3,ll=0;
>    /* Following should be `long long int' : they can be of order n^2 */
>    int64_t k, knew, nl,nr, sump,sumq;
>
>    h = n / 2 + 1;
>    k = (int64_t)h * (h - 1) / 2;
>    for (i = 0; i < n; ++i) {
>        y[i] = x[i];
>        left [i] = n - i + 1;
>        right[i] = (i <= h) ? n : n - (i - h);
>        /* the n - (i-h) is from the paper; original code had `n' */
>    }
>  //  R_qsort(y, 1, n); /* y := sort(x) */
>    qsort(y,n,sizeof(double),cmp);
>    nl = (int64_t)n * (n + 1) / 2;
>    nr = (int64_t)n * n;
>    knew = k + nl;/* = k + (n+1 \over 2) */
>    found = FALSE;
> #ifdef DEBUG_qn
>    REprintf("qn0(): h,k= %2d,%2d;  nl,nr= %d,%d\n", h,k, nl,nr);
> #endif
> /* L200: */
>    while(!found && nr - nl > n) {
>        j = 0;
>        /* Truncation to float :
>           try to make sure that the same values are got later (guard bits !) */
>        for (i = 1; i < n; ++i) {
>            if (left[i] <= right[i]) {
>                weight[j] = right[i] - left[i] + 1;
>                jh = left[i] + weight[j] / 2;
>                work[j] = (float)(y[i] - y[n - jh]);
>                ++j;
>            }
>        }
>        trial = whimed_i(work, weight, j, a_cand, a_psrt, /*iw_cand*/ p);
> #ifdef DEBUG_qn
>        REprintf(" ..!found: whimed(");
> #  ifdef DEBUG_long
>        REprintf("wrk=c(");
>        for(i=0; i < j; i++) REprintf("%s%g", (i>0)? ", " : "", work[i]);
>        REprintf("),\n     wgt=c(");
>        for(i=0; i < j; i++) REprintf("%s%d", (i>0)? ", " : "", weight[i]);
>        REprintf("), j= %3d) -> trial= %7g\n", j, trial);
> #  else
>        REprintf("j=%3d) -> trial= %g:", j, trial);
> #  endif
> #endif
>        j = 0;
>        for (i = n - 1; i >= 0; --i) {
>            while (j < n && ((float)(y[i] - y[n - j - 1])) < trial)
>                ++j;
>            p[i] = j;
>        }
> #ifdef DEBUG_qn
>        REprintf(" f_1: j=%2d", j);
> #endif
>        j = n + 1;
>        for (i = 0; i < n; ++i) {
>            while ((float)(y[i] - y[n - j + 1]) > trial)
>                --j;
>            q[i] = j;
>        }
>        sump = 0;
>        sumq = 0;
>        for (i = 0; i < n; ++i) {
>            sump += p[i];
>            sumq += q[i] - 1;
>        }
> #ifdef DEBUG_qn
>        REprintf(" f_2 -> j=%2d, sump|q= %lld,%lld", j, sump,sumq);
> #endif
>        if (knew <= sump) {
>            for (i = 0; i < n; ++i)
>                right[i] = p[i];
>            nr = sump;
> #ifdef DEBUG_qn
>            REprintf("knew <= sump =: nr , new right[]\n");
> #endif
>        } else if (knew > sumq) {
>            for (i = 0; i < n; ++i)
>                left[i] = q[i];
>            nl = sumq;
> #ifdef DEBUG_qn
>            REprintf("knew > sumq =: nl , new left[]\n");
> #endif
>        } else { /* sump < knew <= sumq */
>            found = TRUE;
> #ifdef DEBUG_qn
>            REprintf("sump < knew <= sumq ---> FOUND\n");
> #endif
>        }
>        ll++;
>        if(ll>vv2)      found=TRUE;
> //      printf("%lf\n",trial);
>
>    } /* while */
>
>    if (found)
>        return trial;
>    else {
> #ifdef DEBUG_qn
>        REprintf(".. not fnd -> new work[]");
> #endif
>        j = 0;
>        for (i = 1; i < n; ++i) {
>            for (jj = left[i]; jj <= right[i]; ++jj) {
>                work[j] = y[i] - y[n - jj];
>                j++;
>            }/* j will be = sum_{i=2}^n (right[i] - left[i] + 1)_{+}  */
>        }
> #ifdef DEBUG_qn
>        REprintf(" of length %d; knew-nl=%d\n", j, knew-nl);
> #endif
>
>        return (pull(work, j - 1, knew - nl));
> //      knew -= (nl + 1); /* -1: 0-indexing */
> //      rPsort(work, j, knew);
> //      return(work[knew]);
>    }
> } /* qn0 */
>
>
> double pull(double *a_in, int n, int k){
>    int j;
>    double *a, ax;
>  //  char* vmax = vmaxget();
>    a = (double *)calloc(n, sizeof(double));
>    /* Copy a[] and use copy since it will be re-shuffled: */
>    for (j = 0; j < n; j++)
>        a[j] = a_in[j];
>
>    k--; /* 0-indexing */
>    qsort(a,n,sizeof(double),cmp);
>    ax = a[k];
>
> //    vmaxset(vmax);
>    return ax;
> } /* pull */
> int main(){
>        FILE *inFile;
>        double x[100],resu;
>        int i,n=100,numRead=0;
>        inFile=fopen("test.txt","r");
>        if (!inFile)            return 1;
>        for(i=0;i<n;i++) numRead=fscanf(inFile,"%lf",&x[i]);
>        printf("%lf\n",x[0]);
>        fclose(inFile);
>        resu=qn0(x,n);
>        printf("%lf\n",resu);
>        return 0;
> }
>
> _______________________________________________
> R-SIG-Robust using r-project.org mailing list
> https://stat.ethz.ch/mailman/listinfo/r-sig-robust

-- 
Andreas Alfons
Faculty of Business and Economics, KU Leuven
www.econ.kuleuven.be/andreas.alfons/public/