[RsR] Problem re-implementing whimed from package robustbase

Kaveh k@veh@v@k||| @end|ng |rom w|@@ku|euven@be
Sat Jun 23 15:31:24 CEST 2012


Dear All,

I'm trying to re-implement the Qn estimator in cpp 
(using the c code from Prof. Maechler as a canvas). 

As part of the Qn estimation algorithm we need to 
perform repeated calls to the whimed_i() function 
(for computing the weighted 'hi'-median). My problem 
can be traced to these lines

	n2 = n/2;/* =^= n/2 +1 with 0-indexing */
	rPsort(a_psrt, n, n2);
	trial = a_psrt[n2];


Oddly enough, in my implementation, the results 
of the first 10 values of 'trial' i get are the
 same result as in prof Maechler's implementation,
 but afterwards, my implementation diverges from his. 

For example, with this data:

set.seed(123)
x<-round(runif(100,0,10),3)
n<-length(x)
ro<-0.001;

If we add some printf()'s to Prof Maechler's 
code on the line just before the '}while(1)'
 we get: 

        s_apsrt=0;
	for (i = 0;i<n;++i) {
	    a[i] = a_cand[i];
	    w[i] = w_cand[i];
	    s_apsrt += a[i];
	}

printf("%lf %i %lf\n",s_apsrt,n,trial);

127.078001 49 2.232000
58.367000 24 2.558000
67.325000 49 1.206000
30.730000 24 1.339000
15.010000 12 1.284000
7.409000 6 1.261000
2.515000 2 1.232000
1.255000 1 1.260000
85.412000 43 1.846000
40.249000 21 1.988000
18.878000 10 1.916000
9.359000 5 1.886000
73.007000 44 1.549000
35.188000 22 1.654000
17.359000 11 1.601000
7.812000 5 1.588000
3.108000 2 1.566000

At the same point, my implementation 
yields (the last column is the iteration 
number):

127.078   49  2.232   0
58.367   24  2.558   1
67.325   49  1.206   2
30.73   24  1.339   3
15.01   12  1.284   4
7.409   6  1.261   5
2.515   2  1.232   6
1.255   1  1.26   7
85.412   43  1.846   8
40.249   21  1.988   9
19.455   10  1.916   10
9.636   5  1.937   11
74.477   44  1.581   12
35.763   22  1.678   13
16.586   10  1.613   14
6.699   4  1.671   15

Up to iteration 10, both 
 implementations give the same results, 
then, starting from 11 onwards, the values 
of trial diverge. I'm puzzled as to why. 

In trying to come up with a solution. 
I was wondering whether anyone could 
briefly explain/point to a reference 
where it is explained how i could make 
qn_sn.c as a stand alone code (with it's
 own main(){}) so i can feed it unto a 
debugger. When i try to compile the 
attached file:

gcc -I/usr/share/R/include -fpic -pipe -g -c qn_sn_test.c -o qn_sn_test

I get: 

In file included from qn_sn_test.c:4:0:
/usr/share/R/include/R.h:49:13: error: two or more data types in declaration 
specifiers
/usr/share/R/include/R.h:49:1: warning: useless type name in empty declaration 
[enabled by default]

[ubuntu 12.04].

Alternatively, maybe someone has a guess on what is 
causing mqn.cpp to produce erroneous results after a
 couple of iterations. I'm adding mqn.cpp below in case 
that could help diagnose the problem.  

Any input/help/advice is welcomed. 

//###########qn_sn_whimed.cpp
#include <stdio.h>   
#include <inttypes.h>
#include "robustbase.h"
#include <R.h>
#include <Rmath.h> /* -> <math.h> and much more */

double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int 
*w_cand){
/*
  Algorithm to compute the weighted high median in O(n) time.

  The whimed is defined as the smallest a[j] such that the sum
  of the weights of all a[i] <= a[j] is strictly greater than
  half of the total weight.

  Arguments:

  a: double array containing the observations
  n: number of observations
  w: array of (int/double) weights of the observations.
*/

    int n2, i, kcand,mm=0,vv1=1;
    /* sum of weights: `int' do overflow when  n ~>= 1e5 */
    int wleft, wmid, wright, w_tot, wrest;

    double trial,s_apsrt;

    w_tot = 0;
    for (i = 0; i < n; ++i)
	w_tot += w[i];
    wrest = 0;

/* REPEAT : */
    do {
	for (i = 0; i < n; ++i){
	    a_psrt[i] = a[i];
	}
	n2 = n/2;/* =^= n/2 +1 with 0-indexing */
	rPsort(a_psrt, n, n2);
	trial = a_psrt[n2];
	wleft = 0;    wmid  = 0;    wright= 0;
	for (i = 0; i < n; ++i) {
	    if (a[i] < trial)	wleft += w[i];
	    else if (a[i] > trial) wright += w[i];
	    else wmid += w[i];
	}
	/* wleft = sum_{i; a[i]	 < trial}  w[i]
	 * wmid	 = sum_{i; a[i] == trial}  w[i] at least one 'i' since trial is 
one a[]!
	 * wright= sum_{i; a[i]	 > trial}  w[i]
	 */
	kcand = 0;
	if (2 * (wrest + wleft) > w_tot) {
	    for (i = 0; i < n; ++i) {
		if (a[i] < trial) {
		    a_cand[kcand] = a[i];
		    w_cand[kcand] = w[i];	
		    ++kcand;
		}
	    }
	}
	else if (2 * (wrest + wleft + wmid) <= w_tot) {
	    for (i = 0; i < n; ++i) {
		if (a[i] > trial) {
		    a_cand[kcand] = a[i];
		    w_cand[kcand] = w[i];	
                    ++kcand;
		}
	    }
	    wrest += wleft + wmid;
	}
	else {
	    return trial;
	}
	mm++;
	n = kcand;
        s_apsrt=0;
	for (i = 0;i<n;++i) {
	    a[i] = a_cand[i];
	    w[i] = w_cand[i];
	    s_apsrt += a[i];
	}

    } while(1);

} /* _WHIMED_ */

/* Main routines for C API */
double qn(double *x, int n, int finite_corr);
/* these have no extra factors (no consistency factor & finite_corr): */
double qn0(double *x, int n);


/* ----------- Implementations -----------------------------------*/

void Qn0(double *x, Sint *n, double *res) { *res = qn0(x, (int)*n); }


double qn0(double *x, int n)
{
/*--------------------------------------------------------------------

   Efficient algorithm for the scale estimator:

       Q*_n = { |x_i - x_j|; i<j }_(k)	[= Qn without scaling ]

		i.e. the k-th order statistic of the |x_i - x_j|

   Parameters of the function Qn :
       x  : double array containing the observations
       n  : number of observations (n >=2)
 */

    double *y	  = (double *)R_alloc(n, sizeof(double));
    double *work  = (double *)R_alloc(n, sizeof(double));
    double *a_psrt = (double *)R_alloc(n, sizeof(double));
    double *a_cand = (double *)R_alloc(n, sizeof(double));

    int *left	  = (int *)R_alloc(n, sizeof(int));
    int *right	  = (int *)R_alloc(n, sizeof(int));
    int *p	  = (int *)R_alloc(n, sizeof(int));
    int *q	  = (int *)R_alloc(n, sizeof(int));
    int *weight	  = (int *)R_alloc(n, sizeof(int));

    double trial = R_NaReal;/* -Wall */
    Rboolean found;

double w_tot=0.0;

    int h, i, j,jj,jh,vv2=3,ll=0;
    /* Following should be `long long int' : they can be of order n^2 */
    int64_t k, knew, nl,nr, sump,sumq;

    h = n / 2 + 1;
    k = (int64_t)h * (h - 1) / 2;
    for (i = 0; i < n; ++i) {
	y[i] = x[i];
	left [i] = n - i + 1;
	right[i] = (i <= h) ? n : n - (i - h);
	/* the n - (i-h) is from the paper; original code had `n' */
    }
    R_qsort(y, 1, n); /* y := sort(x) */
    nl = (int64_t)n * (n + 1) / 2;
    nr = (int64_t)n * n;
    knew = k + nl;/* = k + (n+1 \over 2) */
    found = FALSE;
#ifdef DEBUG_qn
    REprintf("qn0(): h,k= %2d,%2d;  nl,nr= %d,%d\n", h,k, nl,nr);
#endif
/* L200: */
    while(!found && nr - nl > n) {
	j = 0;
	/* Truncation to float :
	   try to make sure that the same values are got later (guard bits !) */
	for (i = 1; i < n; ++i) {
	    if (left[i] <= right[i]) {
		weight[j] = right[i] - left[i] + 1;
		jh = left[i] + weight[j] / 2;
		work[j] = (float)(y[i] - y[n - jh]);
		++j;
	    }
	}
	trial = whimed_i(work, weight, j, a_cand, a_psrt, /*iw_cand*/ p);
#ifdef DEBUG_qn
	REprintf(" ..!found: whimed(");
#  ifdef DEBUG_long
	REprintf("wrk=c(");
	for(i=0; i < j; i++) REprintf("%s%g", (i>0)? ", " : "", work[i]);
	REprintf("),\n	   wgt=c(");
	for(i=0; i < j; i++) REprintf("%s%d", (i>0)? ", " : "", weight[i]);
	REprintf("), j= %3d) -> trial= %7g\n", j, trial);
#  else
	REprintf("j=%3d) -> trial= %g:", j, trial);
#  endif
#endif
	j = 0;
	for (i = n - 1; i >= 0; --i) {
	    while (j < n && ((float)(y[i] - y[n - j - 1])) < trial)
		++j;
	    p[i] = j;
	}
#ifdef DEBUG_qn
	REprintf(" f_1: j=%2d", j);
#endif
	j = n + 1;
	for (i = 0; i < n; ++i) {
	    while ((float)(y[i] - y[n - j + 1]) > trial)
		--j;
	    q[i] = j;
	}
	sump = 0;
	sumq = 0;
	for (i = 0; i < n; ++i) {
	    sump += p[i];
	    sumq += q[i] - 1;
	}
#ifdef DEBUG_qn
	REprintf(" f_2 -> j=%2d, sump|q= %lld,%lld", j, sump,sumq);
#endif
	if (knew <= sump) {
	    for (i = 0; i < n; ++i)
		right[i] = p[i];
	    nr = sump;
#ifdef DEBUG_qn
	    REprintf("knew <= sump =: nr , new right[]\n");
#endif
	} else if (knew > sumq) {
	    for (i = 0; i < n; ++i)
		left[i] = q[i];
	    nl = sumq;
#ifdef DEBUG_qn
	    REprintf("knew > sumq =: nl , new left[]\n");
#endif
	} else { /* sump < knew <= sumq */
	    found = TRUE;
#ifdef DEBUG_qn
	    REprintf("sump < knew <= sumq ---> FOUND\n");
#endif
	}
	ll++;
	if(ll>vv2)	found=TRUE;
//	printf("%lf\n",trial);

    } /* while */

    if (found)
	return trial;
    else {
#ifdef DEBUG_qn
	REprintf(".. not fnd -> new work[]");
#endif
	j = 0;
	for (i = 1; i < n; ++i) {
	    for (jj = left[i]; jj <= right[i]; ++jj) {
		work[j] = y[i] - y[n - jj];
		j++;
	    }/* j will be = sum_{i=2}^n (right[i] - left[i] + 1)_{+}  */
	}
#ifdef DEBUG_qn
	REprintf(" of length %d; knew-nl=%d\n", j, knew-nl);
#endif

	/* return pull(work, j - 1, knew - nl)	: */
	knew -= (nl + 1); /* -1: 0-indexing */
	rPsort(work, j, knew);
	return(work[knew]);
    }
} /* qn0 */

double qn(double *x, int n, int finite_corr){
/* Efficient algorithm for the scale estimator:

	Qn = dn * 2.2219 * {|x_i-x_j|; i<j}_(k)
*/
    double r, dn = 1./*-Wall*/;

    r = 2.2219 * qn0(x, n); /* asymptotic consistency for sigma^2 */
    /*		 === */

    if (finite_corr) {
	if (n <= 9) {
	    if	    (n == 2)	dn = .399;
	    else if (n == 3)	dn = .994;
	    else if (n == 4)	dn = .512;
	    else if (n == 5)	dn = .844;
	    else if (n == 6)	dn = .611;
	    else if (n == 7)	dn = .857;
	    else if (n == 8)	dn = .669;
	    else if (n == 9)	dn = .872;
	} else {
	    if (n % 2 == 1)
		dn = n / (n + 1.4);
	    else /* (n % 2 == 0) */
		dn = n / (n + 3.8);
	}
	return dn * r;
    }
    else return r;
} /* qn */
double pull(double *a_in, int n, int k){
    int j;
    double *a, ax;
    char* vmax = vmaxget();
    a = (double *)R_alloc(n, sizeof(double));
    /* Copy a[] and use copy since it will be re-shuffled: */
    for (j = 0; j < n; j++)
	a[j] = a_in[j];

    k--; /* 0-indexing */
    rPsort(a, n, k);
    ax = a[k];

    vmaxset(vmax);
    return ax;
} /* pull */
int main(){
   printf("hello qn");
   return 0;
}

//###########mqn.cpp

#include <algorithm>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <functional>
#include <fstream>
#include <inttypes.h>
#include <iostream>
#include <limits>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>

#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <Eigen/LU>
#include <Eigen/Eigenvalues>

using namespace std;
using namespace Eigen;
using Eigen::VectorXd;
using Eigen::VectorXi;

double whimed_i(VectorXd& a,VectorXi& w,int n,VectorXd& a_cand,VectorXd& 
a_psrt,VectorXi& w_cand){
	int n2,i,k_cand,nn=n,vv1=1;
	/* sum of weights: `int' do overflow when  n ~>= 1e5 */
	int wleft,wmid,wright,w_tot,wrest,mm=0;
	double trial,retv;
	w_tot=w.sum();
	wrest=0;
	do{
		a_psrt.head(nn)=a.head(nn);
		n2=nn/2;
		
std::nth_element(a_psrt.data(),a_psrt.data()+n2,a_psrt.data()+nn);
		trial=a_psrt(n2);
		wleft=0;wmid=0;wright=0;
		for (i=0;i<nn;++i){
		    if(a(i)<trial)				wleft+=w(i);
		    else if(a(i)>trial)				wright+=w(i);
		    else wmid+=w(i);
		}
		k_cand=0;
		if((2*(wrest+wleft))>w_tot){
			for (i=0;i<nn;++i){
				if(a(i)<trial){
				    	a_cand(k_cand)=a(i);
				    	w_cand(k_cand)=w(i);
					++k_cand;
				}
			}
		}
		else if((2*(wrest+wleft+wmid))<=w_tot){
			for (i=0;i<nn;++i){
				if (a(i)>trial){
					a_cand(k_cand)=a(i);
					w_cand(k_cand)=w(i);
					++k_cand;
				}
		    	}
			wrest+=wleft+wmid;
		} else {
		        return(trial);
		}
		if(mm>100){
			std::cout << "Achtung!" << std::endl;			
			return(trial);
		}
		mm++;
		nn=k_cand;
		a.head(nn)=a_cand.head(nn);
		w.head(nn)=w_cand.head(nn);
cout << a.head(nn).sum()  <<  "   " << nn << "  " << trial  <<endl;
	} while(1); 
}
double qn0(VectorXd& y,int n){
		VectorXd work=VectorXd::Zero(n);
		VectorXd a_psrt=VectorXd::Zero(n);
		VectorXd a_cand=VectorXd::Zero(n);

		VectorXi left=VectorXi::Zero(n);
    		VectorXi right=VectorXi::Zero(n);
    		VectorXi p=VectorXi::Zero(n);
    		VectorXi q=VectorXi::Zero(n);
    		VectorXi weight=VectorXi::Zero(n);

		bool found;
		int h,i,j,jj,jh,ll=0,vv2=3;
		double ri,trial,dn=1.0;

	    	int64_t k,knew,nl,nr,sump,sumq;
		h=n/2+1;
    		k=(int64_t)h*(h-1)/2;
    		for(i=0;i<n;++i){
			left(i)=n-i+1;
			right(i)=(i<=h)?n:n-(i-h);
		}
		std::sort(y.data(),y.data()+y.size());
		nl=(int64_t)n*(n+1)/2;
		nr=(int64_t)n*n;
		knew=k+nl;/*=k +(n+1 \over 2) */
		found=false;	
		while(!found && nr-nl>n){
			j=0;
			for(i=1;i<n;++i){
				if(left(i)<=right(i)){
					weight(j)=right(i)-left(i)+1;
					jh=left(i)+weight(j)/2;
					work(j)=(float)(y(i)-y(n-jh));
					++j;
				}
			}
			trial=whimed_i(work,weight,j,a_cand,a_psrt,p);
			ll++;
			j=0;
			for(i=n-1;i>=0;--i){
				while(j<n&&((float)(y(i)-y(n-j-1)))<trial) ++j;
				p(i)=j;
			}
			j=n+1;
			for(i=0;i<n;++i){
				while((float)(y(i)-y(n-j+1))>trial)	--j;
				q(i)=j;
			}
			sump=p.sum();
			sumq=q.sum()-n;
			if(knew<=sump){
				right=p;
				nr=sump;
			} else if(knew>sumq){
				left=q;
				nl=sumq;
			} else { /* sump < knew <= sumq */
			    	found=true;
			}
			if(ll>vv2)	found=true;
    		} /* while */
		if(found) ri=trial;
		else{
			j=0;
			for(i=1;i<n;++i){
		    		for(jj=left(i);jj<=right(i);++jj){
					work(j)=y(i)-y(n-jj);
					j++;
		    		}/* j will be=sum_{i=2}^n(right[i] - left[i] + 
1)_{+}  */
			}
			std::nth_element(work.data(),work.data()+knew-nl-
1,work.data()+j-1);
			ri=work(knew-nl-1);
		}
    		ri*=2.2219;
		if(n<=9){
			if(n==2)	dn=.399;
	    		else if(n==3)	dn=.994;
	    		else if(n==4)	dn=.512;
	    		else if(n==5)	dn=.844;
	    		else if(n==6)	dn=.611;
	    		else if(n==7)	dn=.857;
	    		else if(n==8)	dn=.669;
	    		else if(n==9)	dn=.872;
		} else {	
	    		if(n%2==1) dn=n/(n+1.4);
	    		else dn=n/(n+3.8);
		}
		ri*=dn;
		return(ri);
}
extern "C"{
	void R_mqn(int* nR,double* xr,double* ro){
		VectorXd x=Map<VectorXd>(xr,*nR);	
		int n=*nR;
		double ri=qn0(x,n);
		*ro=ri;
	}
}




More information about the R-SIG-Robust mailing list