[RsR] Problem re-implementing whimed from package robustbase
Kaveh
k@veh@v@k||| @end|ng |rom w|@@ku|euven@be
Sat Jun 23 15:31:24 CEST 2012
Dear All,
I'm trying to re-implement the Qn estimator in cpp
(using the c code from Prof. Maechler as a canvas).
As part of the Qn estimation algorithm we need to
perform repeated calls to the whimed_i() function
(for computing the weighted 'hi'-median). My problem
can be traced to these lines
n2 = n/2;/* =^= n/2 +1 with 0-indexing */
rPsort(a_psrt, n, n2);
trial = a_psrt[n2];
Oddly enough, in my implementation, the results
of the first 10 values of 'trial' i get are the
same result as in prof Maechler's implementation,
but afterwards, my implementation diverges from his.
For example, with this data:
set.seed(123)
x<-round(runif(100,0,10),3)
n<-length(x)
ro<-0.001;
If we add some printf()'s to Prof Maechler's
code on the line just before the '}while(1)'
we get:
s_apsrt=0;
for (i = 0;i<n;++i) {
a[i] = a_cand[i];
w[i] = w_cand[i];
s_apsrt += a[i];
}
printf("%lf %i %lf\n",s_apsrt,n,trial);
127.078001 49 2.232000
58.367000 24 2.558000
67.325000 49 1.206000
30.730000 24 1.339000
15.010000 12 1.284000
7.409000 6 1.261000
2.515000 2 1.232000
1.255000 1 1.260000
85.412000 43 1.846000
40.249000 21 1.988000
18.878000 10 1.916000
9.359000 5 1.886000
73.007000 44 1.549000
35.188000 22 1.654000
17.359000 11 1.601000
7.812000 5 1.588000
3.108000 2 1.566000
At the same point, my implementation
yields (the last column is the iteration
number):
127.078 49 2.232 0
58.367 24 2.558 1
67.325 49 1.206 2
30.73 24 1.339 3
15.01 12 1.284 4
7.409 6 1.261 5
2.515 2 1.232 6
1.255 1 1.26 7
85.412 43 1.846 8
40.249 21 1.988 9
19.455 10 1.916 10
9.636 5 1.937 11
74.477 44 1.581 12
35.763 22 1.678 13
16.586 10 1.613 14
6.699 4 1.671 15
Up to iteration 10, both
implementations give the same results,
then, starting from 11 onwards, the values
of trial diverge. I'm puzzled as to why.
In trying to come up with a solution.
I was wondering whether anyone could
briefly explain/point to a reference
where it is explained how i could make
qn_sn.c as a stand alone code (with it's
own main(){}) so i can feed it unto a
debugger. When i try to compile the
attached file:
gcc -I/usr/share/R/include -fpic -pipe -g -c qn_sn_test.c -o qn_sn_test
I get:
In file included from qn_sn_test.c:4:0:
/usr/share/R/include/R.h:49:13: error: two or more data types in declaration
specifiers
/usr/share/R/include/R.h:49:1: warning: useless type name in empty declaration
[enabled by default]
[ubuntu 12.04].
Alternatively, maybe someone has a guess on what is
causing mqn.cpp to produce erroneous results after a
couple of iterations. I'm adding mqn.cpp below in case
that could help diagnose the problem.
Any input/help/advice is welcomed.
//###########qn_sn_whimed.cpp
#include <stdio.h>
#include <inttypes.h>
#include "robustbase.h"
#include <R.h>
#include <Rmath.h> /* -> <math.h> and much more */
double whimed_i(double *a, int *w, int n, double* a_cand, double *a_psrt, int
*w_cand){
/*
Algorithm to compute the weighted high median in O(n) time.
The whimed is defined as the smallest a[j] such that the sum
of the weights of all a[i] <= a[j] is strictly greater than
half of the total weight.
Arguments:
a: double array containing the observations
n: number of observations
w: array of (int/double) weights of the observations.
*/
int n2, i, kcand,mm=0,vv1=1;
/* sum of weights: `int' do overflow when n ~>= 1e5 */
int wleft, wmid, wright, w_tot, wrest;
double trial,s_apsrt;
w_tot = 0;
for (i = 0; i < n; ++i)
w_tot += w[i];
wrest = 0;
/* REPEAT : */
do {
for (i = 0; i < n; ++i){
a_psrt[i] = a[i];
}
n2 = n/2;/* =^= n/2 +1 with 0-indexing */
rPsort(a_psrt, n, n2);
trial = a_psrt[n2];
wleft = 0; wmid = 0; wright= 0;
for (i = 0; i < n; ++i) {
if (a[i] < trial) wleft += w[i];
else if (a[i] > trial) wright += w[i];
else wmid += w[i];
}
/* wleft = sum_{i; a[i] < trial} w[i]
* wmid = sum_{i; a[i] == trial} w[i] at least one 'i' since trial is
one a[]!
* wright= sum_{i; a[i] > trial} w[i]
*/
kcand = 0;
if (2 * (wrest + wleft) > w_tot) {
for (i = 0; i < n; ++i) {
if (a[i] < trial) {
a_cand[kcand] = a[i];
w_cand[kcand] = w[i];
++kcand;
}
}
}
else if (2 * (wrest + wleft + wmid) <= w_tot) {
for (i = 0; i < n; ++i) {
if (a[i] > trial) {
a_cand[kcand] = a[i];
w_cand[kcand] = w[i];
++kcand;
}
}
wrest += wleft + wmid;
}
else {
return trial;
}
mm++;
n = kcand;
s_apsrt=0;
for (i = 0;i<n;++i) {
a[i] = a_cand[i];
w[i] = w_cand[i];
s_apsrt += a[i];
}
} while(1);
} /* _WHIMED_ */
/* Main routines for C API */
double qn(double *x, int n, int finite_corr);
/* these have no extra factors (no consistency factor & finite_corr): */
double qn0(double *x, int n);
/* ----------- Implementations -----------------------------------*/
void Qn0(double *x, Sint *n, double *res) { *res = qn0(x, (int)*n); }
double qn0(double *x, int n)
{
/*--------------------------------------------------------------------
Efficient algorithm for the scale estimator:
Q*_n = { |x_i - x_j|; i<j }_(k) [= Qn without scaling ]
i.e. the k-th order statistic of the |x_i - x_j|
Parameters of the function Qn :
x : double array containing the observations
n : number of observations (n >=2)
*/
double *y = (double *)R_alloc(n, sizeof(double));
double *work = (double *)R_alloc(n, sizeof(double));
double *a_psrt = (double *)R_alloc(n, sizeof(double));
double *a_cand = (double *)R_alloc(n, sizeof(double));
int *left = (int *)R_alloc(n, sizeof(int));
int *right = (int *)R_alloc(n, sizeof(int));
int *p = (int *)R_alloc(n, sizeof(int));
int *q = (int *)R_alloc(n, sizeof(int));
int *weight = (int *)R_alloc(n, sizeof(int));
double trial = R_NaReal;/* -Wall */
Rboolean found;
double w_tot=0.0;
int h, i, j,jj,jh,vv2=3,ll=0;
/* Following should be `long long int' : they can be of order n^2 */
int64_t k, knew, nl,nr, sump,sumq;
h = n / 2 + 1;
k = (int64_t)h * (h - 1) / 2;
for (i = 0; i < n; ++i) {
y[i] = x[i];
left [i] = n - i + 1;
right[i] = (i <= h) ? n : n - (i - h);
/* the n - (i-h) is from the paper; original code had `n' */
}
R_qsort(y, 1, n); /* y := sort(x) */
nl = (int64_t)n * (n + 1) / 2;
nr = (int64_t)n * n;
knew = k + nl;/* = k + (n+1 \over 2) */
found = FALSE;
#ifdef DEBUG_qn
REprintf("qn0(): h,k= %2d,%2d; nl,nr= %d,%d\n", h,k, nl,nr);
#endif
/* L200: */
while(!found && nr - nl > n) {
j = 0;
/* Truncation to float :
try to make sure that the same values are got later (guard bits !) */
for (i = 1; i < n; ++i) {
if (left[i] <= right[i]) {
weight[j] = right[i] - left[i] + 1;
jh = left[i] + weight[j] / 2;
work[j] = (float)(y[i] - y[n - jh]);
++j;
}
}
trial = whimed_i(work, weight, j, a_cand, a_psrt, /*iw_cand*/ p);
#ifdef DEBUG_qn
REprintf(" ..!found: whimed(");
# ifdef DEBUG_long
REprintf("wrk=c(");
for(i=0; i < j; i++) REprintf("%s%g", (i>0)? ", " : "", work[i]);
REprintf("),\n wgt=c(");
for(i=0; i < j; i++) REprintf("%s%d", (i>0)? ", " : "", weight[i]);
REprintf("), j= %3d) -> trial= %7g\n", j, trial);
# else
REprintf("j=%3d) -> trial= %g:", j, trial);
# endif
#endif
j = 0;
for (i = n - 1; i >= 0; --i) {
while (j < n && ((float)(y[i] - y[n - j - 1])) < trial)
++j;
p[i] = j;
}
#ifdef DEBUG_qn
REprintf(" f_1: j=%2d", j);
#endif
j = n + 1;
for (i = 0; i < n; ++i) {
while ((float)(y[i] - y[n - j + 1]) > trial)
--j;
q[i] = j;
}
sump = 0;
sumq = 0;
for (i = 0; i < n; ++i) {
sump += p[i];
sumq += q[i] - 1;
}
#ifdef DEBUG_qn
REprintf(" f_2 -> j=%2d, sump|q= %lld,%lld", j, sump,sumq);
#endif
if (knew <= sump) {
for (i = 0; i < n; ++i)
right[i] = p[i];
nr = sump;
#ifdef DEBUG_qn
REprintf("knew <= sump =: nr , new right[]\n");
#endif
} else if (knew > sumq) {
for (i = 0; i < n; ++i)
left[i] = q[i];
nl = sumq;
#ifdef DEBUG_qn
REprintf("knew > sumq =: nl , new left[]\n");
#endif
} else { /* sump < knew <= sumq */
found = TRUE;
#ifdef DEBUG_qn
REprintf("sump < knew <= sumq ---> FOUND\n");
#endif
}
ll++;
if(ll>vv2) found=TRUE;
// printf("%lf\n",trial);
} /* while */
if (found)
return trial;
else {
#ifdef DEBUG_qn
REprintf(".. not fnd -> new work[]");
#endif
j = 0;
for (i = 1; i < n; ++i) {
for (jj = left[i]; jj <= right[i]; ++jj) {
work[j] = y[i] - y[n - jj];
j++;
}/* j will be = sum_{i=2}^n (right[i] - left[i] + 1)_{+} */
}
#ifdef DEBUG_qn
REprintf(" of length %d; knew-nl=%d\n", j, knew-nl);
#endif
/* return pull(work, j - 1, knew - nl) : */
knew -= (nl + 1); /* -1: 0-indexing */
rPsort(work, j, knew);
return(work[knew]);
}
} /* qn0 */
double qn(double *x, int n, int finite_corr){
/* Efficient algorithm for the scale estimator:
Qn = dn * 2.2219 * {|x_i-x_j|; i<j}_(k)
*/
double r, dn = 1./*-Wall*/;
r = 2.2219 * qn0(x, n); /* asymptotic consistency for sigma^2 */
/* === */
if (finite_corr) {
if (n <= 9) {
if (n == 2) dn = .399;
else if (n == 3) dn = .994;
else if (n == 4) dn = .512;
else if (n == 5) dn = .844;
else if (n == 6) dn = .611;
else if (n == 7) dn = .857;
else if (n == 8) dn = .669;
else if (n == 9) dn = .872;
} else {
if (n % 2 == 1)
dn = n / (n + 1.4);
else /* (n % 2 == 0) */
dn = n / (n + 3.8);
}
return dn * r;
}
else return r;
} /* qn */
double pull(double *a_in, int n, int k){
int j;
double *a, ax;
char* vmax = vmaxget();
a = (double *)R_alloc(n, sizeof(double));
/* Copy a[] and use copy since it will be re-shuffled: */
for (j = 0; j < n; j++)
a[j] = a_in[j];
k--; /* 0-indexing */
rPsort(a, n, k);
ax = a[k];
vmaxset(vmax);
return ax;
} /* pull */
int main(){
printf("hello qn");
return 0;
}
//###########mqn.cpp
#include <algorithm>
#include <cstdlib>
#include <cmath>
#include <ctime>
#include <functional>
#include <fstream>
#include <inttypes.h>
#include <iostream>
#include <limits>
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <Eigen/Cholesky>
#include <Eigen/Dense>
#include <Eigen/LU>
#include <Eigen/Eigenvalues>
using namespace std;
using namespace Eigen;
using Eigen::VectorXd;
using Eigen::VectorXi;
double whimed_i(VectorXd& a,VectorXi& w,int n,VectorXd& a_cand,VectorXd&
a_psrt,VectorXi& w_cand){
int n2,i,k_cand,nn=n,vv1=1;
/* sum of weights: `int' do overflow when n ~>= 1e5 */
int wleft,wmid,wright,w_tot,wrest,mm=0;
double trial,retv;
w_tot=w.sum();
wrest=0;
do{
a_psrt.head(nn)=a.head(nn);
n2=nn/2;
std::nth_element(a_psrt.data(),a_psrt.data()+n2,a_psrt.data()+nn);
trial=a_psrt(n2);
wleft=0;wmid=0;wright=0;
for (i=0;i<nn;++i){
if(a(i)<trial) wleft+=w(i);
else if(a(i)>trial) wright+=w(i);
else wmid+=w(i);
}
k_cand=0;
if((2*(wrest+wleft))>w_tot){
for (i=0;i<nn;++i){
if(a(i)<trial){
a_cand(k_cand)=a(i);
w_cand(k_cand)=w(i);
++k_cand;
}
}
}
else if((2*(wrest+wleft+wmid))<=w_tot){
for (i=0;i<nn;++i){
if (a(i)>trial){
a_cand(k_cand)=a(i);
w_cand(k_cand)=w(i);
++k_cand;
}
}
wrest+=wleft+wmid;
} else {
return(trial);
}
if(mm>100){
std::cout << "Achtung!" << std::endl;
return(trial);
}
mm++;
nn=k_cand;
a.head(nn)=a_cand.head(nn);
w.head(nn)=w_cand.head(nn);
cout << a.head(nn).sum() << " " << nn << " " << trial <<endl;
} while(1);
}
double qn0(VectorXd& y,int n){
VectorXd work=VectorXd::Zero(n);
VectorXd a_psrt=VectorXd::Zero(n);
VectorXd a_cand=VectorXd::Zero(n);
VectorXi left=VectorXi::Zero(n);
VectorXi right=VectorXi::Zero(n);
VectorXi p=VectorXi::Zero(n);
VectorXi q=VectorXi::Zero(n);
VectorXi weight=VectorXi::Zero(n);
bool found;
int h,i,j,jj,jh,ll=0,vv2=3;
double ri,trial,dn=1.0;
int64_t k,knew,nl,nr,sump,sumq;
h=n/2+1;
k=(int64_t)h*(h-1)/2;
for(i=0;i<n;++i){
left(i)=n-i+1;
right(i)=(i<=h)?n:n-(i-h);
}
std::sort(y.data(),y.data()+y.size());
nl=(int64_t)n*(n+1)/2;
nr=(int64_t)n*n;
knew=k+nl;/*=k +(n+1 \over 2) */
found=false;
while(!found && nr-nl>n){
j=0;
for(i=1;i<n;++i){
if(left(i)<=right(i)){
weight(j)=right(i)-left(i)+1;
jh=left(i)+weight(j)/2;
work(j)=(float)(y(i)-y(n-jh));
++j;
}
}
trial=whimed_i(work,weight,j,a_cand,a_psrt,p);
ll++;
j=0;
for(i=n-1;i>=0;--i){
while(j<n&&((float)(y(i)-y(n-j-1)))<trial) ++j;
p(i)=j;
}
j=n+1;
for(i=0;i<n;++i){
while((float)(y(i)-y(n-j+1))>trial) --j;
q(i)=j;
}
sump=p.sum();
sumq=q.sum()-n;
if(knew<=sump){
right=p;
nr=sump;
} else if(knew>sumq){
left=q;
nl=sumq;
} else { /* sump < knew <= sumq */
found=true;
}
if(ll>vv2) found=true;
} /* while */
if(found) ri=trial;
else{
j=0;
for(i=1;i<n;++i){
for(jj=left(i);jj<=right(i);++jj){
work(j)=y(i)-y(n-jj);
j++;
}/* j will be=sum_{i=2}^n(right[i] - left[i] +
1)_{+} */
}
std::nth_element(work.data(),work.data()+knew-nl-
1,work.data()+j-1);
ri=work(knew-nl-1);
}
ri*=2.2219;
if(n<=9){
if(n==2) dn=.399;
else if(n==3) dn=.994;
else if(n==4) dn=.512;
else if(n==5) dn=.844;
else if(n==6) dn=.611;
else if(n==7) dn=.857;
else if(n==8) dn=.669;
else if(n==9) dn=.872;
} else {
if(n%2==1) dn=n/(n+1.4);
else dn=n/(n+3.8);
}
ri*=dn;
return(ri);
}
extern "C"{
void R_mqn(int* nR,double* xr,double* ro){
VectorXd x=Map<VectorXd>(xr,*nR);
int n=*nR;
double ri=qn0(x,n);
*ro=ri;
}
}
More information about the R-SIG-Robust
mailing list