/* This file contains code to calculate Kendall's Tau in O(N log N) time in * a manner similar to the following reference: * * A Computer Method for Calculating Kendall's Tau with Ungrouped Data * William R. Knight Journal of the American Statistical Association, Vol. 61, * No. 314, Part 1 (Jun., 1966), pp. 436-439 * * Copyright 2010 David Simcha * * License: * Boost Software License - Version 1.0 - August 17th, 2003 * * Permission is hereby granted, free of charge, to any person or organization * obtaining a copy of the software and accompanying documentation covered by * this license (the "Software") to use, reproduce, display, distribute, * execute, and transmit the Software, and to prepare derivative works of the * Software, and to permit third-parties to whom the Software is furnished to * do so, all subject to the following: * * The copyright notices in the Software and this entire statement, including * the above license grant, this restriction and the following disclaimer, * must be included in all copies of the Software, in whole or in part, and * all derivative works of the Software, unless such copies or derivative * works are solely in the form of machine-executable object code generated by * a source language processor. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT * SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE * FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. * */ #include #include #include #include uint64_t insertionSort(double*, size_t); #define kendallTest #ifdef kendallTest #include #include #include /* Kludge: In testing mode, just forward R_rsort to insertionSort to make this * module testable without having to include (and compile) a bunch of other * stuff. */ void R_rsort(double* arr, int len) { insertionSort(arr, len); } #else #include /* For R_rsort. */ #endif /* Sorts in place, returns the bubble sort distance between the input array * and the sorted array. */ uint64_t insertionSort(double* arr, size_t len) { size_t maxJ, i; uint64_t swapCount = 0; if(len < 2) { return 0; } maxJ = len - 1; for(i = len - 2; i < len; --i) { size_t j = i; double val = arr[i]; for(; j < maxJ && arr[j + 1] < val; ++j) { arr[j] = arr[j + 1]; } arr[j] = val; swapCount += (j - i); } return swapCount; } static uint64_t merge(double* from, double* to, size_t middle, size_t len) { size_t bufIndex, leftLen, rightLen; uint64_t swaps; double* left; double* right; bufIndex = 0; swaps = 0; left = from; right = from + middle; rightLen = len - middle; leftLen = middle; while(leftLen && rightLen) { if(right[0] < left[0]) { to[bufIndex] = right[0]; swaps += leftLen; rightLen--; right++; } else { to[bufIndex] = left[0]; leftLen--; left++; } bufIndex++; } if(leftLen) { memcpy(to + bufIndex, left, leftLen * sizeof(double)); } else if(rightLen) { memcpy(to + bufIndex, right, rightLen * sizeof(double)); } return swaps; } /* Sorts in place, returns the bubble sort distance between the input array * and the sorted array. */ uint64_t mergeSort(double* x, double* buf, size_t len) { uint64_t swaps; size_t half; if(len < 10) { return insertionSort(x, len); } swaps = 0; if(len < 2) { return 0; } half = len / 2; swaps += mergeSort(x, buf, half); swaps += mergeSort(x + half, buf + half, len - half); swaps += merge(x, buf, half, len); memcpy(x, buf, len * sizeof(double)); return swaps; } static uint64_t getMs(double* data, size_t len) { /* Assumes data is sorted.*/ uint64_t Ms = 0, tieCount = 0; size_t i; for(i = 1; i < len; i++) { if(data[i] == data[i-1]) { tieCount++; } else if(tieCount) { Ms += (tieCount * (tieCount + 1)) / 2; tieCount++; tieCount = 0; } } if(tieCount) { Ms += (tieCount * (tieCount + 1)) / 2; tieCount++; } return Ms; } /* This function calculates the Kendall covariance (if cor == 0) or * correlation (if cor != 0), but assumes arr1 has already been sorted and * arr2 has already been reordered in lockstep. This can be done within R * before calling this function by doing something like: * * perm <- order(arr1) * arr1 <- arr1[perm] * arr2 <- arr2[perm] */ double kendallNlogN(double* arr1, double* arr2, size_t len, int cor) { uint64_t m1 = 0, m2 = 0, tieCount, swapCount, nPair; int64_t s; size_t i; nPair = (uint64_t) len * ((uint64_t) len - 1) / 2; s = nPair; tieCount = 0; for(i = 1; i < len; i++) { if(arr1[i - 1] == arr1[i]) { tieCount++; } else if(tieCount > 0) { R_rsort(arr2 + i - tieCount - 1, tieCount + 1); m1 += tieCount * (tieCount + 1) / 2; s += getMs(arr2 + i - tieCount - 1, tieCount + 1); tieCount++; tieCount = 0; } } if(tieCount > 0) { R_rsort(arr2 + i - tieCount - 1, tieCount + 1); m1 += tieCount * (tieCount + 1) / 2; s += getMs(arr2 + i - tieCount - 1, tieCount + 1); tieCount++; } swapCount = mergeSort(arr2, arr1, len); m2 = getMs(arr2, len); s -= (m1 + m2) + 2 * swapCount; if(cor) { double denominator1 = nPair - m1; double denominator2 = nPair - m2; double cor = s / sqrt(denominator1) / sqrt(denominator2); return cor; } else { /* Return covariance. */ return 2 * s; } } /* This function uses a simple O(N^2) implementation. It probably has a smaller * constant and therefore is useful in the small N case, and is also useful * for testing the relatively complex O(N log N) implementation. */ double kendallSmallN(double* arr1, double* arr2, size_t len, int cor) { /* Not using 64-bit ints here because this function is meant only for small N and for testing. */ int m1 = 0, m2 = 0, s = 0, nPair; size_t i, j; double denominator1, denominator2; for(i = 0; i < len; i++) { for(j = i + 1; j < len; j++) { if(arr2[i] > arr2[j]) { if (arr1[i] > arr1[j]) { s++; } else if(arr1[i] < arr1[j]) { s--; } else { m1++; } } else if(arr2[i] < arr2[j]) { if (arr1[i] > arr1[j]) { s--; } else if(arr1[i] < arr1[j]) { s++; } else { m1++; } } else { m2++; if(arr1[i] == arr1[j]) { m1++; } } } } nPair = len * (len - 1) / 2; if(cor) { denominator1 = nPair - m1; denominator2 = nPair - m2; return s / sqrt(denominator1) / sqrt(denominator2); } else { /* Return covariance. */ return 2 * s; } } #ifdef kendallTest int main() { double a[100], b[100]; double smallNCor, smallNCov, largeNCor, largeNCov; int i; /* Test the small N version against a few values obtained from the old * version in R. Only exercising the small N version because the large * N version requires the first array to be sorted and the second to be * reordered in lockstep before it's called.*/ { double a1[] = {1,2,3,5,4}; double a2[] = {1,2,3,3,5}; assert(kendallSmallN(a1, a2, 5, 1) - 0.7378648 < 0.00001); assert(kendallSmallN(a1, a2, 5, 0) == 14); double b1[] = {8,6,7,5,3,0,9}; double b2[] = {3,1,4,1,5,9,2}; assert(kendallSmallN(b1, b2, 7, 1) + 0.39036 < 0.00001); assert(kendallSmallN(b1, b2, 7, 0) == -16); double c1[] = {1,1,1,2,3,3,4,4}; double c2[] = {1,2,1,3,3,5,5,5}; assert(kendallSmallN(c1, c2, 8, 1) - 0.8695652 < 0.00001); assert(kendallSmallN(c1, c2, 8, 0) == 40); } /* Now that we're confident that the simple, small N version works, * extensively test it against the much more complex and bug-prone * O(N log N) version. */ for(i = 0; i < 10000; i++) { int j, len; for(j = 0; j < 100; j++) { a[j] = rand() % 30; b[j] = rand() % 30; } len = rand() % 50 + 50; /* The large N version assumes that the first array is sorted. This * will usually be made true in R before passing the arrays to the * C functions. */ insertionSort(a, len); if(i & 1) { /* Test correlation on odd iterations, covariance on even ones. * Can't test both on every iteration because the large N * impl destroys the contents of the arrays passed in.*/ smallNCor = kendallSmallN(a, b, len, 1); largeNCor = kendallNlogN(a, b, len, 1); assert(largeNCor == smallNCor); } else { smallNCov = kendallSmallN(a, b, len, 0); largeNCov = kendallNlogN(a, b, len, 0); assert(largeNCov == smallNCov); } } printf("Passed all tests.\n"); /* Speed test. Compare the O(N^2) version, which is very similar to * R's current impl, to my O(N log N) version. */ { const int N = 30000; double *foo, *bar, *buf; size_t i; double startTime, stopTime; foo = (double*) malloc(N * sizeof(double)); bar = (double*) malloc(N * sizeof(double)); for(i = 0; i < N; i++) { foo[i] = rand(); bar[i] = rand(); } startTime = clock(); kendallSmallN(foo, bar, N, 1); stopTime = clock(); printf("O(N^2) version: %f milliseconds\n", stopTime - startTime); startTime = clock(); /* Only sorting first array. Normally the second one would be * reordered in lockstep. */ buf = (double*) malloc(N * sizeof(double)); mergeSort(foo, buf, N); kendallNlogN(foo, bar, N, 1); stopTime = clock(); printf("O(N log N) version: %f milliseconds\n", stopTime - startTime); } return 0; } #endif