I created a version of Michael Herf's radix sort implementation that returns the indices of the correct order instead of the sorted values. This approach is generally more useful.
The only problem that this way I need an additional lookup when I need the values for the sorting passes and it's extremely degrades the performance:
Original: 50M values in 850msec (average)
Edited: 50M values in 10000msec (average)
Could someone improve it? ...or should I use parallelization?
typedef unsigned long uint32;
typefef float float32;
typedef const char *cpointer;
/**************************************************************************************************
Use SSE prefetch (needs compiler support), not really a problem on non-SSE machines.
**************************************************************************************************/
#define PREFETCH 1
#if PREFETCH
#include <xmmintrin.h> // for prefetch
#define pfval 64
#define pfval2 128
#define pf(x) _mm_prefetch(cpointer(x + i + pfval), 0)
#define pf2(x) _mm_prefetch(cpointer(x + i + pfval2), 0)
#else
#define pf(x)
#define pf2(x)
#endif
__forceinline uint32 FloatFlip2(uint32 &f)
{
uint32 mask = -int32(f >> 31) | 0x80000000;
return (f ^= mask);
}
#define _0(x) (x & 0x7FF)
#define _1(x) (x >> 11 & 0x7FF)
#define _2(x) (x >> 22 )
sort(float32 *keys, uint32 *indices, uint32 *indices2, uint32 numKeys)
{
// indices2 is the temp array
uint32 i;
uint32 *input = (uint32*)keys;
// 3 histograms on the stack:
const uint32 kHist = 2048;
uint32 b0[kHist * 3];
uint32 *b1 = b0 + kHist;
uint32 *b2 = b1 + kHist;
for (i = 0; i < kHist * 3; i++) {
b0[i] = 0;
}
//memset(b0, 0, kHist * 12);
// 1. Parallel histogramming pass
for (i = 0; i < numKeys; i++)
{
pf(input);
uint32 fi = FloatFlip2((uint32&)input[i]);
b0[_0(fi)] ++;
b1[_1(fi)] ++;
b2[_2(fi)] ++;
}
// 2. Sum the histograms - each histogram entry records the number of values preceding itself
{
uint32 sum0 = 0, sum1 = 0, sum2 = 0;
uint32 tsum;
for (i = 0; i < kHist; i++)
{
tsum = b0[i] + sum0;
b0[i] = sum0 - 1;
sum0 = tsum;
tsum = b1[i] + sum1;
b1[i] = sum1 - 1;
sum1 = tsum;
tsum = b2[i] + sum2;
b2[i] = sum2 - 1;
sum2 = tsum;
}
}
// byte 0: read/write histogram
// -> indices
for (i = 0; i < numKeys; i++)
{
pf2(input);
//uint32 ind = i;
uint32 key = input[i];
uint32 pos = _0(key);
uint32 index = ++b0[pos];
indices[index] = i;
}
// byte 1: read/write histogram, copy
// indices -> indices2
for (i = 0; i < numKeys; i++)
{
pf2(indices);
pf2(input);
uint32 ind = indices[i];
uint32 key = input[ind]; // additional lookup
uint32 pos = _1(key);
uint32 index = ++b1[pos];
indices2[index] = ind;
}
// byte 2: read/write histogram, copy
// indices2 -> indices
for (i = 0; i < numKeys; i++)
{
pf2(indices);
pf2(input);
uint32 ind = indices2[i];
uint32 key = input[ind]; // additional lookup
uint32 pos = _2(key);
uint32 index = ++b2[pos];
indices[index] = ind;
}
}
The original implementation:
...
__forceinline uint32 IFloatFlip(uint32 f)
{
uint32 mask = ((f >> 31) - 1) | 0x80000000;
return f ^ mask;
}
void RadixSort11(float32 *farray, float32 *sorted, uint32 elements)
{
uint32 i;
uint32 *sort = (uint32*)sorted;
uint32 *array = (uint32*)farray;
// 3 histograms on the stack:
const uint32 kHist = 2048;
uint32 b0[kHist * 3];
uint32 *b1 = b0 + kHist;
uint32 *b2 = b1 + kHist;
for (i = 0; i < kHist * 3; i++) {
b0[i] = 0;
}
//memset(b0, 0, kHist * 12);
// 1. parallel histogramming pass
//
for (i = 0; i < elements; i++)
{
pf(array);
uint32 fi = FloatFlip((uint32&)array[i]);
b0[_0(fi)] ++;
b1[_1(fi)] ++;
b2[_2(fi)] ++;
}
// 2. Sum the histograms -- each histogram entry records the number of values preceding itself.
{
uint32 sum0 = 0, sum1 = 0, sum2 = 0;
uint32 tsum;
for (i = 0; i < kHist; i++)
{
tsum = b0[i] + sum0;
b0[i] = sum0 - 1;
sum0 = tsum;
tsum = b1[i] + sum1;
b1[i] = sum1 - 1;
sum1 = tsum;
tsum = b2[i] + sum2;
b2[i] = sum2 - 1;
sum2 = tsum;
}
}
// byte 0: floatflip entire value, read/write histogram, write out flipped
for (i = 0; i < elements; i++)
{
uint32 fi = array[i];
FloatFlipX(fi);
uint32 pos = _0(fi);
pf2(array);
sort[++b0[pos]] = fi;
}
// byte 1: read/write histogram, copy
// sorted -> array
for (i = 0; i < elements; i++)
{
uint32 si = sort[i];
uint32 pos = _1(si);
pf2(sort);
array[++b1[pos]] = si;
}
// byte 2: read/write histogram, copy & flip out
// array -> sorted
for (i = 0; i < elements; i++)
{
uint32 ai = array[i];
uint32 pos = _2(ai);
pf2(array);
sort[++b2[pos]] = IFloatFlip(ai);
}
// to write original:
// memcpy(array, sorted, elements * 4);
}