Tell me more ×
Code Review Stack Exchange is a question and answer site for peer programmer code reviews. It's 100% free, no registration required.

I created a version of Michael Herf's radix sort implementation that returns the indices of the correct order instead of the sorted values. This approach is generally more useful.

The only problem that this way I need an additional lookup when I need the values for the sorting passes and it's extremely degrades the performance:

Original: 50M values in 850msec (average)

Edited: 50M values in 10000msec (average)

Could someone improve it? ...or should I use parallelization?

typedef unsigned long uint32;    
typefef float float32;
typedef const char *cpointer;

/**************************************************************************************************
    Use SSE prefetch (needs compiler support), not really a problem on non-SSE machines.
**************************************************************************************************/
#define PREFETCH 1

#if PREFETCH
    #include <xmmintrin.h>  // for prefetch
    #define pfval   64
    #define pfval2  128
    #define pf(x)   _mm_prefetch(cpointer(x + i + pfval), 0)
    #define pf2(x)  _mm_prefetch(cpointer(x + i + pfval2), 0)
#else
    #define pf(x)
    #define pf2(x)
#endif

__forceinline uint32 FloatFlip2(uint32 &f)
{
    uint32 mask = -int32(f >> 31) | 0x80000000;
    return (f ^= mask);
}    

#define _0(x)   (x & 0x7FF)
#define _1(x)   (x >> 11 & 0x7FF)
#define _2(x)   (x >> 22 )


sort(float32 *keys, uint32 *indices, uint32 *indices2, uint32 numKeys)
{
    // indices2 is the temp array

    uint32 i;
    uint32 *input = (uint32*)keys;

    // 3 histograms on the stack:
    const uint32 kHist = 2048;
    uint32 b0[kHist * 3];

    uint32 *b1 = b0 + kHist;
    uint32 *b2 = b1 + kHist;

    for (i = 0; i < kHist * 3; i++) {
        b0[i] = 0;
    }
    //memset(b0, 0, kHist * 12);

    // 1. Parallel histogramming pass
    for (i = 0; i < numKeys; i++)
    {   
        pf(input);

        uint32 fi = FloatFlip2((uint32&)input[i]);

        b0[_0(fi)] ++;
        b1[_1(fi)] ++;
        b2[_2(fi)] ++;
    }

    // 2.  Sum the histograms - each histogram entry records the number of values preceding itself
    {
        uint32 sum0 = 0, sum1 = 0, sum2 = 0;
        uint32 tsum;
        for (i = 0; i < kHist; i++)
        {
            tsum = b0[i] + sum0;
            b0[i] = sum0 - 1;
            sum0 = tsum;

            tsum = b1[i] + sum1;
            b1[i] = sum1 - 1;
            sum1 = tsum;

            tsum = b2[i] + sum2;
            b2[i] = sum2 - 1;
            sum2 = tsum;
        }
    }

    // byte 0: read/write histogram
    // -> indices
    for (i = 0; i < numKeys; i++)
    {
        pf2(input);

        //uint32 ind = i;
        uint32 key = input[i];

        uint32 pos = _0(key);
        uint32 index = ++b0[pos];

        indices[index] = i;
    }

    // byte 1: read/write histogram, copy
    //   indices -> indices2
    for (i = 0; i < numKeys; i++)
    {
        pf2(indices);
        pf2(input);

        uint32 ind = indices[i];
        uint32 key = input[ind]; // additional lookup

        uint32 pos = _1(key);
        uint32 index = ++b1[pos];

        indices2[index] = ind;
    }

    // byte 2: read/write histogram, copy
    //   indices2 -> indices
    for (i = 0; i < numKeys; i++)
    {
        pf2(indices);
        pf2(input);

        uint32 ind = indices2[i];
        uint32 key = input[ind]; // additional lookup

        uint32 pos = _2(key);
        uint32 index = ++b2[pos];

        indices[index] = ind;
    }
}

The original implementation:

...

__forceinline uint32 IFloatFlip(uint32 f)
{
    uint32 mask = ((f >> 31) - 1) | 0x80000000;
    return f ^ mask;
}

void RadixSort11(float32 *farray, float32 *sorted, uint32 elements)
{
    uint32 i;
    uint32 *sort = (uint32*)sorted;
    uint32 *array = (uint32*)farray;

    // 3 histograms on the stack:
    const uint32 kHist = 2048;
    uint32 b0[kHist * 3];

    uint32 *b1 = b0 + kHist;
    uint32 *b2 = b1 + kHist;

    for (i = 0; i < kHist * 3; i++) {
        b0[i] = 0;
    }
    //memset(b0, 0, kHist * 12);

    // 1.  parallel histogramming pass
    //
    for (i = 0; i < elements; i++)
    {   
        pf(array);

        uint32 fi = FloatFlip((uint32&)array[i]);

        b0[_0(fi)] ++;
        b1[_1(fi)] ++;
        b2[_2(fi)] ++;
    }

    // 2.  Sum the histograms -- each histogram entry records the number of values preceding itself.
    {
        uint32 sum0 = 0, sum1 = 0, sum2 = 0;
        uint32 tsum;
        for (i = 0; i < kHist; i++)
        {
            tsum = b0[i] + sum0;
            b0[i] = sum0 - 1;
            sum0 = tsum;

            tsum = b1[i] + sum1;
            b1[i] = sum1 - 1;
            sum1 = tsum;

            tsum = b2[i] + sum2;
            b2[i] = sum2 - 1;
            sum2 = tsum;
        }
    }

    // byte 0: floatflip entire value, read/write histogram, write out flipped
    for (i = 0; i < elements; i++)
    {
        uint32 fi = array[i];
        FloatFlipX(fi);
        uint32 pos = _0(fi);

        pf2(array);
        sort[++b0[pos]] = fi;
    }

    // byte 1: read/write histogram, copy
    //   sorted -> array
    for (i = 0; i < elements; i++)
    {
        uint32 si = sort[i];
        uint32 pos = _1(si);
        pf2(sort);
        array[++b1[pos]] = si;
    }

    // byte 2: read/write histogram, copy & flip out
    //   array -> sorted
    for (i = 0; i < elements; i++)
    {
        uint32 ai = array[i];
        uint32 pos = _2(ai);

        pf2(array);
        sort[++b2[pos]] = IFloatFlip(ai);
    }

    // to write original:
    // memcpy(array, sorted, elements * 4);
}
share|improve this question

Know someone who can answer? Share a link to this question via email, Google+, Twitter, or Facebook.

Your Answer

 
discard

By posting your answer, you agree to the privacy policy and terms of service.

Browse other questions tagged or ask your own question.