// Author:      Colin Andrews

// Institution: Maxis Emeryville

// email:       candrews@maxis.com

namespace ColinAndrews_ElectronicArtsMaxis_sub2

{

    // hopefully you'll link us against the multi-threaded libraries

    // that was not clear from the insructions

    extern "C" {

        uintptr_t __cdecl _beginthread (void (__cdecl *) (void *), unsigned, void *);

    }

 

    typedef unsigned int size_t;

 

    /// quick_sort stuff - All massaged from Paul Pedriana's EASTL implementations

 

    static const int kQuickSortLimit = 28;

 

    // TODO: template based on value_type unnecessary - Always PointC

    //   it would be interesting to compare the two if there's a performance

    //   difference in the generated code

    template <typename T, typename Compare>

        inline const T& median(const T& a, const T& b, const T& c, Compare compare)

    {

        if(compare(a, b))

        {

            if(compare(b, c))

                return b;

            else if(compare(a, c))

                return c;

            else

                return a;

        }

        else if(compare(a, c))

            return a;

        else if(compare(b, c))

            return c;

        return b;

    }

 

    inline size_t Log2(size_t n)

    {

        int i;

        for(i = 0; n; ++i)

            n >>= 1;

        return i - 1;

    }

 

    template <typename value_type>

        inline void simple_swap(value_type& a, value_type& b)

    {

        value_type tmp(a);

        a = b;

        b = tmp;

    }

 

    template <typename value_type, typename Compare>

        inline value_type* get_partition(value_type* first, value_type* last, value_type pivotValue, Compare compare)

    {

        for(; ; ++first)

        {

            while(compare(*first, pivotValue))

                ++first;

            --last;

 

            while(compare(pivotValue, *last))

                --last;

 

            if(first >= last)

                return first;

 

            simple_swap(*first, *last);

        }

    }

 

 

    template <typename value_type, typename StrictWeakOrdering>

        void insertion_sort(value_type* first, value_type* last, StrictWeakOrdering compare)

    {

        if(first != last)

        {

            value_type* iCurrent;

            value_type* iNext;

            value_type* iSorted = first;

 

            for(++iSorted; iSorted != last; ++iSorted)

            {

                const value_type temp(*iSorted);

 

                iNext = iCurrent = iSorted;

 

                for(--iCurrent; (iNext != first) && compare(temp, *iCurrent); --iNext, --iCurrent)

                    *iNext = *iCurrent;

                *iNext = temp;

            }

        }

    } // insertion_sort

 

    template <typename value_type, typename Compare>

        inline void insertion_sort_simple(value_type* first, value_type* last, Compare compare)

    {

        for(value_type* current = first; current != last; ++current)

        {

            value_type* end(current);

            value_type* prev(current);

            const value_type     value(*current);

 

            for(--prev; compare(value, *prev); --end, --prev)

                *end = *prev;

            *end = value;

        }

    }

 

    template <typename value_type, typename Compare>

        inline void quick_sort_impl(value_type* first, value_type* last, size_t kRecursionCount, Compare compare)

    {

 

        while(((last - first) > kQuickSortLimit) && (kRecursionCount >

            0))

        {

            value_type* position(get_partition<value_type,

                Compare>(first, last, median<value_type, Compare>(*first, *(first +

                (last - first) / 2), *(last - 1), compare), compare));

 

            quick_sort_impl<value_type, Compare>(position, last, --kRecursionCount, compare);

            last = position;

        }

 

        if(kRecursionCount == 0)

            insertion_sort_simple<value_type, Compare>(first, last, compare); // replaced partial_sort

    }

 

    template <typename value_type, typename Compare>

        void quick_sort(value_type* first, value_type* last, Compare

        compare)

    {

        if(first != last)

        {

            quick_sort_impl<value_type, Compare>(first, last, 2 * Log2(size_t(last - first)), compare);

 

            if((last - first) > (size_t)kQuickSortLimit)

            {

                insertion_sort<value_type, Compare>(first, first + kQuickSortLimit, compare);

                insertion_sort_simple<value_type, Compare>(first + kQuickSortLimit, last, compare);

            }

            else

                insertion_sort<value_type, Compare>(first, last, compare);

        }

    }

 

 

 

    /***********************************************************************

    *******/

    /*! GetMinDist

 

    \brief  Returns the distance between the two "Nearest Neighbors" in the Cloud.

 

    \param  pCloud  - CloudC pointer

    \return         - float.

 

    This is my second submission which is only a minor refinement of my original

    solution. I was pretty certain that it would be easy to improve upon my original

    implementation. To my surprise, only this implementation here has given any

    (though only slight) improvement. I experimented with an SSE optimization of

    my original algorithm, but that only made it worse. That's not to say there isn't

    a way to do it with SSE, I'm just not skill3d at low level hacking.

    Also,

    everything I tried was based on my original axis sort idea. I found one paper

    that presents a different algorithm that makes only one pass through the points,

    inserting them all into a tree structure. Lack of time and my hunch that the

    overhead of building and maintaining would be to much the data structure prevented

    me from trying that approach.

 

    My original idea starts by sorting the points along a major axis.Once the points

    are sorted on an axis, the brute force search can stop testing points that are

    farther apart on that axis than the current min-distance found.

 

    This refinement atempts to split the problem in two and spawn a thread to work

    on one half to hopefully make use of hyperthreading feature of Pentium 4's.

 

    Since the quicksort algorithm starts by partitioning the list into a high and

    a low portion it lends itself well to a multi-processor solution.

    After choosing

    the first pivot and partitioning the cloud in two sets, I spawn a worker thread

    to handle the smaller region. Then in both the worker and the main thread I

    complete the sort of each region and execute my original closest pair search.

    When both threads are done, the min distance is set to the min of the results

    found in each region. Of course there is the possibility that we are incredibly

    unlucky and the closest pair spans the boundary between the two regions. None

    of the test cases I used (random data and the stanford models) actually had a

    closest pair that spanned the pivot. I threw in a fixup search that looks within

    minDist of the boundary for a pair closer than the one found. With my luck, your

    test data will have the closest pair span the boundary and there'll be a bug

    in my fixup.

 

    This implementation gives much better performance on my true multi-cpu dev machines

    than my original, but on a single P4 system it gives only slightly improved times

    in my testing and in the occasional case worse times. This is likely due to the

    fact that hyper-threading is not really multi-processing. Also I've noticed

    a lot of variability of how much better the multi-thread version is depending

    on the input data. I interpret this as differences in how good a pivot is chosen

    at the start. The closer the pivot is to the actual median point along the sort axis,

    the more exqual the work that is done in each thread is an the smaller the time

    overall. Choosing a perfect median would of course require you to sort the points

    before hand so you just have to take your chances.

 

    Thanks for this challenge. I think it was a great problem and I've really enjoyed

    thinking about it. I look forward to hearing about the other solutions.

 

    */

 

    /***********************************************************************

    *******/

 

 

    // my distance function, I bet the speed freaks will figure out how to 

    // do theirs in integer math

    inline float xzdist2(PointC *pA, PointC *pB)

    {

        register float s(0.f);

        register float d(pA->x - pB->x);

        s += d*d;

        d = (pA->z - pB->z);

        return d*d + s;

    }

 

    inline float yd2(PointC *pA, PointC *pB)

    {

        register float d(pA->y - pB->y);

        return d*d;

    }

 

    struct compareX {

        inline bool operator()(const PointC& a, const PointC& b)

        {

            return a.x < b.x;

        }

    };

 

    struct compareY {

        inline bool operator()(const PointC& a, const PointC& b)

        {

            return a.y < b.y;

        }

    };

 

    struct compareZ {

        inline bool operator()(const PointC& a, const PointC& b)

        {

            return a.z < b.z;

        }

    };

 

    typedef compareY AxisCompare;

 

    struct ThreadData {

        unsigned int mbDone;

        PointC* mpStart;

        PointC* mpEnd;

        float mMin;

    };

 

 

    ////////////////////////////////////////////////////////////////////////

    ///////

    // sort_thread

    //    thread function to work on one portion of the full cloud - hopefully ~half

    //

    void sort_thread(void *pVoid) {

        ThreadData& threadData(*(ThreadData*)pVoid);

        PointC* pStart(threadData.mpStart);

        PointC* pEnd  (threadData.mpEnd);

        AxisCompare compare;

 

        quick_sort(pStart, pEnd, compare);

 

        float minDist = FLT_MAX;

 

        for (PointC * pSrc = pStart; pSrc != pEnd; ++pSrc)

        {

            for (PointC * pTst = pSrc+1; pTst != pEnd; ++pTst)

            {

                register float y2 = yd2(pSrc,pTst);

 

                if (y2 > minDist) { // list is sorted, need to search further for closest pair

                    break;

                }

 

                register float dist = y2 + xzdist2(pSrc,pTst);

 

                if (dist < minDist) {

                    minDist = dist;

                }

            }

        }

        threadData.mMin = minDist;

 

        threadData.mbDone = -1;

    }

 

 

    ////////////////////////////////////////////////////////////////////////

    ///////

    // BoundaryCheck

    //    Checks for a closer pair that spans the boundary between two regions

    //   of a y sorted cloud where the closest distance in either region has

    //   already been determined to be fMin

    inline float BoundaryCheck(float minDist, PointC *pBase, PointC *pMidPoint, PointC *pEnd) {

        PointC *pUpper(pMidPoint);

        PointC *pLower(pMidPoint-1);

        register float  y2(yd2(pLower,pUpper));

 

        while (y2<minDist) { // outer loop bumps lower

            while (y2<minDist) { // inner loop bumps upper

                register float dist(y2+xzdist2(pLower,pUpper));

                if (dist < minDist) {

                    minDist=dist;

                }

 

                ++pUpper;

                if (pUpper!=pEnd) {

                    break;

                }

                y2 = yd2(pLower, pUpper);

            }

 

            // decrement lower, reset upper

            --pLower;

            if (pLower<pBase) {

                break;

            }

 

            pUpper = pMidPoint;

            y2 = yd2(pLower, pUpper);

        }

 

        return minDist;

    }

 

    // Still not very hacky - I'll some speed freaks will do better than this

    float GetMinDist(CloudC *pCloud)

    {

        PointC * pBuffer            = pCloud->GetPointBuffer();

        unsigned long uiNumPoints   = pCloud->GetNumPoints();

        PointC * pBufferEnd = pBuffer + uiNumPoints;

 

        AxisCompare compare;

        PointC* pBufferMid(get_partition<PointC, AxisCompare>(pBuffer, pBufferEnd, median<PointC, AxisCompare>(*pBuffer, *(pBuffer + (pBufferEnd - pBuffer) / 2), *(pBufferEnd - 1), compare), compare));

 

        ThreadData treadData;

 

        PointC* pStart; // range of this thread's sort

        PointC* pEnd;

        // give smaller sort to other thread

        if ((pBufferMid-pBuffer)< uiNumPoints/2) {

            treadData.mpStart = pBuffer;

            treadData.mpEnd = pBufferMid;

            pStart = pBufferMid+1;

            pEnd = pBufferEnd;

        }

        else {

            treadData.mpStart = pBufferMid+1; // +1 ?

            treadData.mpEnd = pBufferEnd;

            pStart = pBuffer;

            pEnd = pBufferMid;

        }

 

        treadData.mbDone = 0;

 

        _beginthread(sort_thread, 256, &treadData);

 

        quick_sort(pStart, pEnd, compare);

 

        float minDist = FLT_MAX;

 

        for (PointC * pSrc = pStart; pSrc != pEnd; ++pSrc)

        {

            for (PointC * pTst = pSrc+1; pTst != pEnd; ++pTst)

            {

                float y2 = yd2(pSrc,pTst);

 

                if (y2 > minDist) { // list is sorted, need to search further for closest pair

                    break;

                }

 

                float dist = y2 + xzdist2(pSrc,pTst);

 

                if (dist < minDist) {

                    minDist = dist;

                }

            }

        }

 

        // wait for second thread signal

        while (!treadData.mbDone);

 

        if (treadData.mMin<minDist) { // not really done

            minDist = treadData.mMin;

        }

 

        // have to check that the closest pair didn't span the midpoint

        minDist = BoundaryCheck(minDist, pBuffer, pBufferMid, pBufferEnd);

 

        // return the distance between the two closest points

        return sqrtf(minDist); // not actually correct

    }

 

} // end namespace