// Author: Colin Andrews
// Institution:
Maxis Emeryville
// email: candrews@maxis.com
namespace
ColinAndrews_ElectronicArtsMaxis_sub2
{
// hopefully
you'll link us against the multi-threaded libraries
// that was not
clear from the insructions
extern
"C" {
uintptr_t __cdecl _beginthread (void (__cdecl *) (void *), unsigned, void *);
}
typedef unsigned
int size_t;
/// quick_sort stuff - All massaged from Paul Pedriana's EASTL implementations
static
const int kQuickSortLimit
= 28;
// TODO: template
based on value_type unnecessary - Always PointC
// it would be interesting to compare the two
if there's a performance
// difference in the generated code
template
<typename T, typename Compare>
inline const
T& median(const T& a, const T& b, const
T& c, Compare compare)
{
if(compare(a, b))
{
if(compare(b, c))
return b;
else if(compare(a,
c))
return c;
else
return a;
}
else if(compare(a,
c))
return a;
else if(compare(b,
c))
return c;
return b;
}
inline
size_t Log2(size_t n)
{
int i;
for(i = 0; n; ++i)
n >>= 1;
return i - 1;
}
template
<typename value_type>
inline void simple_swap(value_type& a, value_type& b)
{
value_type
tmp(a);
a = b;
b = tmp;
}
template
<typename value_type, typename
Compare>
inline value_type* get_partition(value_type* first, value_type* last, value_type pivotValue, Compare compare)
{
for(; ; ++first)
{
while(compare(*first, pivotValue))
++first;
--last;
while(compare(pivotValue,
*last))
--last;
if(first >= last)
return first;
simple_swap(*first, *last);
}
}
template
<typename value_type, typename
StrictWeakOrdering>
void insertion_sort(value_type* first, value_type*
last, StrictWeakOrdering compare)
{
if(first != last)
{
value_type*
iCurrent;
value_type*
iNext;
value_type*
iSorted = first;
for(++iSorted; iSorted != last; ++iSorted)
{
const value_type
temp(*iSorted);
iNext = iCurrent = iSorted;
for(--iCurrent; (iNext != first) && compare(temp, *iCurrent); --iNext, --iCurrent)
*iNext
= *iCurrent;
*iNext
= temp;
}
}
} // insertion_sort
template
<typename value_type, typename
Compare>
inline void insertion_sort_simple(value_type*
first, value_type* last, Compare compare)
{
for(value_type*
current = first; current != last; ++current)
{
value_type*
end(current);
value_type*
prev(current);
const value_type value(*current);
for(--prev;
compare(value, *prev); --end, --prev)
*end = *prev;
*end = value;
}
}
template
<typename value_type, typename
Compare>
inline void quick_sort_impl(value_type*
first, value_type* last, size_t
kRecursionCount, Compare compare)
{
while(((last - first) > kQuickSortLimit)
&& (kRecursionCount >
0))
{
value_type*
position(get_partition<value_type,
Compare>(first,
last, median<value_type, Compare>(*first,
*(first +
(last
- first) / 2), *(last - 1), compare), compare));
quick_sort_impl<value_type, Compare>(position,
last, --kRecursionCount, compare);
last =
position;
}
if(kRecursionCount
== 0)
insertion_sort_simple<value_type, Compare>(first,
last, compare); // replaced partial_sort
}
template
<typename value_type, typename
Compare>
void quick_sort(value_type* first, value_type*
last, Compare
compare)
{
if(first != last)
{
quick_sort_impl<value_type, Compare>(first,
last, 2 * Log2(size_t(last - first)), compare);
if((last - first) > (size_t)kQuickSortLimit)
{
insertion_sort<value_type, Compare>(first,
first + kQuickSortLimit, compare);
insertion_sort_simple<value_type, Compare>(first + kQuickSortLimit, last, compare);
}
else
insertion_sort<value_type, Compare>(first,
last, compare);
}
}
/***********************************************************************
*******/
/*! GetMinDist
\brief Returns the distance between the two
"Nearest Neighbors" in the Cloud.
\param pCloud - CloudC pointer
\return - float.
This is my second submission which is only
a minor refinement of my original
solution. I was
pretty certain that it would be easy to improve upon my original
implementation. To
my surprise, only this implementation here has given any
(though only
slight) improvement. I experimented with an SSE optimization of
my original
algorithm, but that only made it worse. That's not to say there isn't
a way to do it
with SSE, I'm just not skill3d at low level hacking.
Also,
everything I tried
was based on my original axis sort idea. I found one paper
that presents a
different algorithm that makes only one pass through the points,
inserting them all
into a tree structure. Lack of time and my hunch that the
overhead of
building and maintaining would be to much the data structure prevented
me from trying
that approach.
My original idea starts by sorting the
points along a major axis.Once the points
are sorted on an
axis, the brute force search can stop testing points that are
farther apart on
that axis than the current min-distance found.
This refinement atempts
to split the problem in two and spawn a thread to work
on one half to
hopefully make use of hyperthreading feature of
Pentium 4's.
Since the quicksort
algorithm starts by partitioning the list into a high and
a low portion it
lends itself well to a multi-processor solution.
After choosing
the first pivot
and partitioning the cloud in two sets, I spawn a worker thread
to handle the
smaller region. Then in both the worker and the main thread I
complete the sort
of each region and execute my original closest pair search.
When both threads are done, the min
distance is set to the min of the results
found in each
region. Of course there is the possibility that we are incredibly
unlucky and the
closest pair spans the boundary between the two regions. None
of the test cases
I used (random data and the stanford models) actually
had a
closest pair that
spanned the pivot. I threw in a fixup search that
looks within
minDist
of the boundary for a pair closer than the one found. With my luck, your
test data will
have the closest pair span the boundary and there'll be a bug
in my fixup.
This implementation gives much better
performance on my true multi-cpu dev machines
than my original,
but on a single P4 system it gives only slightly improved times
in my testing and
in the occasional case worse times. This is likely due to the
fact that
hyper-threading is not really multi-processing. Also I've noticed
a lot of
variability of how much better the multi-thread version is depending
on the input data.
I interpret this as differences in how good a pivot is chosen
at the start. The
closer the pivot is to the actual median point along the sort axis,
the more exqual the work that is done in each thread is an the
smaller the time
overall. Choosing
a perfect median would of course require you to sort the points
before hand so you
just have to take your chances.
Thanks for this challenge. I think it was a
great problem and I've really enjoyed
thinking about it.
I look forward to hearing about the other solutions.
*/
/***********************************************************************
*******/
// my distance
function, I bet the speed freaks will figure out how to
// do theirs in
integer math
inline
float xzdist2(PointC
*pA, PointC *pB)
{
register float
s(0.f);
register float
d(pA->x - pB->x);
s += d*d;
d = (pA->z
- pB->z);
return d*d + s;
}
inline
float yd2(PointC *pA, PointC *pB)
{
register float
d(pA->y - pB->y);
return d*d;
}
struct compareX
{
inline bool operator()(const PointC& a, const PointC& b)
{
return a.x < b.x;
}
};
struct compareY
{
inline bool operator()(const PointC& a, const PointC& b)
{
return a.y < b.y;
}
};
struct compareZ
{
inline bool operator()(const PointC& a, const PointC& b)
{
return a.z < b.z;
}
};
typedef compareY
AxisCompare;
struct ThreadData
{
unsigned int mbDone;
PointC* mpStart;
PointC* mpEnd;
float mMin;
};
////////////////////////////////////////////////////////////////////////
///////
// sort_thread
// thread function to work on one portion of
the full cloud - hopefully ~half
//
void
sort_thread(void *pVoid) {
ThreadData&
threadData(*(ThreadData*)pVoid);
PointC* pStart(threadData.mpStart);
PointC* pEnd (threadData.mpEnd);
AxisCompare
compare;
quick_sort(pStart,
pEnd, compare);
float minDist =
FLT_MAX;
for (PointC * pSrc = pStart; pSrc != pEnd; ++pSrc)
{
for (PointC * pTst = pSrc+1; pTst != pEnd; ++pTst)
{
register float
y2 = yd2(pSrc,pTst);
if (y2 > minDist)
{ // list is sorted, need to search further for
closest pair
break;
}
register float
dist = y2 + xzdist2(pSrc,pTst);
if (dist < minDist)
{
minDist = dist;
}
}
}
threadData.mMin
= minDist;
threadData.mbDone
= -1;
}
////////////////////////////////////////////////////////////////////////
///////
// BoundaryCheck
// Checks for a closer pair that spans the
boundary between two regions
// of a y sorted cloud where the closest
distance in either region has
// already been determined to be fMin
inline
float BoundaryCheck(float minDist, PointC *pBase, PointC *pMidPoint, PointC *pEnd) {
PointC *pUpper(pMidPoint);
PointC *pLower(pMidPoint-1);
register float y2(yd2(pLower,pUpper));
while (y2<minDist)
{ // outer loop bumps lower
while (y2<minDist)
{ // inner loop bumps upper
register float
dist(y2+xzdist2(pLower,pUpper));
if (dist < minDist)
{
minDist=dist;
}
++pUpper;
if (pUpper!=pEnd) {
break;
}
y2 = yd2(pLower, pUpper);
}
//
decrement lower, reset upper
--pLower;
if (pLower<pBase) {
break;
}
pUpper = pMidPoint;
y2 = yd2(pLower, pUpper);
}
return minDist;
}
// Still not very
hacky - I'll some speed freaks will do better than
this
float
GetMinDist(CloudC *pCloud)
{
PointC * pBuffer = pCloud->GetPointBuffer();
unsigned long uiNumPoints = pCloud->GetNumPoints();
PointC * pBufferEnd = pBuffer + uiNumPoints;
AxisCompare compare;
PointC* pBufferMid(get_partition<PointC, AxisCompare>(pBuffer, pBufferEnd, median<PointC, AxisCompare>(*pBuffer, *(pBuffer + (pBufferEnd - pBuffer) / 2), *(pBufferEnd - 1),
compare), compare));
ThreadData treadData;
PointC* pStart; // range of this thread's
sort
PointC* pEnd;
// give
smaller sort to other thread
if ((pBufferMid-pBuffer)<
uiNumPoints/2) {
treadData.mpStart
= pBuffer;
treadData.mpEnd
= pBufferMid;
pStart = pBufferMid+1;
pEnd = pBufferEnd;
}
else {
treadData.mpStart
= pBufferMid+1; // +1 ?
treadData.mpEnd
= pBufferEnd;
pStart = pBuffer;
pEnd = pBufferMid;
}
treadData.mbDone
= 0;
_beginthread(sort_thread, 256, &treadData);
quick_sort(pStart,
pEnd, compare);
float minDist =
FLT_MAX;
for (PointC * pSrc = pStart; pSrc != pEnd; ++pSrc)
{
for (PointC * pTst = pSrc+1; pTst != pEnd; ++pTst)
{
float y2 = yd2(pSrc,pTst);
if (y2 > minDist)
{ // list is sorted, need to search further for
closest pair
break;
}
float dist = y2 + xzdist2(pSrc,pTst);
if (dist < minDist)
{
minDist = dist;
}
}
}
// wait for
second thread signal
while (!treadData.mbDone);
if (treadData.mMin<minDist) { // not really done
minDist = treadData.mMin;
}
// have to
check that the closest pair didn't span the midpoint
minDist
= BoundaryCheck(minDist, pBuffer, pBufferMid, pBufferEnd);
// return the
distance between the two closest points
return sqrtf(minDist); // not actually correct
}
} // end namespace