namespace
Senzee_AlistairMilneAlgorithm_1
{
union
Vector4 { struct { float
a, b, c, d; }; float f[4]; };
__forceinline
void Swap_MMX(Vector4 *a, Vector4 *b)
{
__asm
{
mov eax,
a
mov ecx,
b
movaps xmm0, [eax]
movaps xmm1, [ecx]
movaps [eax], xmm1
movaps [ecx], xmm0
}
}
__forceinline
void CompareSwap_MMX(Vector4 *a, Vector4 *b, unsigned i)
{
__asm
{
mov
eax, a
mov ecx,
b
mov edx,
i
movss xmm0, [eax + edx * 4]
movss xmm1, [ecx + edx * 4]
comiss xmm0, xmm1
jbe no_swap
movaps xmm0, [eax]
movaps xmm1, [ecx]
movaps [eax], xmm1
movaps [ecx], xmm0
no_swap:
}
}
float
GetMinDistSq_MMX(const Vector4 *lo, const Vector4 *hi)
{
float
mind = 0.f, flt_max = FLT_MAX;
__asm
{
movss xmm2, flt_max
mov eax,
lo
mov edi,
hi
loop_0:
cmp eax,
edi
jge done
movaps xmm0, [eax] // pa
lea edx,
[eax + 16]
loop_1:
movaps xmm1, [edx] // pb
subps xmm1, xmm0
mulps xmm1, xmm1
movaps xmm3, xmm1
psrldq xmm1, 4
addss xmm3, xmm1
psrldq xmm1, 4
addss xmm3, xmm1
comiss xmm3, xmm2
jae no_set
movss xmm2, xmm3
no_set:
add edx,
16
cmp edx,
edi
jl loop_1
add eax,
16
jmp loop_0
done:
movss mind, xmm2
}
return
mind;
}
Vector4 *PartitionMedian3(Vector4 *lo,
Vector4 *hi, unsigned c)
{
Vector4 *i, *j, *mid = lo + ((hi - lo)
>> 1);
/*
if ( lo->f[c] > hi->f[c]) Swap_MMX(lo, hi);
if ( lo->f[c] >
mid->f[c]) Swap_MMX(lo, mid);
if ( hi->f[c] >
mid->f[c]) Swap_MMX(mid, hi); // median of 3 - put middle in hi spot
float fc = hi->f[c];
for (j = lo; j < hi; j++)
if (j->f[c] <= fc)
Swap_MMX(i++, j);
*/
CompareSwap_MMX(lo, hi, c);
CompareSwap_MMX(lo, mid, c);
CompareSwap_MMX(hi, mid, c); // median of 3 - put middle in hi spot
__asm
{
mov edi,
lo // j
mov esi,
lo // i
mov edx,
hi
mov ecx,
c
movss xmm0, [edx + ecx * 4]
top:
cmp edi,
edx
jae done
movss xmm2, [edi + ecx * 4]
comiss xmm0, xmm2
ja no_swap
movaps xmm1, [edi]
movaps xmm2, [esi]
movaps [esi], xmm1
movaps [edi], xmm2
add
esi, 16
no_swap:
add edi, 16
jmp top
done:
mov i, esi
mov j, edi
}
Swap_MMX(i, j);
return
i;
}
Vector4 *PartitionByDistance(Vector4 *lo,
Vector4 *hi, Vector4 *pivot, unsigned c, float distancesq) // based
on distance
{
Vector4 *i, *j;
Swap_MMX(hi, pivot);
/*
float fc = hi->f[c], diff
= 0.f;
for (j = lo; j < hi; j++)
{
diff = j->f[c] - fc;
if (diff * diff <=
dist)
Swap_MMX(i++, j);
}
*/
__asm
{
mov edi,
lo // j
mov esi,
lo // i
mov edx,
hi
mov ecx,
c
movss xmm0, [edx + ecx * 4]
movss xmm3, distancesq
top:
cmp edi,
edx
jae done
movss xmm2, [edi + ecx * 4]
subss xmm2, xmm0
mulss xmm2, xmm2
comiss xmm2, xmm3
ja
no_swap
movaps xmm1, [edi]
movaps xmm2, [esi]
movaps [esi], xmm1
movaps [edi], xmm2
add esi, 16
no_swap:
add edi, 16
jmp top
done:
mov i, esi
mov j, edi
}
Swap_MMX(i, j);
return
i;
}
enum {
N2_CUTOFF = 23 }; // tested best value
__declspec(align(16)) Vector4 gStaticArray[CLOUD_ELEMENTS];
float gMind
= FLT_MAX;
float
GetMinDistSquared(Vector4 *lo, Vector4 *hi, unsigned
depth)
{
recurse:
if (lo
+ N2_CUTOFF < hi)
{
unsigned
c = depth % 3;
//
partition segment
Vector4 *q = PartitionMedian3(lo,
hi, c);
// now
recurse, divide and conquer
GetMinDistSquared(lo, q - 1, depth
+ 1);
GetMinDistSquared(q + 1, hi, depth
+ 1);
// now
get any 'smallest' pairs that cross a partition
q = PartitionByDistance(lo, hi, q,
c, gMind);
depth++; hi = q; // tail recurse to get min distance from crossing area
goto
recurse;
}
else if (lo < hi)
{
float
d = GetMinDistSq_MMX(lo, hi + 1);
if
(d < gMind) gMind = d;
}
return
gMind;
}
void
CopyToVector4_MMX(const PointC *points, unsigned count, Vector4 *vectors)
{
/*
// copy points to four
component vectors
for (Vector4 *p = points, *e
= points + count; p < e; p++)
{
const PointC &p3 =
*points3;
Vector4 p4 = { p3.x,
p3.y, p3.z, 0.f };
*p = p4;
points3++;
}
*/
__asm
{
mov eax,
points
mov ecx,
vectors
mov edi,
count
imul edi,
12
add edi,
eax
loop_0:
movups xmm0,
[eax]
cmp eax,
edi
jge done
movaps [ecx], xmm0
add eax,
12
add ecx,
16
jmp loop_0
done:
}
}
float
GetMinDist(CloudC *pCloud)
{
PointC
*points3 = pCloud->GetPointBuffer();
unsigned count
= pCloud->GetNumPoints();
Vector4
*points = gStaticArray;
CopyToVector4_MMX(points3, count,
points);
gMind = FLT_MAX;
return
sqrtf(GetMinDistSquared(points, points + (count - 1), 0));
}
}