namespace Senzee_AlistairMilneAlgorithm_1

{

    union Vector4 { struct { float a, b, c, d; }; float f[4]; };

 

    __forceinline void Swap_MMX(Vector4 *a, Vector4 *b)

    {

        __asm

        {

            mov    eax,  a

            mov    ecx,  b

            movaps xmm0, [eax]

            movaps xmm1, [ecx]

            movaps [eax], xmm1

            movaps [ecx], xmm0

        }

    }

 

    __forceinline void CompareSwap_MMX(Vector4 *a, Vector4 *b, unsigned i)

    {

        __asm

        {

            mov    eax,  a

            mov    ecx,  b

            mov    edx,  i

            movss  xmm0, [eax + edx * 4]

            movss  xmm1, [ecx + edx * 4]

            comiss xmm0, xmm1

            jbe    no_swap

            movaps xmm0, [eax]

            movaps xmm1, [ecx]

            movaps [eax], xmm1

            movaps [ecx], xmm0

no_swap:

        }

    }

 

    float GetMinDistSq_MMX(const Vector4 *lo, const Vector4 *hi)

    {

        float mind = 0.f, flt_max = FLT_MAX;

        __asm          

            {

            movss  xmm2, flt_max

                  mov    eax,  lo

            mov    edi,  hi

loop_0:

            cmp    eax,  edi

            jge    done

                  movaps xmm0, [eax]         // pa

                  lea    edx,  [eax + 16]

loop_1:

                  movaps xmm1, [edx]         // pb

                  subps  xmm1, xmm0

                  mulps  xmm1, xmm1

                  movaps xmm3, xmm1

                  psrldq xmm1, 4

                  addss  xmm3, xmm1

                  psrldq xmm1, 4

                  addss  xmm3, xmm1

            comiss xmm3, xmm2

            jae    no_set

                  movss  xmm2, xmm3

no_set:        

                  add    edx,  16

            cmp    edx,  edi

                  jl     loop_1

                  add    eax,  16

            jmp    loop_0

done:

            movss  mind, xmm2

        }

        return mind;

    }

 

    Vector4 *PartitionMedian3(Vector4 *lo, Vector4 *hi, unsigned c)

    {

        Vector4 *i, *j, *mid = lo + ((hi - lo) >> 1);

        /*

        if ( lo->f[c] >  hi->f[c]) Swap_MMX(lo,  hi);

        if ( lo->f[c] > mid->f[c]) Swap_MMX(lo, mid);

        if ( hi->f[c] > mid->f[c]) Swap_MMX(mid, hi); // median of 3 - put middle in hi spot

        float fc = hi->f[c];       

        for (j = lo; j < hi; j++)

            if (j->f[c] <= fc)

                Swap_MMX(i++, j);

        */

        CompareSwap_MMX(lo, hi,  c);

        CompareSwap_MMX(lo, mid, c);

        CompareSwap_MMX(hi, mid, c); // median of 3 - put middle in hi spot

        __asm

        {

            mov    edi,  lo // j

            mov    esi,  lo // i

            mov    edx,  hi

            mov    ecx,  c

            movss  xmm0, [edx + ecx * 4]

top:

            cmp    edi,   edx

            jae    done

            movss  xmm2, [edi + ecx * 4]

            comiss xmm0, xmm2

            ja     no_swap

            movaps xmm1, [edi]

            movaps xmm2, [esi]

            movaps [esi], xmm1

            movaps [edi], xmm2

            add    esi, 16

no_swap:

            add    edi, 16

            jmp    top

done:

            mov    i, esi

            mov    j, edi

        }

        Swap_MMX(i, j);

        return i;

    }

 

    Vector4 *PartitionByDistance(Vector4 *lo, Vector4 *hi, Vector4 *pivot, unsigned c, float distancesq) // based on distance

    {

        Vector4 *i, *j;

        Swap_MMX(hi, pivot);

        /*

        float fc = hi->f[c], diff = 0.f;

        for (j = lo; j < hi; j++)

        {

            diff = j->f[c] - fc;

            if (diff * diff <= dist)

                Swap_MMX(i++, j);

        }

        */

        __asm

        {

            mov    edi,  lo // j

            mov    esi,  lo // i

            mov    edx,  hi

            mov    ecx,  c

            movss  xmm0, [edx + ecx * 4]

            movss  xmm3, distancesq

top:

            cmp    edi,   edx

            jae    done

            movss  xmm2, [edi + ecx * 4]

            subss  xmm2, xmm0

            mulss  xmm2, xmm2

            comiss xmm2, xmm3

            ja     no_swap

            movaps xmm1, [edi]

            movaps xmm2, [esi]

            movaps [esi], xmm1

            movaps [edi], xmm2

            add    esi, 16

no_swap:

            add    edi, 16

            jmp    top

done:

            mov    i, esi

            mov    j, edi

        }

        Swap_MMX(i, j);

        return i;

    }

 

    enum { N2_CUTOFF = 23 }; // tested best value

    __declspec(align(16)) Vector4 gStaticArray[CLOUD_ELEMENTS];

    float gMind = FLT_MAX;

 

    float GetMinDistSquared(Vector4 *lo, Vector4 *hi, unsigned depth)

    {

recurse:

        if (lo + N2_CUTOFF < hi)

        {

            unsigned c = depth % 3;

            // partition segment

            Vector4 *q = PartitionMedian3(lo, hi, c);

            // now recurse, divide and conquer

            GetMinDistSquared(lo, q - 1, depth + 1);

            GetMinDistSquared(q + 1, hi, depth + 1);

            // now get any 'smallest' pairs that cross a partition

            q = PartitionByDistance(lo, hi, q, c, gMind);

            depth++; hi = q; // tail recurse to get min distance from crossing area

            goto recurse;

        }

        else if (lo < hi)

        {

            float d = GetMinDistSq_MMX(lo, hi + 1);

            if (d < gMind) gMind = d;

        }

        return gMind;

    }

 

    void CopyToVector4_MMX(const PointC *points, unsigned count, Vector4 *vectors)

    {

        /*

        // copy points to four component vectors

        for (Vector4 *p = points, *e = points + count; p < e; p++)

        {

            const PointC &p3 = *points3;

            Vector4 p4 = { p3.x, p3.y, p3.z, 0.f };

            *p = p4;

            points3++;

        }

        */

        __asm

            {

                  mov     eax,   points

            mov     ecx,   vectors

            mov     edi,   count

            imul    edi,   12

            add     edi,   eax

loop_0:

                  movups  xmm0,  [eax]

            cmp     eax,   edi

            jge     done

            movaps [ecx],  xmm0

            add     eax,   12

            add     ecx,   16

            jmp     loop_0

done:

        }

    }

 

    float GetMinDist(CloudC *pCloud)

    {

        PointC   *points3 = pCloud->GetPointBuffer();

        unsigned  count   = pCloud->GetNumPoints();

        Vector4  *points  = gStaticArray;

        CopyToVector4_MMX(points3, count, points);

        gMind = FLT_MAX;

        return sqrtf(GetMinDistSquared(points, points + (count - 1), 0));

    }

}